# 2nd/_tools/p_reinforce_merge.py

"""
P-Reinforce Phase 2 — Auto-MERGE same-concept filename-variant clusters.

Scope (conservative — only the safest auto-merge cases):
    Within each cluster, only merge if EVERY member shares the same
    `norm_name` (case-insensitive, punctuation-stripped). This catches:
        Bellman Equation.md / Bellman-Equation.md / Bellman_Equation.md
        Best-of-N Sampling.md / Best-of-N-Sampling.md / Best-of-N_Sampling.md
        Computer Vision.md / Computer-Vision.md / Computer_Vision.md
    These are virtually always the same concept under different naming
    conventions. Cross-folder is OK (same norm_name across AI/ vs AI_and_ML/).

    Skipped automatically:
    - Clusters where members have DIFFERENT norm_names (i.e. unioned only by
      body fingerprint — could be coincidence).
    - Clusters with mixed redirects + canonical (already partly merged).
    - Operational paths (sessions/, _agents/, etc).

Canonical selection rule per cluster:
    1. Highest body_chars (most content wins) ...
    2. ... ties broken by latest last_reinforced ...
    3. ... ties broken by shortest filename, then lexicographic order (stability).

Action per non-canonical member:
    - If it already has redirect_to — leave it alone (already merged).
    - Otherwise move the original to 01_Archive/MERGED/<date>/<original-relative-path>
    - ... and write a redirect stub pointing at canonical in its original location.

Outputs:
    20_Meta/ReviewQueue/auto_merge_log.md   - log of every cluster decision
    01_Archive/MERGED/<YYYY-MM-DD>/...      - moved-out original files
"""

from __future__ import annotations

import json
import re
import shutil
import sys
from collections import defaultdict
from dataclasses import dataclass
from datetime import date
from pathlib import Path

# Hard-coded vault root; the paths below are built on it, and apply_plan()
# joins cluster member paths onto it.
ROOT = Path(r"E:/Wiki/2nd")
# Per-file index read by load_index() (presumably written by
# p_reinforce_index.py, which main() tells the user to run first — confirm).
INDEX_JSON = ROOT / "20_Meta" / "ReviewQueue" / "_index.json"
# Cluster list read by load_clusters().
CLUSTERS_JSON = ROOT / "20_Meta" / "ReviewQueue" / "_clusters.json"
# Markdown log written by apply_plan().
LOG_MD = ROOT / "20_Meta" / "ReviewQueue" / "auto_merge_log.md"
# Parent of the dated folders that apply_plan() moves original files into.
ARCHIVE_BASE = ROOT / "01_Archive" / "MERGED"

# Substrings that mark a path as operational. is_operational() matches them
# against the path with a leading "/" added and backslashes turned into "/".
EXCLUDE_FRAG = (
    "/sessions/", "/_agents/", "/_company/", "/memory/",
    "/Project_Logs/", "/Harness_Research_", "/docs/records/",
    "/_Archive_Orphans/", "/Post_Drafts/", "/UX_Scenarios/",
)


def is_operational(rel_path: str) -> bool:
    """Return True when *rel_path* contains any fragment from EXCLUDE_FRAG.

    Backslashes are converted to forward slashes and a "/" is prepended, so a
    fragment such as "/sessions/" also matches a folder at the start of the
    path.
    """
    normalized = "/" + rel_path.replace("\\", "/")
    for fragment in EXCLUDE_FRAG:
        if fragment in normalized:
            return True
    return False


def load_clusters() -> list[dict]:
    """Parse and return the JSON document stored at CLUSTERS_JSON (UTF-8)."""
    with CLUSTERS_JSON.open(encoding="utf-8") as fh:
        return json.load(fh)


def load_index() -> dict[str, dict]:
    """Load INDEX_JSON (UTF-8) and return its entries keyed by their "path".

    When two entries share a path, the later one in the file wins.
    """
    entries = json.loads(INDEX_JSON.read_text(encoding="utf-8"))
    by_path: dict[str, dict] = {}
    for entry in entries:
        by_path[entry["path"]] = entry
    return by_path


def pick_canonical(members: list[dict], idx: dict[str, dict]) -> dict:
    """Choose the member of a cluster that is kept as the canonical document.

    Ranking, best first:
        1. largest ``body_chars`` (missing or null counts as 0);
        2. most recent ``fm_last_reinforced``; members with no date, or a
           date that cannot be parsed, rank after every dated member;
        3. shortest ``filename``;
        4. lexicographically smallest ``filename``.

    Args:
        members: Cluster members; each needs a "path" and a "filename" key.
        idx: Index entries keyed by path. Members missing from the index are
            ranked as if they had an empty entry.

    Returns:
        The winning member dict (the same object that was passed in). When
        several members rank equally, the first one in ``members`` wins.

    Raises:
        ValueError: If ``members`` is empty.
    """
    def sort_key(m: dict) -> tuple:
        e = idx.get(m["path"], {})
        return (
            -(e.get("body_chars") or 0),                     # bigger first
            # "" maps to the same "sorts last" sentinel as a malformed date,
            # so an undated file can never beat a dated one on this field.
            _neg_date(e.get("fm_last_reinforced") or ""),    # later first
            len(m["filename"]),
            m["filename"],
        )
    return min(members, key=sort_key)


def _neg_date(d: str) -> str:
    """Map YYYY-MM-DD to a string that sorts later dates first.

    Each component is subtracted from its maximum (9999 / 99 / 99) and
    zero-padded, so ascending string order equals descending date order.
    Anything that is not three in-range integers separated by "-" yields
    "ZZZZ", which sorts after every valid result.
    """
    parts = str(d).split("-")
    if len(parts) != 3:
        return "ZZZZ"
    try:
        year, month, day = (int(p) for p in parts)
    except ValueError:
        return "ZZZZ"
    # Out-of-range components would give negative numbers and break the
    # fixed-width ordering, so treat them as unparseable.
    if not (0 <= year <= 9999 and 0 <= month <= 99 and 0 <= day <= 99):
        return "ZZZZ"
    return f"{9999 - year:04d}-{99 - month:02d}-{99 - day:02d}"


# Markdown document (YAML front matter + body) written in place of a merged
# file. Placeholders: {id}, {title}, {category}, {target}, {today}.
# NOTE(review): the date inside `inferred_by` is fixed text, not {today};
# confirm that is intended when the script runs on another day.
REDIRECT_TEMPLATE = """---
id: {id}
title: {title}
category: {category}
status: merged
redirect_to: {target}
canonical_id: {target}
aliases: []
duplicate_of: none
source_trust_level: A
confidence_score: 0.92
tags: [redirect]
raw_sources: []
last_reinforced: {today}
github_commit: pending
inferred_by: Claude Opus 4.7 (auto-merge 2026-05-08)
---

# {title}

> [!IMPORTANT]
> 이 문서는 P-Reinforce Phase 2 자동 MERGE에 의해 **[[{target}]]**로 통합되었습니다.

---
*Redirected to: [[{target}]]*
"""


def make_redirect_stub(member: dict, canonical: dict, today: str) -> str:
    """Render the redirect stub that replaces a merged (non-canonical) file.

    Args:
        member: The file being replaced. "filename" (no extension) becomes
            the title after "-" and "_" are turned into spaces; "folder", if
            present and non-empty, becomes the category, otherwise
            "10_Wiki/Topics" is used.
        canonical: The surviving file. Its "filename" is used verbatim as the
            wiki-link / redirect target.
        today: Date string; its first eight characters after removing "-"
            go into the stub id, and the whole value into last_reinforced.

    Returns:
        The full stub text built from REDIRECT_TEMPLATE.
    """
    title = member["filename"].replace("-", " ").replace("_", " ")
    # ASCII-only slug, capped at 32 characters.
    # NOTE(review): a title with no ASCII letters or digits collapses to "-",
    # so such stubs created on the same day share one id — confirm whether
    # ids must be unique.
    slug = re.sub(r'[^a-z0-9]+', '-', title.lower())[:32]
    stub_id = f"wiki-{today.replace('-', '')[:8]}-{slug}-redir"
    return REDIRECT_TEMPLATE.format(
        id=stub_id,
        title=title,
        category=member.get("folder") or "10_Wiki/Topics",
        target=canonical["filename"],
        today=today,
    )


@dataclass
class MergePlan:
    """Merge decision for one cluster, as built by plan_merges()."""
    # Position of the cluster in the list passed to plan_merges().
    cluster_id: int
    # Path of the member chosen by pick_canonical(); apply_plan() joins it
    # onto ROOT and leaves that file in place.
    canonical_path: str
    # Paths of the remaining members; apply_plan() archives each one and
    # writes a redirect stub at its original location.
    losers: list[str]
    # norm_name values collected from the cluster's index entries;
    # plan_merges() only creates a plan when this holds one non-empty value.
    norm_name_set: set[str]


def plan_merges(clusters: list[dict], idx: dict[str, dict]) -> tuple[list[MergePlan], list[dict]]:
    """Decide, for every cluster, whether it can be merged automatically.

    A cluster yields a plan only when all of the following hold:
        - its "size" is at least 2;
        - no member path is operational (see is_operational());
        - every member has the same, non-empty ``norm_name`` in ``idx``;
        - at least two members remain after dropping paths that contain
          "01_Archive/".

    Args:
        clusters: Cluster dicts with "size" and "members"; each member needs
            "path" and "filename".
        idx: Index entries keyed by path.

    Returns:
        ``(plans, skipped)``. ``skipped`` holds one dict per cluster rejected
        for an operational path or a norm_name mismatch, with "cluster_id",
        "reason" and "members" (plus "norm_names" for mismatches). Clusters
        that are simply too small are dropped silently.
    """
    plans: list[MergePlan] = []
    skipped: list[dict] = []
    for cid, c in enumerate(clusters):
        members = c["members"]
        if c["size"] < 2:
            continue

        paths = [m["path"] for m in members]

        # Never touch clusters that reach into operational folders.
        if any(is_operational(p) for p in paths):
            skipped.append({"cluster_id": cid, "reason": "operational-path", "members": paths})
            continue

        # Every member must share one norm_name. A missing entry, a missing
        # field and an explicit null all count as "" and therefore as a
        # mismatch.
        norm_names = {idx.get(p, {}).get("norm_name") or "" for p in paths}
        if len(norm_names) != 1 or "" in norm_names:
            skipped.append({"cluster_id": cid, "reason": "norm-name-mismatch", "norm_names": sorted(norm_names), "members": paths})
            continue

        # Drop members that already live in the archive. Separators are
        # normalised first, as in is_operational(), so Windows-style paths
        # are caught too.
        members = [m for m in members if "01_Archive/" not in m["path"].replace("\\", "/")]
        if len(members) < 2:
            continue

        canonical = pick_canonical(members, idx)
        losers = [m["path"] for m in members if m["path"] != canonical["path"]]
        plans.append(MergePlan(cluster_id=cid, canonical_path=canonical["path"], losers=losers, norm_name_set=norm_names))
    return plans, skipped


def _is_redirect_stub(path: Path) -> bool:
    """Return True when the file's front matter has a non-empty redirect_to."""
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return False
    lines = text.lstrip("\ufeff").splitlines()
    if not lines or lines[0].strip() != "---":
        return False
    for line in lines[1:]:
        stripped = line.strip()
        if stripped == "---":  # end of front matter
            break
        key, sep, value = stripped.partition(":")
        if sep and key.strip() == "redirect_to":
            # Assumes placeholder values such as "none" mean "no redirect",
            # mirroring `duplicate_of: none` in the stub template — confirm.
            return value.strip().strip("'\"").lower() not in ("", "none", "null", "~")
    return False


def _free_archive_path(target: Path) -> Path:
    """Return *target*, or a numbered sibling of it if *target* already exists."""
    if not target.exists():
        return target
    n = 1
    while True:
        candidate = target.with_name(f"{target.stem}.{n}{target.suffix}")
        if not candidate.exists():
            return candidate
        n += 1


def apply_plan(plans: list[MergePlan], idx: dict[str, dict]) -> dict:
    """Execute merge plans: archive each loser and leave a redirect stub.

    For every loser that exists and is not already a redirect, the original
    file is moved to ``ARCHIVE_BASE/<today>/<relative path>`` and a stub
    pointing at the canonical file is written at the original location, so
    existing wiki-links keep resolving. An existing archive file is never
    overwritten; a numbered name is used instead. A cluster whose canonical
    file is missing on disk is skipped entirely, because its losers would
    otherwise redirect to nothing.

    The decision log is written to LOG_MD (replacing any previous log) even
    if an error interrupts the run.

    Args:
        plans: Plans produced by plan_merges().
        idx: Unused; kept for interface compatibility.

    Returns:
        ``{"merged": stubs written, "archived": files moved,
        "plans": number of plans received}``.
    """
    today = date.today().isoformat()
    archive_dir = ARCHIVE_BASE / today
    archive_dir.mkdir(parents=True, exist_ok=True)

    n_merged = 0
    n_archived = 0
    log_lines: list[str] = [
        f"# Auto-merge log — {today}\n",
        f"\n총 plan: **{len(plans)}** clusters\n",
    ]

    try:
        for plan in plans:
            canonical_abs = ROOT / plan.canonical_path
            log_lines.append(f"\n## Cluster {plan.cluster_id} — canonical: `{plan.canonical_path}`")
            log_lines.append(f"\n- **norm_name**: `{next(iter(plan.norm_name_set), '')}`")
            log_lines.append(f"- **canonical** (kept): [{plan.canonical_path}](/{plan.canonical_path})")
            if not canonical_abs.exists():
                # Stale index: archiving the losers would remove the only
                # live copies of the content.
                log_lines.append("- **skipped**: canonical file is missing on disk — members left untouched")
                continue
            log_lines.append("- **merged-into-redirect**:")
            canonical = {"filename": canonical_abs.stem, "path": plan.canonical_path}
            for loser_rel in plan.losers:
                loser_abs = ROOT / loser_rel
                if not loser_abs.exists():
                    log_lines.append(f"  - ~~{loser_rel}~~ (already missing — skipped)")
                    continue
                if _is_redirect_stub(loser_abs):
                    # Already merged earlier; re-archiving would store the
                    # stub instead of real content.
                    log_lines.append(f"  - {loser_rel} (already a redirect — left alone)")
                    continue

                # Construct redirect stub pointing at canonical
                member = {"filename": loser_abs.stem, "path": loser_rel,
                          "folder": loser_abs.parent.name}
                stub_text = make_redirect_stub(member, canonical, today)

                # Move original to archive (preserving relative path) without
                # clobbering a file archived by an earlier run today.
                archive_target = _free_archive_path(archive_dir / loser_rel)
                archive_target.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(loser_abs), str(archive_target))
                n_archived += 1
                # Write redirect stub at original location (so wiki-links don't break)
                loser_abs.write_text(stub_text, encoding="utf-8")
                n_merged += 1

                archived_rel = f"01_Archive/MERGED/{today}/{loser_rel}"
                if archive_target.name != loser_abs.name:
                    # Report the numbered name that was actually used.
                    archived_rel = archived_rel[: len(archived_rel) - len(loser_abs.name)] + archive_target.name
                log_lines.append(f"  - {loser_rel} → archived to `{archived_rel}`, replaced with redirect → `{canonical_abs.stem}`")
    finally:
        # Persist whatever was decided so far, even on failure.
        LOG_MD.write_text("\n".join(log_lines), encoding="utf-8")
    return {"merged": n_merged, "archived": n_archived, "plans": len(plans)}


def main() -> int:
    """Command-line entry point.

    Returns 2 when either input JSON file is missing, otherwise 0. With
    ``--dry-run`` anywhere in ``sys.argv`` the first 30 plans are printed and
    nothing is changed on disk; without it the plans are applied. All output
    goes to stderr.
    """
    inputs_ready = CLUSTERS_JSON.exists() and INDEX_JSON.exists()
    if not inputs_ready:
        print("ERROR: run p_reinforce_index.py first", file=sys.stderr)
        return 2

    cluster_list = load_clusters()
    index = load_index()
    plans, skipped = plan_merges(cluster_list, index)
    print(f"Planned {len(plans)} merge clusters, skipped {len(skipped)}", file=sys.stderr)

    dry_run = "--dry-run" in sys.argv
    if dry_run:
        preview = plans[:30]
        for plan in preview:
            print(f"  cluster {plan.cluster_id}: keep {plan.canonical_path}, redirect {len(plan.losers)}", file=sys.stderr)
        return 0

    outcome = apply_plan(plans, index)
    summary = f"DONE: merged={outcome['merged']}, archived={outcome['archived']}"
    print(summary, file=sys.stderr)
    print(f"Log: {LOG_MD}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())