"""
P-Reinforce Phase 2 — Auto-MERGE same-concept filename-variant clusters.

Scope (conservative — only the safest auto-merge cases):
  Within each cluster, only merge if EVERY member shares the same
  `norm_name` (case-insensitive, punctuation-stripped). This catches:
    Bellman Equation.md / Bellman-Equation.md / Bellman_Equation.md
    Best-of-N Sampling.md / Best-of-N-Sampling.md / Best-of-N_Sampling.md
    Computer Vision.md / Computer-Vision.md / Computer_Vision.md
  These are virtually always the same concept under different naming
  conventions. Cross-folder is OK (same norm_name across AI/ vs AI_and_ML/).

Skipped automatically:
  - Clusters where members have DIFFERENT norm_names (i.e. unioned only by
    body fingerprint — could be coincidence).
  - Clusters containing any operational path (sessions/, _agents/, etc).
  - Clusters whose canonical file is missing or is itself a redirect stub.

Canonical selection rule per cluster:
  1. Highest body_chars (most content wins) ...
  2. ... ties broken by latest last_reinforced (undated files lose) ...
  3. ... ties broken by shortest filename, then lexicographic order
     (stability).

Action per non-canonical member:
  - If it already has redirect_to — leave it alone (already merged).
  - Otherwise move the original to 01_Archive/MERGED/<YYYY-MM-DD>/ and
    write a redirect stub pointing at the canonical in its place.

Outputs:
  20_Meta/ReviewQueue/auto_merge_log.md  - log of every cluster decision
  01_Archive/MERGED/<YYYY-MM-DD>/...     - moved-out original files
"""
from __future__ import annotations

import json
import re
import shutil
import sys
from collections import defaultdict
from dataclasses import dataclass
from datetime import date
from pathlib import Path

ROOT = Path(r"E:/Wiki/2nd")
INDEX_JSON = ROOT / "20_Meta" / "ReviewQueue" / "_index.json"
CLUSTERS_JSON = ROOT / "20_Meta" / "ReviewQueue" / "_clusters.json"
LOG_MD = ROOT / "20_Meta" / "ReviewQueue" / "auto_merge_log.md"
ARCHIVE_BASE = ROOT / "01_Archive" / "MERGED"

# Path fragments that mark a file as operational; any cluster touching one
# of these is never auto-merged.
EXCLUDE_FRAG = (
    "/sessions/",
    "/_agents/",
    "/_company/",
    "/memory/",
    "/Project_Logs/",
    "/Harness_Research_",
    "/docs/records/",
    "/_Archive_Orphans/",
    "/Post_Drafts/",
    "/UX_Scenarios/",
)

# Sort key used for a missing or unparseable date: it sorts after every
# value _neg_date() produces for a valid date, so such members lose ties.
_NO_DATE_KEY = "ZZZZ"

# Front-matter values that mean "no redirect target is set".
_EMPTY_REDIRECT_VALUES = frozenset({"", "none", "null", "~", "[]"})


def is_operational(rel_path: str) -> bool:
    """Return True if *rel_path* contains any fragment listed in EXCLUDE_FRAG.

    Backslashes are normalised to forward slashes and a leading slash is
    prepended so that a fragment such as "/sessions/" also matches a path
    that starts with "sessions/".
    """
    rel = "/" + rel_path.replace("\\", "/")
    return any(frag in rel for frag in EXCLUDE_FRAG)


def load_clusters() -> list[dict]:
    """Load and return the parsed JSON content of CLUSTERS_JSON."""
    return json.loads(CLUSTERS_JSON.read_text(encoding="utf-8"))


def load_index() -> dict[str, dict]:
    """Load INDEX_JSON and return its entries keyed by each entry's "path"."""
    entries = json.loads(INDEX_JSON.read_text(encoding="utf-8"))
    return {entry["path"]: entry for entry in entries}


def pick_canonical(members: list[dict], idx: dict[str, dict]) -> dict:
    """Choose the member of a cluster that every other member redirects to.

    Ordering (first difference decides):
      1. largest ``body_chars`` from the index entry,
      2. most recent ``fm_last_reinforced``; members with a missing or
         unparseable date sort after every dated member,
      3. shortest ``filename``,
      4. lexicographically smallest ``filename``.

    Args:
        members: cluster members; each needs "path" and "filename" keys.
        idx: index entries keyed by path; members absent from it are
            treated as having no body and no date.

    Returns:
        The winning member dict (the same object that was passed in).

    Raises:
        IndexError: if *members* is empty.
    """

    def sort_key(member: dict) -> tuple:
        entry = idx.get(member["path"], {})
        raw_date = entry.get("fm_last_reinforced")
        # A missing date must lose the tie-break. The previous key used
        # "0000-00-00", which sorts *before* every inverted date and so
        # made undated files win.
        date_key = _neg_date(str(raw_date)) if raw_date else _NO_DATE_KEY
        return (
            -(entry.get("body_chars") or 0),  # bigger body first
            date_key,  # later date first
            len(member["filename"]),
            member["filename"],
        )

    return sorted(members, key=sort_key)[0]


def _neg_date(d: str) -> str:
    """Map YYYY-MM-DD to a string that sorts later dates first.

    Each component is subtracted from its maximum (9999 / 99 / 99) so an
    ascending string sort yields descending dates. Input that does not split
    into three integer parts maps to "ZZZZ".
    """
    parts = d.split("-")
    if len(parts) != 3:
        return _NO_DATE_KEY
    try:
        y = 9999 - int(parts[0])
        m = 99 - int(parts[1])
        day = 99 - int(parts[2])
    except ValueError:
        return _NO_DATE_KEY
    return f"{y:04d}-{m:02d}-{day:02d}"


REDIRECT_TEMPLATE = """---
id: {id}
title: {title}
category: {category}
status: merged
redirect_to: {target}
canonical_id: {target}
aliases: []
duplicate_of: none
source_trust_level: A
confidence_score: 0.92
tags: [redirect]
raw_sources: []
last_reinforced: {today}
github_commit: pending
inferred_by: Claude Opus 4.7 (auto-merge 2026-05-08)
---

# {title}

> [!IMPORTANT]
> 이 문서는 P-Reinforce Phase 2 자동 MERGE에 의해 **[[{target}]]**로 통합되었습니다.

---
*Redirected to: [[{target}]]*
"""


def make_redirect_stub(member: dict, canonical: dict, today: str) -> str:
    """Render the redirect document that replaces a merged-away member.

    Args:
        member: the file being replaced; "filename" is required and
            "folder" is optional (defaults to "10_Wiki/Topics").
        canonical: the surviving file; its "filename" is used verbatim as
            the redirect / wiki-link target.
        today: ISO date string, used for ``last_reinforced`` and the id.

    Returns:
        REDIRECT_TEMPLATE filled in for this member.
    """
    title = member["filename"].replace("-", " ").replace("_", " ")
    # Id slug: lower-cased title with every run of non [a-z0-9] characters
    # collapsed to a single "-", truncated to 32 characters.
    slug = re.sub(r"[^a-z0-9]+", "-", title.lower())[:32]
    return REDIRECT_TEMPLATE.format(
        id=f"wiki-{today.replace('-', '')[:8]}-{slug}-redir",
        title=title,
        category=member.get("folder") or "10_Wiki/Topics",
        target=canonical["filename"],
        today=today,
    )


@dataclass
class MergePlan:
    """One planned merge: a canonical file plus the files redirected to it."""

    cluster_id: int  # position of the cluster in the clusters list
    canonical_path: str  # path of the member that is kept
    losers: list[str]  # paths of the members to be replaced by stubs
    norm_name_set: set[str]  # the single norm_name shared by the cluster


def plan_merges(clusters: list[dict], idx: dict[str, dict]) -> tuple[list[MergePlan], list[dict]]:
    """Decide which clusters are safe to merge, without touching any file.

    A cluster is planned only when it has at least two members, none of
    them is an operational path, and every member has the same non-empty
    ``norm_name`` in the index.

    Args:
        clusters: cluster dicts, each with a "members" list whose items
            have "path" and "filename" keys.
        idx: index entries keyed by path.

    Returns:
        ``(plans, skipped)`` where ``skipped`` records the cluster id, the
        reason and the member paths of every rejected cluster. Clusters
        with fewer than two usable members are dropped silently.
    """
    plans: list[MergePlan] = []
    skipped: list[dict] = []
    for cid, cluster in enumerate(clusters):
        members = cluster.get("members") or []
        if len(members) < 2:
            continue
        paths = [m["path"] for m in members]

        # One operational member disqualifies the whole cluster.
        if any(is_operational(p) for p in paths):
            skipped.append({"cluster_id": cid, "reason": "operational-path", "members": paths})
            continue

        # Every member must share one non-empty norm_name.
        norm_names = {idx.get(p, {}).get("norm_name", "") for p in paths}
        if len(norm_names) != 1 or "" in norm_names:
            skipped.append({
                "cluster_id": cid,
                "reason": "norm-name-mismatch",
                "norm_names": list(norm_names),
                "members": paths,
            })
            continue

        # Drop members already under the archive (shouldn't happen but
        # safe). Separators are normalised the same way is_operational does
        # so Windows-style paths are caught too.
        members = [m for m in members if "01_Archive/" not in m["path"].replace("\\", "/")]
        if len(members) < 2:
            continue

        canonical = pick_canonical(members, idx)
        plans.append(MergePlan(
            cluster_id=cid,
            canonical_path=canonical["path"],
            losers=[m["path"] for m in members if m["path"] != canonical["path"]],
            norm_name_set=norm_names,
        ))
    return plans, skipped


def _is_redirect_stub(path: Path) -> bool:
    """Return True if the file's front matter has a non-empty redirect_to."""
    try:
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return False
    text = text.lstrip("\ufeff")
    if not text.startswith("---"):
        return False
    end = text.find("\n---", 3)
    if end == -1:
        return False
    for line in text[3:end].splitlines():
        key, sep, value = line.partition(":")
        if sep and key.strip() == "redirect_to":
            cleaned = value.strip().strip("'\"").strip().lower()
            return cleaned not in _EMPTY_REDIRECT_VALUES
    return False


def _unique_path(path: Path) -> Path:
    """Return *path*, or a numbered sibling of it, that does not exist yet."""
    if not path.exists():
        return path
    n = 1
    while True:
        candidate = path.with_name(f"{path.stem}.dup{n}{path.suffix}")
        if not candidate.exists():
            return candidate
        n += 1


def apply_plan(plans: list[MergePlan], idx: dict[str, dict]) -> dict:
    """Execute the merge plans on disk and write the decision log.

    For every loser that still exists and is not already a redirect, the
    original is moved under ``ARCHIVE_BASE/<today>/`` (keeping its relative
    path) and a redirect stub is written at the original location so
    existing wiki-links keep resolving.

    Args:
        plans: output of plan_merges().
        idx: index entries; currently unused, kept for interface stability.

    Returns:
        ``{"merged": int, "archived": int, "plans": int}``.
    """
    today = date.today().isoformat()
    archive_dir = ARCHIVE_BASE / today
    archive_dir.mkdir(parents=True, exist_ok=True)

    n_merged = 0
    n_archived = 0
    log_lines: list[str] = [
        f"# Auto-merge log — {today}\n",
        f"\n총 plan: **{len(plans)}** clusters\n",
    ]

    for plan in plans:
        canonical_abs = ROOT / plan.canonical_path
        norm_name = next(iter(plan.norm_name_set), "")
        log_lines.append(f"\n## Cluster {plan.cluster_id} — canonical: `{plan.canonical_path}`")
        log_lines.append(f"\n- **norm_name**: `{norm_name}`")
        log_lines.append(f"- **canonical** (kept): [{plan.canonical_path}](/{plan.canonical_path})")

        # Never redirect to a target that is missing or is itself a
        # redirect: that would create dead links or redirect chains.
        if not canonical_abs.exists():
            log_lines.append("- **skipped**: canonical file is missing")
            continue
        if _is_redirect_stub(canonical_abs):
            log_lines.append("- **skipped**: canonical file is itself a redirect")
            continue

        log_lines.append("- **merged-into-redirect**:")
        canonical = {"filename": canonical_abs.stem, "path": plan.canonical_path}
        for loser_rel in plan.losers:
            loser_abs = ROOT / loser_rel
            if not loser_abs.exists():
                log_lines.append(f"  - ~~{loser_rel}~~ (already missing — skipped)")
                continue
            # Already merged on an earlier run: archiving it again would
            # move the stub on top of the archived original.
            if _is_redirect_stub(loser_abs):
                log_lines.append(f"  - {loser_rel} (already a redirect — left alone)")
                continue

            # Build the stub before touching the file system.
            member = {
                "filename": loser_abs.stem,
                "path": loser_rel,
                "folder": loser_abs.parent.name,
            }
            stub_text = make_redirect_stub(member, canonical, today)

            # Move original to archive (preserving relative path), never
            # overwriting something archived earlier.
            archive_target = _unique_path(archive_dir / loser_rel)
            archive_target.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(loser_abs), str(archive_target))
            n_archived += 1

            # Write redirect stub at original location (so wiki-links
            # don't break).
            loser_abs.write_text(stub_text, encoding="utf-8")
            n_merged += 1

            archived_rel = archive_target.relative_to(ROOT).as_posix()
            log_lines.append(
                f"  - {loser_rel} → archived to `{archived_rel}`, "
                f"replaced with redirect → `{canonical_abs.stem}`"
            )

    LOG_MD.parent.mkdir(parents=True, exist_ok=True)
    LOG_MD.write_text("\n".join(log_lines), encoding="utf-8")
    return {"merged": n_merged, "archived": n_archived, "plans": len(plans)}


def main() -> int:
    """Command-line entry point.

    Returns the process exit code: 2 when the index or cluster file is
    missing, otherwise 0. With ``--dry-run`` on the command line the first
    30 plans are printed and nothing is written.
    """
    if not CLUSTERS_JSON.exists() or not INDEX_JSON.exists():
        print("ERROR: run p_reinforce_index.py first", file=sys.stderr)
        return 2

    clusters = load_clusters()
    idx = load_index()
    plans, skipped = plan_merges(clusters, idx)
    print(f"Planned {len(plans)} merge clusters, skipped {len(skipped)}", file=sys.stderr)

    if "--dry-run" in sys.argv:
        for p in plans[:30]:
            print(
                f"  cluster {p.cluster_id}: keep {p.canonical_path}, redirect {len(p.losers)}",
                file=sys.stderr,
            )
        return 0

    result = apply_plan(plans, idx)
    print(f"DONE: merged={result['merged']}, archived={result['archived']}", file=sys.stderr)
    print(f"Log: {LOG_MD}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())