# File: 2nd/_tools/p_reinforce_merge.py (255 lines, 9.4 KiB, Python)

"""
P-Reinforce Phase 2 — Auto-MERGE same-concept filename-variant clusters.
Scope (conservative — only the safest auto-merge cases):
Within each cluster, only merge if EVERY member shares the same
`norm_name` (case-insensitive, punctuation-stripped). This catches:
Bellman Equation.md / Bellman-Equation.md / Bellman_Equation.md
Best-of-N Sampling.md / Best-of-N-Sampling.md / Best-of-N_Sampling.md
Computer Vision.md / Computer-Vision.md / Computer_Vision.md
These are virtually always the same concept under different naming
conventions. Cross-folder is OK (same norm_name across AI/ vs AI_and_ML/).
Skipped automatically:
- Clusters where members have DIFFERENT norm_names (i.e. unioned only by
body fingerprint — could be coincidence).
- Clusters with mixed redirects + canonical (already partly merged).
- Operational paths (sessions/, _agents/, etc).
Canonical selection rule per cluster:
1. Highest body_chars (most content wins) ...
2. ... ties broken by latest last_reinforced ...
3. ... ties broken by shortest filename, then lexicographic order (stability).
Action per non-canonical member:
- If it already has redirect_to — leave it alone (already merged).
- Otherwise rewrite to a redirect stub pointing at canonical.
- Move to 01_Archive/MERGED/<date>/<original-relative-path>
Outputs:
20_Meta/ReviewQueue/auto_merge_log.md - log of every cluster decision
01_Archive/MERGED/<YYYY-MM-DD>/... - moved-out original files
"""
from __future__ import annotations
import json
import re
import shutil
import sys
from collections import defaultdict
from dataclasses import dataclass
from datetime import date
from pathlib import Path
# Root of the wiki vault; every other location below is derived from it.
ROOT = Path(r"E:/Wiki/2nd")

# JSON inputs read by this script and the Markdown log it writes.
INDEX_JSON = ROOT.joinpath("20_Meta", "ReviewQueue", "_index.json")
CLUSTERS_JSON = ROOT.joinpath("20_Meta", "ReviewQueue", "_clusters.json")
LOG_MD = ROOT.joinpath("20_Meta", "ReviewQueue", "auto_merge_log.md")

# A dated sub-folder is created under this directory for archived originals.
ARCHIVE_BASE = ROOT.joinpath("01_Archive", "MERGED")

# Path fragments that mark a file as operational; is_operational() matches
# them as plain substrings of the "/"-normalised relative path.
EXCLUDE_FRAG = (
    "/sessions/",
    "/_agents/",
    "/_company/",
    "/memory/",
    "/Project_Logs/",
    "/Harness_Research_",
    "/docs/records/",
    "/_Archive_Orphans/",
    "/Post_Drafts/",
    "/UX_Scenarios/",
)
def is_operational(rel_path: str) -> bool:
    """Tell whether *rel_path* lies under one of the excluded locations.

    Backslashes are turned into forward slashes and a leading "/" is
    prepended, so a fragment such as "/sessions/" also matches a path that
    starts with "sessions/".

    Args:
        rel_path: File path as a string, with either separator style.

    Returns:
        True if any entry of EXCLUDE_FRAG occurs in the normalised path.
    """
    normalised = "/" + rel_path.replace("\\", "/")
    for fragment in EXCLUDE_FRAG:
        if fragment in normalised:
            return True
    return False
def load_clusters() -> list[dict]:
    """Read CLUSTERS_JSON (UTF-8) and return the decoded JSON value.

    Returns:
        Whatever the file decodes to; plan_merges() expects a list of cluster
        dicts carrying "size" and "members".
    """
    with CLUSTERS_JSON.open(encoding="utf-8") as handle:
        return json.load(handle)
def load_index() -> dict[str, dict]:
    """Read INDEX_JSON (UTF-8) and key its entries by their "path" field.

    Returns:
        Mapping from each entry's "path" to the entry dict itself. If two
        entries share a path, the later one in the file wins.
    """
    with INDEX_JSON.open(encoding="utf-8") as handle:
        entries = json.load(handle)
    by_path = {}
    for entry in entries:
        by_path[entry["path"]] = entry
    return by_path
def pick_canonical(members: list[dict], idx: dict[str, dict]) -> dict:
def keyfn(m: dict) -> tuple:
e = idx.get(m["path"], {})
return (
-e.get("body_chars", 0),
-1 * (1 if e.get("fm_last_reinforced") else 0), # prefer files that have a date
(e.get("fm_last_reinforced") or "0000-00-00") * -1 if False else (e.get("fm_last_reinforced") or "0000-00-00"),
len(m["filename"]),
m["filename"],
)
# explicit: largest body, then most-recent last_reinforced (later date wins),
# then shortest filename, then lexicographic
def sort_key(m: dict) -> tuple:
e = idx.get(m["path"], {})
return (
-e.get("body_chars", 0), # bigger first
"0000-00-00" if not e.get("fm_last_reinforced") else _neg_date(e["fm_last_reinforced"]),
len(m["filename"]),
m["filename"],
)
return sorted(members, key=sort_key)[0]
def _neg_date(d: str) -> str:
# Map YYYY-MM-DD to a string that sorts later-dates-first when sorted ascending.
parts = d.split("-")
if len(parts) != 3:
return "ZZZZ"
try:
y = 9999 - int(parts[0])
m = 99 - int(parts[1])
day = 99 - int(parts[2])
return f"{y:04d}-{m:02d}-{day:02d}"
except ValueError:
return "ZZZZ"
# Markdown text of a redirect stub: front matter followed by a short body.
# make_redirect_stub() fills it with str.format(); the placeholders are
# {id}, {title}, {category}, {target} and {today}.
# The body notice is in Korean and says the document was consolidated into
# [[target]] by the P-Reinforce Phase 2 auto-MERGE.
# NOTE(review): `inferred_by` carries a hard-coded date (2026-05-08) while
# `last_reinforced` uses {today} -- confirm whether that is intended.
REDIRECT_TEMPLATE = """---
id: {id}
title: {title}
category: {category}
status: merged
redirect_to: {target}
canonical_id: {target}
aliases: []
duplicate_of: none
source_trust_level: A
confidence_score: 0.92
tags: [redirect]
raw_sources: []
last_reinforced: {today}
github_commit: pending
inferred_by: Claude Opus 4.7 (auto-merge 2026-05-08)
---
# {title}
> [!IMPORTANT]
> 이 문서는 P-Reinforce Phase 2 자동 MERGE에 의해 **[[{target}]]**로 통합되었습니다.
---
*Redirected to: [[{target}]]*
"""
def make_redirect_stub(member: dict, canonical: dict, today: str) -> str:
    """Render the redirect stub that replaces a merged-away file.

    Args:
        member: The file being replaced. "filename" is required; hyphens and
            underscores in it become spaces to form the stub title. An
            optional "folder" becomes the category, defaulting to
            "10_Wiki/Topics" when absent or empty.
        canonical: The surviving file. Its "filename" is used verbatim as the
            wiki-link target.
        today: Date string; its first 8 characters after removing "-" go into
            the stub id, and the whole value is written to `last_reinforced`.

    Returns:
        REDIRECT_TEMPLATE with all placeholders filled in.
    """
    title = member["filename"].replace("-", " ").replace("_", " ")
    # Lower-case ASCII slug of the title, capped at 32 characters. Titles
    # with no [a-z0-9] characters collapse to "-".
    slug = re.sub(r"[^a-z0-9]+", "-", title.lower())[:32]
    stub_id = f"wiki-{today.replace('-', '')[:8]}-{slug}-redir"
    return REDIRECT_TEMPLATE.format(
        id=stub_id,
        title=title,
        category=member.get("folder") or "10_Wiki/Topics",
        target=canonical["filename"],
        today=today,
    )
@dataclass
class MergePlan:
    """One cluster's merge decision: built by plan_merges, executed by apply_plan."""

    # Position of the cluster in the list handed to plan_merges().
    cluster_id: int
    # Path of the member chosen by pick_canonical(); joined onto ROOT when applied.
    canonical_path: str
    # Paths of the remaining members, each to be replaced by a redirect stub.
    losers: list[str]
    # norm_name values collected from the index for the cluster's members;
    # plan_merges() only emits a plan when this holds exactly one non-empty value.
    norm_name_set: set[str]
def plan_merges(clusters: list[dict], idx: dict[str, dict]) -> tuple[list[MergePlan], list[dict]]:
    """Decide which clusters can be merged automatically.

    A cluster yields a plan only when all of the following hold:
      * its "size" is at least 2;
      * no member path is operational (see is_operational);
      * every member has the same non-empty ``norm_name`` in *idx*;
      * at least two members remain once paths containing "01_Archive/" are
        dropped.

    Args:
        clusters: Cluster dicts, each with "size" and "members"; every member
            needs "path" and "filename".
        idx: Index entries keyed by path; "norm_name" is read here, and
            pick_canonical reads further fields.

    Returns:
        ``(plans, skipped)``. *skipped* records clusters rejected for an
        operational path or a norm_name mismatch, with "cluster_id", "reason"
        and "members" (plus "norm_names" for a mismatch). Clusters that are
        merely too small are dropped without a record.
    """
    plans: list[MergePlan] = []
    skipped: list[dict] = []
    for cid, cluster in enumerate(clusters):
        members = cluster["members"]
        if cluster["size"] < 2:
            continue
        paths = [m["path"] for m in members]

        if any(is_operational(p) for p in paths):
            skipped.append({"cluster_id": cid, "reason": "operational-path", "members": paths})
            continue

        # Every member must share one non-empty norm_name; a member missing
        # from the index contributes "" and so vetoes the cluster.
        norm_names = {idx.get(p, {}).get("norm_name", "") for p in paths}
        if len(norm_names) != 1 or "" in norm_names:
            skipped.append({"cluster_id": cid, "reason": "norm-name-mismatch", "norm_names": list(norm_names), "members": paths})
            continue

        # Drop members already inside the archive. Separators are normalised
        # first (as is_operational does) so backslash paths are caught too.
        members = [m for m in members if "01_Archive/" not in m["path"].replace("\\", "/")]
        if len(members) < 2:
            continue

        canonical = pick_canonical(members, idx)
        loser_paths = [m["path"] for m in members if m["path"] != canonical["path"]]
        plans.append(MergePlan(cluster_id=cid, canonical_path=canonical["path"], losers=loser_paths, norm_name_set=norm_names))
    return plans, skipped
def _is_redirect_stub(text: str) -> bool:
    """Return True when *text* opens with front matter holding a non-empty ``redirect_to``."""
    lines = text.lstrip("\ufeff").splitlines()
    if not lines or lines[0].strip() != "---":
        return False
    for line in lines[1:]:
        stripped = line.strip()
        if stripped == "---":  # end of the front matter
            break
        key, sep, value = stripped.partition(":")
        if sep and key.strip() == "redirect_to":
            # "none"/"null"/"~" are taken as "no redirect" -- presumably the
            # vault's null spelling (the stub template writes `duplicate_of: none`).
            return value.strip().strip("\"'").lower() not in ("", "none", "null", "~")
    return False


def _free_archive_path(path: Path) -> Path:
    """Return *path*, or a numbered sibling (``name.1.md``, ...) when it is already taken."""
    if not path.exists():
        return path
    n = 1
    while True:
        candidate = path.with_name(f"{path.stem}.{n}{path.suffix}")
        if not candidate.exists():
            return candidate
        n += 1


def apply_plan(plans: list[MergePlan], idx: dict[str, dict]) -> dict:
    """Execute merge plans: archive each loser and leave a redirect stub behind.

    For every loser that exists and is not already a redirect stub, the
    original is moved to ``ARCHIVE_BASE/<today>/<loser path>`` and a stub
    pointing at the canonical file is written at the original location. A
    Markdown log of every decision is written to LOG_MD, even when the run
    aborts part-way.

    Safety rules:
      * a cluster whose canonical file is missing on disk is skipped whole;
      * a loser that already carries ``redirect_to`` is left untouched, so a
        rerun never archives a stub over the archived original;
      * an existing archive file is never overwritten -- a numbered name is
        used instead.

    Args:
        plans: Plans produced by plan_merges.
        idx: Unused; kept so the call signature stays stable.

    Returns:
        ``{"merged": stubs written, "archived": files moved, "plans": len(plans)}``.
    """
    today = date.today().isoformat()
    archive_dir = ARCHIVE_BASE / today
    archive_dir.mkdir(parents=True, exist_ok=True)
    n_merged = 0
    n_archived = 0
    log_lines: list[str] = [
        f"# Auto-merge log — {today}\n",
        f"\n총 plan: **{len(plans)}** clusters\n",
    ]
    try:
        for plan in plans:
            canonical_abs = ROOT / plan.canonical_path
            norm_name = next(iter(plan.norm_name_set), "")
            log_lines.append(f"\n## Cluster {plan.cluster_id} — canonical: `{plan.canonical_path}`")
            log_lines.append(f"\n- **norm_name**: `{norm_name}`")
            log_lines.append(f"- **canonical** (kept): [{plan.canonical_path}](/{plan.canonical_path})")
            log_lines.append("- **merged-into-redirect**:")
            if not canonical_abs.exists():
                # Redirecting to a file that is not there would strand the content.
                log_lines.append(" - (canonical missing on disk — cluster skipped, nothing moved)")
                continue
            canonical = {"filename": canonical_abs.stem, "path": plan.canonical_path}
            for loser_rel in plan.losers:
                loser_abs = ROOT / loser_rel
                if not loser_abs.exists():
                    log_lines.append(f" - ~~{loser_rel}~~ (already missing — skipped)")
                    continue
                if _is_redirect_stub(loser_abs.read_text(encoding="utf-8", errors="replace")):
                    log_lines.append(f" - {loser_rel} (already a redirect — left alone)")
                    continue
                member = {"filename": loser_abs.stem, "path": loser_rel,
                          "folder": loser_abs.parent.name}
                stub_text = make_redirect_stub(member, canonical, today)
                # Move the original into the archive, preserving its relative path.
                planned_target = archive_dir / loser_rel
                archive_target = _free_archive_path(planned_target)
                archive_target.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(loser_abs), str(archive_target))
                n_archived += 1
                # Write the stub at the original location so wiki-links keep resolving.
                try:
                    loser_abs.write_text(stub_text, encoding="utf-8")
                except OSError:
                    # Put the original back rather than leave a hole in the vault.
                    shutil.move(str(archive_target), str(loser_abs))
                    n_archived -= 1
                    raise
                n_merged += 1
                renamed = "" if archive_target == planned_target else f" (stored as `{archive_target.name}`)"
                log_lines.append(f" - {loser_rel} → archived to `01_Archive/MERGED/{today}/{loser_rel}`{renamed}, replaced with redirect → `{canonical_abs.stem}`")
    finally:
        # The log is the only record of what was moved; write it even on failure.
        LOG_MD.parent.mkdir(parents=True, exist_ok=True)
        LOG_MD.write_text("\n".join(log_lines), encoding="utf-8")
    return {"merged": n_merged, "archived": n_archived, "plans": len(plans)}
def main() -> int:
    """Command-line entry point.

    Loads the cluster and index JSON files, plans the merges and applies
    them. With ``--dry-run`` anywhere in ``sys.argv`` it only lists up to the
    first 30 plans and changes nothing. All output goes to stderr.

    Returns:
        0 on success (including dry runs), 2 when either input file is missing.
    """
    inputs_present = CLUSTERS_JSON.exists() and INDEX_JSON.exists()
    if not inputs_present:
        print("ERROR: run p_reinforce_index.py first", file=sys.stderr)
        return 2

    cluster_list = load_clusters()
    index = load_index()
    plans, skipped = plan_merges(cluster_list, index)
    print(f"Planned {len(plans)} merge clusters, skipped {len(skipped)}", file=sys.stderr)

    if "--dry-run" in sys.argv:
        preview = plans[:30]
        for planned in preview:
            print(f" cluster {planned.cluster_id}: keep {planned.canonical_path}, redirect {len(planned.losers)}", file=sys.stderr)
        return 0

    outcome = apply_plan(plans, index)
    print(f"DONE: merged={outcome['merged']}, archived={outcome['archived']}", file=sys.stderr)
    print(f"Log: {LOG_MD}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())