"""
P-Reinforce Phase 2 — Auto-MERGE same-concept filename-variant clusters.

Scope (conservative — only the safest auto-merge cases):
  Within each cluster, only merge if EVERY member shares the same
  `norm_name` (case-insensitive, punctuation-stripped). This catches:
    Bellman Equation.md / Bellman-Equation.md / Bellman_Equation.md
    Best-of-N Sampling.md / Best-of-N-Sampling.md / Best-of-N_Sampling.md
    Computer Vision.md / Computer-Vision.md / Computer_Vision.md
  These are virtually always the same concept under different naming
  conventions. Cross-folder is OK (same norm_name across AI/ vs AI_and_ML/).

Skipped automatically:
  - Clusters where members have DIFFERENT norm_names (i.e. unioned only by
    body fingerprint — could be coincidence).
  - Clusters with mixed redirects + canonical (already partly merged).
  - Operational paths (sessions/, _agents/, etc).

Canonical selection rule per cluster:
  1. Highest body_chars (most content wins) ...
  2. ... ties broken by latest last_reinforced ...
  3. ... ties broken by shortest filename, then lexicographic order
     (stability).

Action per non-canonical member:
  - If it already has redirect_to — leave it alone (already merged).
  - Otherwise move the original to
    01_Archive/MERGED/<date>/<original-relative-path> ...
  - ... and write a redirect stub pointing at the canonical in its place.

Outputs:
  20_Meta/ReviewQueue/auto_merge_log.md  - log of every cluster decision
  01_Archive/MERGED/<YYYY-MM-DD>/...     - moved-out original files
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import shutil
|
|
import sys
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
# Vault root; every other location used by this script hangs off it.
ROOT = Path(r"E:/Wiki/2nd")

# Inputs read by this script (both live in the review-queue folder).
INDEX_JSON = ROOT.joinpath("20_Meta", "ReviewQueue", "_index.json")
CLUSTERS_JSON = ROOT.joinpath("20_Meta", "ReviewQueue", "_clusters.json")

# Outputs: the markdown decision log and the base folder for archived originals.
LOG_MD = ROOT.joinpath("20_Meta", "ReviewQueue", "auto_merge_log.md")
ARCHIVE_BASE = ROOT.joinpath("01_Archive", "MERGED")
|
|
|
|
# Path fragments that mark a file as "operational"; any cluster touching one
# of these is left alone by the merge planner. Matching is a plain,
# case-sensitive substring test against a "/"-prefixed, forward-slash path.
EXCLUDE_FRAG = (
    "/sessions/",
    "/_agents/",
    "/_company/",
    "/memory/",
    "/Project_Logs/",
    "/Harness_Research_",
    "/docs/records/",
    "/_Archive_Orphans/",
    "/Post_Drafts/",
    "/UX_Scenarios/",
)


def is_operational(rel_path: str) -> bool:
    """Return True when *rel_path* contains any ``EXCLUDE_FRAG`` fragment.

    Backslashes are converted to forward slashes and a leading "/" is
    prepended first, so a fragment such as "/sessions/" also matches a path
    that *starts* with "sessions/".
    """
    normalized = "/" + "/".join(rel_path.split("\\"))
    for fragment in EXCLUDE_FRAG:
        if fragment in normalized:
            return True
    return False
|
|
|
|
|
|
def load_clusters() -> list[dict]:
    """Parse and return the JSON document stored at ``CLUSTERS_JSON`` (UTF-8)."""
    with CLUSTERS_JSON.open(encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
|
|
def load_index() -> dict[str, dict]:
    """Load ``INDEX_JSON`` (UTF-8) and key its entries by their ``"path"``.

    If two entries share a path, the later one replaces the earlier one.
    """
    with INDEX_JSON.open(encoding="utf-8") as handle:
        entries = json.load(handle)

    by_path: dict[str, dict] = {}
    for entry in entries:
        by_path[entry["path"]] = entry
    return by_path
|
|
|
|
|
|
def pick_canonical(members: list[dict], idx: dict[str, dict]) -> dict:
|
|
def keyfn(m: dict) -> tuple:
|
|
e = idx.get(m["path"], {})
|
|
return (
|
|
-e.get("body_chars", 0),
|
|
-1 * (1 if e.get("fm_last_reinforced") else 0), # prefer files that have a date
|
|
(e.get("fm_last_reinforced") or "0000-00-00") * -1 if False else (e.get("fm_last_reinforced") or "0000-00-00"),
|
|
len(m["filename"]),
|
|
m["filename"],
|
|
)
|
|
# explicit: largest body, then most-recent last_reinforced (later date wins),
|
|
# then shortest filename, then lexicographic
|
|
def sort_key(m: dict) -> tuple:
|
|
e = idx.get(m["path"], {})
|
|
return (
|
|
-e.get("body_chars", 0), # bigger first
|
|
"0000-00-00" if not e.get("fm_last_reinforced") else _neg_date(e["fm_last_reinforced"]),
|
|
len(m["filename"]),
|
|
m["filename"],
|
|
)
|
|
return sorted(members, key=sort_key)[0]
|
|
|
|
|
|
def _neg_date(d: str) -> str:
|
|
# Map YYYY-MM-DD to a string that sorts later-dates-first when sorted ascending.
|
|
parts = d.split("-")
|
|
if len(parts) != 3:
|
|
return "ZZZZ"
|
|
try:
|
|
y = 9999 - int(parts[0])
|
|
m = 99 - int(parts[1])
|
|
day = 99 - int(parts[2])
|
|
return f"{y:04d}-{m:02d}-{day:02d}"
|
|
except ValueError:
|
|
return "ZZZZ"
|
|
|
|
|
|
# Full text of a redirect stub: YAML front matter followed by a short body
# that links to the canonical page. Placeholders filled by
# make_redirect_stub(): id, title, category, target, today.
REDIRECT_TEMPLATE = """---
id: {id}
title: {title}
category: {category}
status: merged
redirect_to: {target}
canonical_id: {target}
aliases: []
duplicate_of: none
source_trust_level: A
confidence_score: 0.92
tags: [redirect]
raw_sources: []
last_reinforced: {today}
github_commit: pending
inferred_by: Claude Opus 4.7 (auto-merge 2026-05-08)
---

# {title}

> [!IMPORTANT]
> 이 문서는 P-Reinforce Phase 2 자동 MERGE에 의해 **[[{target}]]**로 통합되었습니다.

---
*Redirected to: [[{target}]]*
"""


def make_redirect_stub(member: dict, canonical: dict, today: str) -> str:
    """Render the redirect stub that replaces a merged (non-canonical) page.

    Args:
        member: the page being replaced. ``member["filename"]`` (no
            extension) supplies the title; the optional ``member["folder"]``
            supplies the category.
        canonical: the surviving page; ``canonical["filename"]`` is used
            verbatim as the wiki-link target.
        today: date string, expected as YYYY-MM-DD. It is written to
            ``last_reinforced`` and, with dashes removed and cut to 8
            characters, embedded in the stub id.

    Returns:
        ``REDIRECT_TEMPLATE`` with every placeholder filled in.

    Raises:
        KeyError: if either dict has no "filename" key.
    """
    # Human-readable title: separators in the filename become spaces.
    title = member["filename"].replace("-", " ").replace("_", " ")
    # Id slug: lowercase, each run of characters outside [a-z0-9] becomes a
    # single "-", capped at 32 characters.
    slug = re.sub(r"[^a-z0-9]+", "-", title.lower())[:32]
    stub_id = f"wiki-{today.replace('-', '')[:8]}-{slug}-redir"
    return REDIRECT_TEMPLATE.format(
        id=stub_id,
        title=title,
        category=member.get("folder") or "10_Wiki/Topics",
        # The canonical filename is passed through unchanged so the
        # [[wiki-link]] names the file exactly as it exists on disk.
        target=canonical["filename"],
        today=today,
    )
|
|
|
|
|
|
@dataclass
class MergePlan:
    """Merge decision for one cluster: the file kept and the files replaced."""

    # Position of the cluster within the clusters list it was planned from.
    cluster_id: int
    # Path of the member chosen as canonical (kept untouched).
    canonical_path: str
    # Paths of every other member; these become redirect stubs.
    losers: list[str]
    # The norm_name value(s) collected from the cluster's members.
    norm_name_set: set[str]
|
|
|
|
|
|
def plan_merges(clusters: list[dict], idx: dict[str, dict]) -> tuple[list[MergePlan], list[dict]]:
    """Decide which clusters are safe to merge automatically.

    A cluster yields a plan only when all of the following hold:
      * its "size" is at least 2;
      * none of its members is on an operational path (``is_operational``);
      * every member has the same, non-empty ``norm_name`` in *idx*;
      * at least two members remain after dropping those whose path contains
        "01_Archive/".

    Args:
        clusters: cluster dicts with "size" and "members"; each member needs
            "path" and "filename". A cluster's position in this list is used
            as its ``cluster_id``.
        idx: index entries keyed by member path.

    Returns:
        ``(plans, skipped)``. *skipped* records the clusters rejected for an
        operational path or a norm_name mismatch as dicts with "cluster_id",
        "reason" and "members" (plus "norm_names" for a mismatch). Clusters
        that are too small are dropped without a record.
    """
    plans: list[MergePlan] = []
    skipped: list[dict] = []
    for cid, c in enumerate(clusters):
        size = c["size"]
        members = c["members"]
        if size < 2:
            continue

        member_paths = [m["path"] for m in members]

        # Any operational member disqualifies the whole cluster.
        if any(is_operational(p) for p in member_paths):
            skipped.append({"cluster_id": cid, "reason": "operational-path", "members": member_paths})
            continue

        # Every member must share one norm_name. A missing entry, a missing
        # key and an explicit null all collapse to "" so that none of them
        # can pass as a legitimate shared name.
        norm_names = {idx.get(p, {}).get("norm_name") or "" for p in member_paths}
        if len(norm_names) != 1 or "" in norm_names:
            skipped.append({
                "cluster_id": cid,
                "reason": "norm-name-mismatch",
                "norm_names": sorted(norm_names, key=str),  # stable order for logs
                "members": member_paths,
            })
            continue

        # NOTE(review): the module docstring says clusters mixing redirects
        # with a canonical are skipped and members that already carry
        # redirect_to are left alone, but nothing here checks for redirects.
        # Confirm which index field holds redirect_to and add that guard.

        # Drop members already under the archive (shouldn't happen but safe).
        # Separators are normalised first, the same way is_operational() does,
        # so backslash-style paths are caught as well.
        members = [m for m in members if "01_Archive/" not in m["path"].replace("\\", "/")]
        if len(members) < 2:
            continue

        canonical = pick_canonical(members, idx)
        losers = [m["path"] for m in members if m["path"] != canonical["path"]]
        plans.append(MergePlan(
            cluster_id=cid,
            canonical_path=canonical["path"],
            losers=losers,
            norm_name_set=norm_names,
        ))
    return plans, skipped
|
|
|
|
|
|
def apply_plan(plans: list[MergePlan], idx: dict[str, dict]) -> dict:
    """Execute merge plans on disk and write the markdown decision log.

    For each non-canonical member ("loser") that still exists, the original
    file is moved to ``ARCHIVE_BASE/<today>/<relative path>`` and a redirect
    stub pointing at the canonical page is written at the original location.

    Nothing is moved or written for:
      * a plan whose canonical file does not exist (a redirect to a missing
        page would strand the content);
      * a loser that no longer exists;
      * a loser whose archive destination already exists. On a same-day
        re-run the file at the original location is the stub written earlier,
        and moving it would overwrite the archived original.

    The log is written to ``LOG_MD`` even if a file operation raises, so the
    moves that did complete remain documented.

    Args:
        plans: merge plans; their paths are resolved against ``ROOT``.
        idx: unused; kept so existing callers keep working.

    Returns:
        ``{"merged": stubs written, "archived": files moved,
        "plans": len(plans)}``.
    """
    today = date.today().isoformat()
    archive_dir = ARCHIVE_BASE / today
    archive_dir.mkdir(parents=True, exist_ok=True)

    n_merged = 0
    n_archived = 0
    log_lines: list[str] = [
        f"# Auto-merge log — {today}\n",
        f"\n총 plan: **{len(plans)}** clusters\n",
    ]

    try:
        for plan in plans:
            canonical_abs = ROOT / plan.canonical_path
            # The planner stores exactly one name; tolerate an empty set.
            norm_name = next(iter(plan.norm_name_set), "")
            log_lines.append(f"\n## Cluster {plan.cluster_id} — canonical: `{plan.canonical_path}`")
            log_lines.append(f"\n- **norm_name**: `{norm_name}`")
            log_lines.append(f"- **canonical** (kept): [{plan.canonical_path}](/{plan.canonical_path})")

            if not canonical_abs.exists():
                log_lines.append("- **skipped**: canonical file is missing — no redirects written")
                continue

            log_lines.append("- **merged-into-redirect**:")
            canonical = {"filename": canonical_abs.stem, "path": plan.canonical_path}

            for loser_rel in plan.losers:
                loser_abs = ROOT / loser_rel
                if not loser_abs.exists():
                    log_lines.append(f" - ~~{loser_rel}~~ (already missing — skipped)")
                    continue

                # Archive location mirrors the original relative path.
                archive_target = archive_dir / loser_rel
                if archive_target.exists():
                    log_lines.append(f" - ~~{loser_rel}~~ (archive copy already exists — skipped)")
                    continue

                # Build the stub before touching the file system, so a
                # rendering error cannot leave a half-applied merge.
                member = {
                    "filename": loser_abs.stem,
                    "path": loser_rel,
                    "folder": loser_abs.parent.name,
                }
                stub_text = make_redirect_stub(member, canonical, today)

                archive_target.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(loser_abs), str(archive_target))
                n_archived += 1

                # Stub goes at the original location so wiki-links don't break.
                loser_abs.write_text(stub_text, encoding="utf-8")
                n_merged += 1
                log_lines.append(f" - {loser_rel} → archived to `01_Archive/MERGED/{today}/{loser_rel}`, replaced with redirect → `{canonical_abs.stem}`")
    finally:
        LOG_MD.write_text("\n".join(log_lines), encoding="utf-8")

    return {"merged": n_merged, "archived": n_archived, "plans": len(plans)}
|
|
|
|
|
|
def main() -> int:
    """Command-line entry point.

    Plans the merges and, unless "--dry-run" appears in ``sys.argv``, applies
    them. All messages go to stderr.

    Returns:
        2 when either input JSON file is missing, otherwise 0.
    """
    inputs_present = CLUSTERS_JSON.exists() and INDEX_JSON.exists()
    if not inputs_present:
        print("ERROR: run p_reinforce_index.py first", file=sys.stderr)
        return 2

    clusters = load_clusters()
    idx = load_index()
    plans, skipped = plan_merges(clusters, idx)
    print(f"Planned {len(plans)} merge clusters, skipped {len(skipped)}", file=sys.stderr)

    dry_run = "--dry-run" in sys.argv
    if dry_run:
        # Preview only: list at most the first 30 plans and change nothing.
        preview = plans[:30]
        for plan in preview:
            print(f" cluster {plan.cluster_id}: keep {plan.canonical_path}, redirect {len(plan.losers)}", file=sys.stderr)
        return 0

    result = apply_plan(plans, idx)
    merged, archived = result["merged"], result["archived"]
    print(f"DONE: merged={merged}, archived={archived}", file=sys.stderr)
    print(f"Log: {LOG_MD}", file=sys.stderr)
    return 0
|
|
|
|
|
|
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|