""" P-Reinforce Phase 1 — Duplicate Detection Indexer ================================================== Scans 10_Wiki/Topics/, builds an index of every .md file, and emits duplicate-candidate clusters into 20_Meta/ReviewQueue/. Read-only with respect to wiki content. No file is modified or moved. Outputs: 20_Meta/ReviewQueue/_index.json - per-file metadata 20_Meta/ReviewQueue/duplicate_candidates.md - human-readable cluster report 20_Meta/ReviewQueue/_clusters.json - machine-readable clusters Detection channels (any one match -> candidate cluster): 1. Normalized filename match (case-insensitive, strips spaces/underscores/hyphens/parens) 2. Normalized frontmatter title match 3. Normalized first-paragraph fingerprint (first 400 chars of body) 4. Alias intersection (frontmatter aliases overlap) Similarity tiers per P-Reinforce rules: >= 0.92 : near-duplicate (UPDATE candidate) 0.80-0.92 : duplicate candidate (ReviewQueue) 0.65-0.80 : related (link-only candidate) """ from __future__ import annotations import hashlib import json import os import re import sys import unicodedata from collections import defaultdict from dataclasses import dataclass, field, asdict from difflib import SequenceMatcher from pathlib import Path from typing import Iterable ROOT = Path(r"E:/Wiki/2nd") TOPICS = ROOT / "10_Wiki" / "Topics" OUT_DIR = ROOT / "20_Meta" / "ReviewQueue" INDEX_JSON = OUT_DIR / "_index.json" CLUSTERS_JSON = OUT_DIR / "_clusters.json" REPORT_MD = OUT_DIR / "duplicate_candidates.md" SKIP_DIR_NAMES = {".obsidian", ".git", "__pycache__", "node_modules"} # Path components that mark "operational logs / agent runtime", not knowledge. # Files under any of these are scanned for awareness but excluded from # duplicate-cluster building so they don't drown out real concept duplicates. EXCLUDE_PATH_FRAGMENTS = ( "/sessions/", "/_agents/", "/_company/", "/memory/", "/Project_Logs/", "/Harness_Research_", "/docs/records/", "/_Archive_Orphans/", "/Post_Drafts/", "/UX_Scenarios/", ) FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL) WIKI_LINK_RE = re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]") CODE_BLOCK_RE = re.compile(r"```.*?```", re.DOTALL) HEADING_RE = re.compile(r"^#{1,6}\s+.*$", re.MULTILINE) WHITESPACE_RE = re.compile(r"\s+") NONALNUM_RE = re.compile(r"[^0-9a-z가-힣]+") @dataclass class FileEntry: path: str # relative to ROOT abs_path: str folder: str # immediate parent folder name under Topics filename: str # base name without extension norm_name: str # normalized filename for matching title: str # H1 title or filename norm_title: str fm_id: str | None fm_aliases: list[str] = field(default_factory=list) fm_tags: list[str] = field(default_factory=list) fm_status: str | None = None fm_trust: str | None = None fm_last_reinforced: str | None = None fm_redirect_to: str | None = None # if present, this is a merged-stub placeholder fm_canonical_id: str | None = None body_chars: int = 0 body_first_para_hash: str = "" body_fingerprint: str = "" # short normalized excerpt for similarity is_stub: bool = False # body < 200 chars is_huge: bool = False # body > 50 KB is_redirect: bool = False # already-merged redirect placeholder is_operational: bool = False # under sessions/, _agents/, etc — excluded from clustering def normalize(s: str) -> str: """Aggressive normalization for fuzzy match.""" if not s: return "" s = unicodedata.normalize("NFKC", s).lower() s = NONALNUM_RE.sub("", s) return s def parse_frontmatter(text: str) -> tuple[dict, str]: """Cheap YAML-ish parser. Tolerates the malformed [[wiki-link]] tags and other quirks present in this wiki — no PyYAML dependency.""" m = FRONTMATTER_RE.match(text) if not m: return {}, text raw = m.group(1) body = text[m.end():] fm: dict = {} current_key: str | None = None for line in raw.splitlines(): if not line.strip() or line.lstrip().startswith("#"): continue if line.startswith((" ", "\t")) and current_key: fm[current_key] = (str(fm.get(current_key, "")) + " " + line.strip()).strip() continue if ":" not in line: continue key, _, val = line.partition(":") key = key.strip() val = val.strip() # list form if val.startswith("[") and val.endswith("]"): inner = val[1:-1].strip() items = [] for it in re.split(r",(?![^\[]*\])", inner): it = it.strip().strip("'\"") # strip [[wiki-link]] decoration to bare alias wm = WIKI_LINK_RE.fullmatch(it) if wm: it = wm.group(1) if it: items.append(it) fm[key] = items else: fm[key] = val.strip("'\"") current_key = key return fm, body def first_h1(body: str) -> str | None: for line in body.splitlines(): if line.startswith("# ") and not line.startswith("##"): return line[2:].strip().lstrip("[").rstrip("]").split("|")[0].strip() return None def fingerprint_body(body: str, max_chars: int = 600) -> str: """Strip frontmatter/headings/code/links, lowercase, collapse whitespace, take leading max_chars. Used for SequenceMatcher similarity.""" b = CODE_BLOCK_RE.sub(" ", body) b = HEADING_RE.sub(" ", b) b = WIKI_LINK_RE.sub(lambda m: m.group(1), b) b = re.sub(r"[*_`>#\-]+", " ", b) b = WHITESPACE_RE.sub(" ", b).strip().lower() return b[:max_chars] def first_para_hash(body: str) -> str: fp = fingerprint_body(body, 400) if not fp: return "" return hashlib.sha1(fp.encode("utf-8")).hexdigest()[:12] def iter_md_files(root: Path) -> Iterable[Path]: for dirpath, dirs, files in os.walk(root): dirs[:] = [d for d in dirs if d not in SKIP_DIR_NAMES] for f in files: if f.endswith(".md"): yield Path(dirpath) / f def scan() -> list[FileEntry]: entries: list[FileEntry] = [] for p in iter_md_files(TOPICS): try: text = p.read_text(encoding="utf-8", errors="replace") except OSError as e: print(f"WARN read fail {p}: {e}", file=sys.stderr) continue fm, body = parse_frontmatter(text) filename = p.stem title = first_h1(body) or filename body_strip = body.strip() redirect_to = fm.get("redirect_to") if isinstance(redirect_to, list): redirect_to = redirect_to[0] if redirect_to else None is_redirect = bool(redirect_to) or ( title.strip().lower() == "redirect" and len(body_strip) < 400 ) rel_path = str(p.relative_to(ROOT)).replace("\\", "/") is_operational = any(frag in "/" + rel_path for frag in EXCLUDE_PATH_FRAGMENTS) e = FileEntry( path=rel_path, abs_path=str(p), folder=p.parent.name, filename=filename, norm_name=normalize(filename), title=title, norm_title=normalize(title), fm_id=str(fm.get("id")) if fm.get("id") else None, fm_aliases=fm.get("aliases", []) if isinstance(fm.get("aliases"), list) else [], fm_tags=fm.get("tags", []) if isinstance(fm.get("tags"), list) else [], fm_status=fm.get("status"), fm_trust=fm.get("source_trust_level"), fm_last_reinforced=fm.get("last_reinforced"), fm_redirect_to=str(redirect_to) if redirect_to else None, fm_canonical_id=str(fm.get("canonical_id")) if fm.get("canonical_id") else None, body_chars=len(body_strip), body_first_para_hash=first_para_hash(body_strip), body_fingerprint=fingerprint_body(body_strip, 600), is_stub=len(body_strip) < 200, is_huge=len(body_strip) > 50000, is_redirect=is_redirect, is_operational=is_operational, ) entries.append(e) return entries def build_clusters(entries: list[FileEntry]) -> list[list[FileEntry]]: """Union-find by exact-match channels: norm_name, norm_title, body_first_para_hash. Redirect placeholders are NEVER unioned via body fingerprint (they all share the same boilerplate, which would create a giant false-positive cluster). They're still unioned via norm_name / norm_title so that a redirect and its canonical document end up in the same cluster — that's the relationship we want to surface. """ parent = list(range(len(entries))) def find(x: int) -> int: while parent[x] != x: parent[x] = parent[parent[x]] x = parent[x] return x def union(a: int, b: int) -> None: ra, rb = find(a), find(b) if ra != rb: parent[ra] = rb by_name: dict[str, list[int]] = defaultdict(list) by_title: dict[str, list[int]] = defaultdict(list) by_hash: dict[str, list[int]] = defaultdict(list) for i, e in enumerate(entries): if e.is_operational: continue # session/agent runtime files: not knowledge candidates if e.norm_name: by_name[e.norm_name].append(i) if e.norm_title and not e.is_redirect: # redirects all titled "Redirect" by_title[e.norm_title].append(i) # body fingerprint: only meaningful, non-redirect, non-stub bodies if ( e.body_first_para_hash and len(e.body_fingerprint) >= 200 and not e.is_redirect and not e.is_stub ): by_hash[e.body_first_para_hash].append(i) for group in list(by_name.values()) + list(by_title.values()) + list(by_hash.values()): if len(group) > 1: for i in group[1:]: union(group[0], i) clusters_map: dict[int, list[int]] = defaultdict(list) for i in range(len(entries)): clusters_map[find(i)].append(i) clusters = [[entries[i] for i in idxs] for idxs in clusters_map.values() if len(idxs) > 1] # sort: largest cluster first, then by first member's title clusters.sort(key=lambda c: (-len(c), c[0].norm_title or c[0].norm_name)) return clusters def cluster_similarity(c: list[FileEntry]) -> dict: """Compute pairwise body fingerprint similarity within a cluster. Returns max/min/avg similarity and the dominant tier.""" if len(c) < 2: return {"max": 1.0, "min": 1.0, "avg": 1.0, "tier": "solo"} sims: list[float] = [] for i in range(len(c)): for j in range(i + 1, len(c)): a = c[i].body_fingerprint b = c[j].body_fingerprint if not a or not b: sims.append(0.5) continue sims.append(SequenceMatcher(None, a, b).ratio()) if not sims: return {"max": 0.0, "min": 0.0, "avg": 0.0, "tier": "unknown"} mx, mn = max(sims), min(sims) avg = sum(sims) / len(sims) tier = ( "near-dup (>=0.92)" if mx >= 0.92 else "duplicate-candidate (0.80-0.92)" if mx >= 0.80 else "related (0.65-0.80)" if mx >= 0.65 else "weak-link (<0.65)" ) return {"max": round(mx, 3), "min": round(mn, 3), "avg": round(avg, 3), "tier": tier} def write_index(entries: list[FileEntry]) -> None: OUT_DIR.mkdir(parents=True, exist_ok=True) with INDEX_JSON.open("w", encoding="utf-8") as f: json.dump([asdict(e) for e in entries], f, ensure_ascii=False, indent=1) def write_clusters(clusters: list[list[FileEntry]], stats_per_cluster: list[dict]) -> None: payload = [] for c, s in zip(clusters, stats_per_cluster): payload.append({ "size": len(c), "stats": s, "members": [ { "path": e.path, "folder": e.folder, "filename": e.filename, "title": e.title, "body_chars": e.body_chars, "fm_trust": e.fm_trust, "fm_last_reinforced": e.fm_last_reinforced, "is_stub": e.is_stub, } for e in c ], }) with CLUSTERS_JSON.open("w", encoding="utf-8") as f: json.dump(payload, f, ensure_ascii=False, indent=1) def write_report(entries: list[FileEntry], clusters: list[list[FileEntry]], stats: list[dict]) -> None: n_files = len(entries) n_clustered = sum(len(c) for c in clusters) n_stub = sum(1 for e in entries if e.is_stub) n_huge = sum(1 for e in entries if e.is_huge) n_redirect = sum(1 for e in entries if e.is_redirect) n_operational = sum(1 for e in entries if e.is_operational) near_dup = [c for c, s in zip(clusters, stats) if s["max"] >= 0.92] dup_cand = [c for c, s in zip(clusters, stats) if 0.80 <= s["max"] < 0.92] related = [c for c, s in zip(clusters, stats) if 0.65 <= s["max"] < 0.80] folder_dup_pairs: dict[tuple[str, str], int] = defaultdict(int) for c in clusters: folders = sorted({e.folder for e in c}) if len(folders) >= 2: for i in range(len(folders)): for j in range(i + 1, len(folders)): folder_dup_pairs[(folders[i], folders[j])] += 1 lines: list[str] = [] lines.append("# Duplicate Candidates (P-Reinforce Phase 1 Index)\n") lines.append("> 자동 생성. 이 보고서는 **변경 제안**일 뿐 실제 파일은 수정되지 않았다.\n") lines.append("> 사용자가 클러스터별로 검토하고 MERGE/UPDATE/CREATE/REJECT 판단을 내려야 한다.\n") lines.append("") lines.append("## 요약\n") lines.append(f"- 총 파일: **{n_files}**") lines.append(f"- 중복 후보 클러스터에 포함된 파일: **{n_clustered}**") lines.append(f"- 클러스터 수: **{len(clusters)}** (>=0.92 near-dup: {len(near_dup)}, 0.80-0.92 dup-cand: {len(dup_cand)}, 0.65-0.80 related: {len(related)})") lines.append(f"- 이미 merged (`redirect_to` 필드 보유): **{n_redirect}**") lines.append(f"- 운영 로그 (sessions/_agents/_company 등, 클러스터링 제외): **{n_operational}**") lines.append(f"- 지식 문서 후보 (총수 - 운영 로그): **{n_files - n_operational}**") lines.append(f"- 빈약 stub (<200 chars, redirect 제외): **{n_stub - n_redirect}**") lines.append(f"- 거대 문서 (>50KB): **{n_huge}**") lines.append("") if folder_dup_pairs: lines.append("## 폴더 간 중복 핫스팟 (Top 20)\n") lines.append("| 폴더 A | 폴더 B | 공유 클러스터 |") lines.append("|---|---|---|") for (a, b), n in sorted(folder_dup_pairs.items(), key=lambda x: -x[1])[:20]: lines.append(f"| `{a}` | `{b}` | {n} |") lines.append("") def emit_section(title: str, group: list[list[FileEntry]], group_stats: list[dict], cap: int = 80) -> None: if not group: return lines.append(f"## {title} (총 {len(group)})\n") if len(group) > cap: lines.append(f"> 상위 {cap}개만 표시. 전체는 `_clusters.json` 참조.\n") for c, s in list(zip(group, group_stats))[:cap]: head = c[0].title or c[0].filename lines.append(f"### `{head}` (members: {len(c)}, max_sim: {s['max']}, tier: {s['tier']})") for e in c: stub_tag = " *[stub]*" if e.is_stub else "" huge_tag = " *[huge]*" if e.is_huge else "" lr = e.fm_last_reinforced or "?" trust = e.fm_trust or "?" lines.append(f"- [{e.path}]({e.path}) — {e.body_chars} chars, trust={trust}, last={lr}{stub_tag}{huge_tag}") lines.append("") pairs = list(zip(clusters, stats)) emit_section("🔴 Near-duplicate (>=0.92) — UPDATE 권장", [c for c, s in pairs if s["max"] >= 0.92], [s for c, s in pairs if s["max"] >= 0.92]) emit_section("🟡 Duplicate candidate (0.80-0.92) — 검토 필요", [c for c, s in pairs if 0.80 <= s["max"] < 0.92], [s for c, s in pairs if 0.80 <= s["max"] < 0.92]) emit_section("🟢 Related (0.65-0.80) — 연결만 권장", [c for c, s in pairs if 0.65 <= s["max"] < 0.80], [s for c, s in pairs if 0.65 <= s["max"] < 0.80]) emit_section("⚪ Weak-link (<0.65) — 동명/동일 hash지만 내용 다름", [c for c, s in pairs if s["max"] < 0.65], [s for c, s in pairs if s["max"] < 0.65]) REPORT_MD.write_text("\n".join(lines), encoding="utf-8") def main() -> None: print(f"[1/4] Scanning {TOPICS} ...", file=sys.stderr) entries = scan() print(f" {len(entries)} files indexed", file=sys.stderr) print(f"[2/4] Writing per-file index -> {INDEX_JSON}", file=sys.stderr) write_index(entries) print(f"[3/4] Building duplicate clusters ...", file=sys.stderr) clusters = build_clusters(entries) stats = [cluster_similarity(c) for c in clusters] print(f" {len(clusters)} clusters with >=2 members", file=sys.stderr) print(f"[4/4] Writing report -> {REPORT_MD}", file=sys.stderr) write_clusters(clusters, stats) write_report(entries, clusters, stats) print("DONE.", file=sys.stderr) if __name__ == "__main__": main()