""" P-Reinforce Phase 1 β€” Template Normalizer (DRY-RUN by default) ============================================================== Reads a wiki .md file and emits a normalized version that conforms to the current P-Reinforce template (templates/wiki_document.md). What it does mechanically (NO LLM calls): 1. Cleans frontmatter: - Strips [[wiki-link]] decoration from id/category/tags - Adds missing fields: canonical_id, aliases, status, source_trust_level, raw_sources, duplicate_of, tech_stack (if tech) - Re-derives source_trust_level from confidence_score if missing - Preserves the original id under `legacy_id` and `aliases` 2. Renames legacy section headers to current template: - "Brief Summary" -> "πŸ“Œ ν•œ 쀄 톡찰 (The Karpathy Summary)" - "Core Content" -> "πŸ“– κ΅¬μ‘°ν™”λœ 지식 (Synthesized Content)" - "Trade-offs & Caveats" -> merged into "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ" - "Knowledge Connections" -> "πŸ”— 지식 μ—°κ²° (Graph)" 3. Adds missing required sections as scaffold (with TODO markers) so LLM can fill them later. Tech docs get Code Patterns / Decision Criteria / Anti-Patterns scaffolds. 4. For stubs (<200 body chars), inserts a `πŸ€– [AI μΆ”λ‘  보강 ν•„μš”]` block β€” does NOT generate content (that step is interactive). Usage: python p_reinforce_normalize.py # dry-run, prints diff python p_reinforce_normalize.py --out PATH # write to PATH python p_reinforce_normalize.py --batch # multi-file dry-run Default mode is DRY-RUN. No source files are modified. """ from __future__ import annotations import argparse import json import re import sys import unicodedata from datetime import date from pathlib import Path ROOT = Path(r"E:/Wiki/2nd") TOPICS = ROOT / "10_Wiki" / "Topics" FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL) WIKI_LINK_RE = re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]") H1_RE = re.compile(r"^# +(.+)$", re.MULTILINE) HEADING_RE = re.compile(r"^(#{2,6})\s+(.+?)\s*$", re.MULTILINE) CODE_FENCE_RE = re.compile(r"```([a-zA-Z0-9_-]*)\n", re.MULTILINE) # Header rename map: matched against the heading text only (after stripping # leading emoji/punctuation). Keys are normalized lowercase. HEADER_RENAMES = { "brief summary": "πŸ“Œ ν•œ 쀄 톡찰 (The Karpathy Summary)", "the karpathy summary": "πŸ“Œ ν•œ 쀄 톡찰 (The Karpathy Summary)", "ν•œ 쀄 톡찰": "πŸ“Œ ν•œ 쀄 톡찰 (The Karpathy Summary)", "ν•œ 쀄 톡찰 (the karpathy summary)": "πŸ“Œ ν•œ 쀄 톡찰 (The Karpathy Summary)", "core content": "πŸ“– κ΅¬μ‘°ν™”λœ 지식 (Synthesized Content)", "synthesized content": "πŸ“– κ΅¬μ‘°ν™”λœ 지식 (Synthesized Content)", "κ΅¬μ‘°ν™”λœ 지식": "πŸ“– κ΅¬μ‘°ν™”λœ 지식 (Synthesized Content)", "κ΅¬μ‘°ν™”λœ 지식 (synthesized content)": "πŸ“– κ΅¬μ‘°ν™”λœ 지식 (Synthesized Content)", "trade-offs & caveats": "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "tradeoffs & caveats": "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "trade-offs and caveats": "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "contradictions & rl update": "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "contradictions & updates": "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "rl update": "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ": "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (contradictions & updates)": "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (contradictions & rl update)": "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "knowledge connections": "πŸ”— 지식 μ—°κ²° (Graph)", "graph": "πŸ”— 지식 μ—°κ²° (Graph)", "지식 μ—°κ²°": "πŸ”— 지식 μ—°κ²° (Graph)", "지식 μ—°κ²° (graph)": "πŸ”— 지식 μ—°κ²° (Graph)", "code patterns": "πŸ’» μ½”λ“œ νŒ¨ν„΄ (Code Patterns)", "μ½”λ“œ νŒ¨ν„΄": "πŸ’» μ½”λ“œ νŒ¨ν„΄ (Code Patterns)", "decision criteria": "πŸ€” μ˜μ‚¬κ²°μ • κΈ°μ€€ (Decision Criteria)", "μ˜μ‚¬κ²°μ • κΈ°μ€€": "πŸ€” μ˜μ‚¬κ²°μ • κΈ°μ€€ (Decision Criteria)", "anti-patterns": "❌ μ•ˆν‹°νŒ¨ν„΄ (Anti-Patterns)", "antipatterns": "❌ μ•ˆν‹°νŒ¨ν„΄ (Anti-Patterns)", "μ•ˆν‹°νŒ¨ν„΄": "❌ μ•ˆν‹°νŒ¨ν„΄ (Anti-Patterns)", "how to use this knowledge": "πŸ€– LLM ν™œμš© 힌트 (How to Use This Knowledge)", "llm ν™œμš© 힌트": "πŸ€– LLM ν™œμš© 힌트 (How to Use This Knowledge)", "validation": "πŸ§ͺ 검증 μƒνƒœ (Validation)", "검증 μƒνƒœ": "πŸ§ͺ 검증 μƒνƒœ (Validation)", "duplicate check": "🧬 쀑볡 검사 (Duplicate Check)", "쀑볡 검사": "🧬 쀑볡 검사 (Duplicate Check)", "changelog": "πŸ•“ λ³€κ²½ 이λ ₯ (Changelog)", "λ³€κ²½ 이λ ₯": "πŸ•“ λ³€κ²½ 이λ ₯ (Changelog)", } EMOJI_AT_START_RE = re.compile(r"^[^\wκ°€-힣]*", re.UNICODE) # folder -> category mapping (best-effort; not authoritative) FOLDER_CATEGORY_HINTS = { "AI": "10_Wiki/Topics", "AI_and_ML": "10_Wiki/Topics", "Architecture": "10_Wiki/Topics", "Backend": "10_Wiki/Topics", "Frontend": "10_Wiki/Topics", "Frontend_Mastery": "10_Wiki/Topics", "DevOps_and_Security": "10_Wiki/Topics", "Computer_Science_and_Theory": "10_Wiki/Topics", "Programming & Language": "10_Wiki/Topics", "Programming & Tools": "10_Wiki/Topics", "Programming & Web": "10_Wiki/Topics", "Programming & Formal Methods": "10_Wiki/Topics", "Visual_Effects": "10_Wiki/Topics_Art", "Graphics & Performance": "10_Wiki/Topics_Art", "UI_UX_Assets": "10_Wiki/Topics_Art", "Design & Experience": "10_Wiki/Topics_Art", "Game Design": "10_Wiki/Topics_GD", "Game_Design": "10_Wiki/Topics_GD", "Level_Design": "10_Wiki/Topics_GD", "Balancing": "10_Wiki/Topics_GD", "Core_Systems": "10_Wiki/Topics_GD", "Skybound": "10_Wiki/Topics_GD", "Storytelling": "10_Wiki/Topics_GD", "Economics": "10_Wiki/Topics_Biz", "Economy": "10_Wiki/Topics_Biz", "Economics & Algorithms": "10_Wiki/Topics_Biz", "Business_Strategy": "10_Wiki/Topics_Biz", "Market_Research": "10_Wiki/Topics_Biz", "Partnerships": "10_Wiki/Topics_Biz", "Content_Strategy": "10_Wiki/Topics_Blog", "Post_Drafts": "10_Wiki/Topics_Blog", "External_Media": "10_Wiki/Topics_Blog", } TECH_KEYWORDS = { "architecture", "algorithm", "programming", "code", "frontend", "backend", "compiler", "interpreter", "runtime", "framework", "api", "rest", "graphql", "kubernetes", "docker", "kafka", "fastapi", "react", "vue", "svelte", "next", "typescript", "python", "rust", "javascript", "go ", "java ", "c++", "swift", "database", "sql", "postgres", "redis", "mongo", "cache", "distributed", } def parse_frontmatter(text: str) -> tuple[dict, str, str]: """Returns (fm_dict, body, fm_raw). Same forgiving parser as indexer.""" m = FRONTMATTER_RE.match(text) if not m: return {}, text, "" raw = m.group(1) body = text[m.end():] fm: dict = {} current_key: str | None = None for line in raw.splitlines(): if not line.strip() or line.lstrip().startswith("#"): continue if line.startswith((" ", "\t")) and current_key: existing = fm.get(current_key) fm[current_key] = (str(existing) + " " + line.strip()).strip() if existing else line.strip() continue if ":" not in line: continue key, _, val = line.partition(":") key = key.strip() val = val.strip() if val.startswith("[") and val.endswith("]"): inner = val[1:-1].strip() items = [] for it in re.split(r",(?![^\[]*\])", inner): it = it.strip().strip("'\"") wm = WIKI_LINK_RE.fullmatch(it) if wm: it = wm.group(1) if it: items.append(it) fm[key] = items else: # strip [[wiki-link|alias]] decoration if scalar wm = WIKI_LINK_RE.fullmatch(val.strip("'\"")) if val else None if wm: val = wm.group(1) fm[key] = val.strip("'\"") current_key = key return fm, body, raw def detect_tech(folder: str, tags: list[str], body: str) -> bool: haystack = (folder + " " + " ".join(tags) + " " + body[:2000]).lower() if CODE_FENCE_RE.search(body): return True return any(k in haystack for k in TECH_KEYWORDS) def trust_from_confidence(conf_str: str | None) -> str: """Per-user policy (2026-05-08): LLM-augmented entries are graded A (with `inferred_by` metadata for traceability), not C as the default P-Reinforce skill suggests. The mapping below is therefore biased upward relative to the skill spec. """ if not conf_str: return "A" # user policy: trust the model try: c = float(conf_str) except (TypeError, ValueError): return "A" if c >= 0.95: return "A" if c >= 0.80: return "A" if c >= 0.65: return "B" return "C" # Markers in body text that indicate a redirect even without `redirect_to` # in the frontmatter (e.g. older P-Reinforce passes left text-only redirects). TEXT_REDIRECT_PATTERNS = [ re.compile(r"\*?Redirected to:\s*\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]\*?", re.IGNORECASE), re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]\s*(?:으|둜|둜)?\s*ν†΅ν•©λ˜μ—ˆμŠ΅λ‹ˆλ‹€"), re.compile(r"ν†΅ν•©λ˜μ—ˆμŠ΅λ‹ˆλ‹€.*?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]"), re.compile(r"이 λ¬Έμ„œλŠ”.*?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\].*?둜 톡합"), ] def detect_text_redirect(body: str) -> str | None: """Look for "Redirected to: [[X]]" or "[[X]]둜 ν†΅ν•©λ˜μ—ˆμŠ΅λ‹ˆλ‹€" in body. Returns the canonical target name if found, else None.""" body_top = body[:1500] # only check the top of the doc for pat in TEXT_REDIRECT_PATTERNS: m = pat.search(body_top) if m: return m.group(1).strip() return None def slugify_id(filename: str, today: str | None = None) -> str: today = today or date.today().isoformat().replace("-", "")[:8] slug = unicodedata.normalize("NFKC", filename).lower() slug = re.sub(r"[^a-z0-9κ°€-힣]+", "-", slug).strip("-") if not slug: slug = "doc" return f"wiki-{today[:4]}-{today[4:8]}-{slug[:32]}" def quote_yaml(s: str) -> str: if s is None: return '""' s = str(s) if any(ch in s for ch in [':', '#', '[', ']', '{', '}', ',', '&', '*', '!', '|', '>', "'", '"', '%', '@', '`']) or "\n" in s: return '"' + s.replace('"', '\\"') + '"' return s def build_frontmatter(fm: dict, file_path: Path, body: str) -> dict: """Compute the new normalized frontmatter from old fm + path heuristics.""" folder = file_path.parent.name filename = file_path.stem new_fm: dict = {} # Detect text-only redirect (no redirect_to field but body says so) text_redirect = detect_text_redirect(body) if not fm.get("redirect_to") else None # ID β€” preserve legacy id if it already looks like our slug legacy_id = fm.get("id") legacy_id_str = str(legacy_id) if legacy_id else "" if legacy_id_str.startswith("wiki-") and not WIKI_LINK_RE.search(legacy_id_str): new_id = legacy_id_str else: new_id = slugify_id(filename) new_fm["id"] = new_id # title title = filename.replace("-", " ").replace("_", " ").strip() new_fm["title"] = title # category β€” strip emoji-decorated old categories, normalize wiki-link form cat = fm.get("category") if cat in (None, "", "Unified") or (isinstance(cat, str) and cat.startswith("[[")): cat = FOLDER_CATEGORY_HINTS.get(folder, "10_Wiki/Topics") elif isinstance(cat, str) and "πŸ’‘" in cat: # "10_Wiki/πŸ’‘ Topics/AI" -> hinted category cat = FOLDER_CATEGORY_HINTS.get(folder, "10_Wiki/Topics") new_fm["category"] = cat # If this file is a redirect, override status and add redirect_to. if fm.get("redirect_to") or text_redirect: target = fm.get("redirect_to") or text_redirect new_fm["redirect_to"] = target new_fm["status"] = "merged" new_fm["canonical_id"] = fm.get("canonical_id") or target # best-effort else: # status: draft for stubs, needs_review otherwise; verified only if pre-tagged existing_status = fm.get("status") if existing_status in ("verified", "merged", "deprecated"): new_fm["status"] = existing_status else: new_fm["status"] = "draft" if len(body.strip()) < 200 else "needs_review" new_fm["canonical_id"] = fm.get("canonical_id") or "self" # aliases β€” preserve legacy id and any prior aliases aliases = fm.get("aliases") or [] if isinstance(aliases, str): aliases = [aliases] if legacy_id and isinstance(legacy_id, str): legacy_id_clean = WIKI_LINK_RE.sub(r"\1", legacy_id) if legacy_id_clean and legacy_id_clean not in aliases: aliases = [legacy_id_clean] + aliases new_fm["aliases"] = aliases # duplicate_of new_fm["duplicate_of"] = fm.get("duplicate_of") or "none" # source_trust_level new_fm["source_trust_level"] = fm.get("source_trust_level") or trust_from_confidence(fm.get("confidence_score")) # confidence_score β€” preserve if present, else policy default 0.92 cs = fm.get("confidence_score") if cs: try: new_fm["confidence_score"] = float(cs) except (TypeError, ValueError): new_fm["confidence_score"] = 0.92 else: new_fm["confidence_score"] = 0.92 # tags β€” strip wiki link decoration tags = fm.get("tags") or [] if isinstance(tags, str): tags = [tags] cleaned_tags = [] for t in tags: if isinstance(t, str): t = WIKI_LINK_RE.sub(r"\1", t).strip("[]'\" ") if t: cleaned_tags.append(t) new_fm["tags"] = cleaned_tags or ["uncategorized"] # raw_sources rs = fm.get("raw_sources") or [] if isinstance(rs, str): rs = [rs] new_fm["raw_sources"] = rs # last_reinforced new_fm["last_reinforced"] = fm.get("last_reinforced") or date.today().isoformat() # github_commit new_fm["github_commit"] = fm.get("github_commit") or "pending" # inferred_by β€” traceability for LLM-augmented entries (per user policy) if fm.get("inferred_by"): new_fm["inferred_by"] = fm["inferred_by"] elif new_fm["source_trust_level"] == "A" and not fm.get("source_trust_level"): # we just promoted this to A; record provenance new_fm["inferred_by"] = "Claude Opus 4.7 (auto-normalize 2026-05-08)" # tech_stack β€” only if detected if detect_tech(folder, cleaned_tags, body): ts_old = fm.get("tech_stack") if isinstance(ts_old, dict): new_fm["tech_stack"] = ts_old else: new_fm["tech_stack"] = {"language": "unspecified", "framework": "unspecified"} return new_fm def render_frontmatter(fm: dict) -> str: lines = ["---"] order = [ "id", "title", "category", "status", "redirect_to", "canonical_id", "aliases", "duplicate_of", "source_trust_level", "confidence_score", "tags", "raw_sources", "last_reinforced", "github_commit", "inferred_by", "tech_stack", ] for k in order: if k not in fm: continue v = fm[k] if isinstance(v, list): if not v: lines.append(f"{k}: []") else: items = ", ".join(quote_yaml(x) for x in v) lines.append(f"{k}: [{items}]") elif isinstance(v, dict): lines.append(f"{k}:") for kk, vv in v.items(): lines.append(f" {kk}: {quote_yaml(vv)}") elif isinstance(v, float): lines.append(f"{k}: {v}") else: lines.append(f"{k}: {quote_yaml(v)}") lines.append("---") return "\n".join(lines) def normalize_headers(body: str) -> str: """Rewrite legacy section headers to match the template names.""" def repl(m: re.Match) -> str: hashes = m.group(1) text = m.group(2).strip() # strip leading emoji/punctuation/numbering for matching norm = EMOJI_AT_START_RE.sub("", text).strip() norm = re.sub(r"\s*\(.*?\)\s*$", "", norm).strip() # drop trailing paren key = norm.lower() # also try without punctuation if key not in HEADER_RENAMES: key2 = re.sub(r"[^\wκ°€-힣 -]", "", key).strip() if key2 in HEADER_RENAMES: key = key2 if key in HEADER_RENAMES: return f"{hashes} {HEADER_RENAMES[key]}" return m.group(0) return HEADING_RE.sub(repl, body) def find_section_starts(body: str) -> dict[str, int]: """Return {section_canonical_name: line_offset}""" found = {} for m in HEADING_RE.finditer(body): text = m.group(2).strip() norm = EMOJI_AT_START_RE.sub("", text).strip().lower() norm = re.sub(r"\s*\(.*?\)\s*$", "", norm).strip() for k, canonical in HEADER_RENAMES.items(): if norm == k.lower() or text == canonical: if canonical not in found: found[canonical] = m.start() return found REQUIRED_SECTIONS_BASE = [ "πŸ“Œ ν•œ 쀄 톡찰 (The Karpathy Summary)", "πŸ“– κ΅¬μ‘°ν™”λœ 지식 (Synthesized Content)", "πŸ€– LLM ν™œμš© 힌트 (How to Use This Knowledge)", "πŸ§ͺ 검증 μƒνƒœ (Validation)", "🧬 쀑볡 검사 (Duplicate Check)", "⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & Updates)", "πŸ”— 지식 μ—°κ²° (Graph)", "πŸ•“ λ³€κ²½ 이λ ₯ (Changelog)", ] REQUIRED_SECTIONS_TECH_EXTRA = [ "πŸ’» μ½”λ“œ νŒ¨ν„΄ (Code Patterns)", "πŸ€” μ˜μ‚¬κ²°μ • κΈ°μ€€ (Decision Criteria)", "❌ μ•ˆν‹°νŒ¨ν„΄ (Anti-Patterns)", ] def append_missing_sections(body: str, is_tech: bool, is_stub: bool, fm: dict) -> str: """For each required section not present, append a scaffold with TODO marker.""" found = find_section_starts(body) missing = [] base = REQUIRED_SECTIONS_BASE + (REQUIRED_SECTIONS_TECH_EXTRA if is_tech else []) for s in base: if s not in found: missing.append(s) if not missing and not is_stub: return body out = [body.rstrip()] if is_stub: out.append("\n\n> πŸ€– **[AI μΆ”λ‘  보강 ν•„μš”]** β€” 본문이 200자 미만이라 P-Reinforceκ°€ λΉˆμ•½ stub으둜 λΆ„λ₯˜ν–ˆμŠ΅λ‹ˆλ‹€.") out.append(f"> source_trust_level=`C` (AI 보강뢄), confidence_score=`{fm.get('confidence_score', 0.7)}`둜 ν‘œμ‹œλ˜μ–΄ μžˆμŠ΅λ‹ˆλ‹€.") out.append("> μ‚¬μš©μž 검증 ν›„ trust_level 상ν–₯ μ‘°μ • κ°€λŠ₯.\n") for s in missing: out.append(f"\n## {s}\n") out.append(_scaffold_for(s, fm)) return "\n".join(out) def _scaffold_for(section: str, fm: dict) -> str: if section.startswith("πŸ“Œ"): return "> *(TODO: ν•œ λ¬Έμž₯으둜 핡심 톡찰을 μž‘μ„±. \"XλŠ” Y μ‘°κ±΄μ—μ„œ Z 효과λ₯Ό λ‚Έλ‹€\" ꡬ쑰 ꢌμž₯.)*" if section.startswith("πŸ“–"): return "**μΆ”μΆœλœ νŒ¨ν„΄:**\n> *(TODO)*\n\n**μ„ΈλΆ€ λ‚΄μš©:**\n- *(TODO)*" if section.startswith("πŸ’»"): return "**νŒ¨ν„΄ 1:** *(TODO: 이 ν”„λ‘œμ νŠΈ μ»¨λ²€μ…˜ λ°˜μ˜ν•œ ꡬ쑰 μŠ€μΌˆλ ˆν†€)*\n\n```text\n# TODO\n```" if section.startswith("πŸ€”"): return "**선택 Aλ₯Ό 써야 ν•  λ•Œ:**\n- *(TODO)*\n\n**선택 Bλ₯Ό 써야 ν•  λ•Œ:**\n- *(TODO)*\n\n**κΈ°λ³Έκ°’:**\n> *(TODO)*" if section.startswith("❌"): return "- **[μ•ˆν‹°νŒ¨ν„΄]:** *(TODO: 무엇을 ν•˜λ©΄ μ•ˆ λ˜λŠ”κ°€ + 이유 + λŒ€μ‹  무엇을)*" if section.startswith("πŸ€–") and "ν™œμš© 힌트" in section: return "**μ–Έμ œ 이 지식을 μ“°λŠ”κ°€:**\n- *(TODO)*\n\n**μ–Έμ œ μ“°λ©΄ μ•ˆ λ˜λŠ”κ°€:**\n- *(TODO)*" if section.startswith("πŸ§ͺ"): return ( f"- **정보 μƒνƒœ:** {fm.get('status', 'draft')}\n" f"- **좜처 신뒰도:** {fm.get('source_trust_level', 'C')}\n" f"- **κ²€ν†  이유:** *(P-Reinforce Phase 1 μžλ™ μ •κ·œν™”. λ³Έλ¬Έ 검증 ν•„μš”.)*" ) if section.startswith("🧬"): return ( "- **κΈ°μ‘΄ μœ μ‚¬ λ¬Έμ„œ:** *(TODO: μΈλ±μ„œ ν΄λŸ¬μŠ€ν„° 리포트 μ°Έμ‘°)*\n" "- **처리 방식:** UPDATE (μžλ™ μ •κ·œν™”)\n" "- **처리 이유:** Phase 1 μ •κ·œν™” β€” μ˜› ν…œν”Œλ¦Ώ/λˆ„λ½ ν•„λ“œ 보강." ) if section.startswith("⚠️"): return "- **κ³Όκ±° λ°μ΄ν„°μ™€μ˜ 좩돌:** μ—†μŒ\n- **μ •μ±… λ³€ν™”:** μ—†μŒ" if section.startswith("πŸ”—"): return ( "- **Parent:** [[10_Wiki/Topics]]\n" "- **Related:** *(TODO: μ΅œμ†Œ 2개)*\n" "- **Opposite / Trade-off:** *(TODO)*\n" "- **Raw Source:** 직접 μž…λ ₯" ) if section.startswith("πŸ•“"): today = date.today().isoformat() trust = fm.get("source_trust_level", "C") return ( "| λ‚ μ§œ | λ³€κ²½ λ‚΄μš© | 처리 방식 | 신뒰도 |\n" "|------|-----------|-----------|--------|\n" f"| {today} | P-Reinforce Phase 1 μ •κ·œν™” (frontmatter + 헀더 ν‘œμ€€ν™”) | UPDATE | {trust} |" ) return "*(TODO)*" def normalize_file(file_path: Path) -> str: text = file_path.read_text(encoding="utf-8", errors="replace") fm_old, body, _raw = parse_frontmatter(text) fm_new = build_frontmatter(fm_old, file_path, body) # Redirect documents stay minimal β€” don't add scaffold sections. if fm_new.get("redirect_to"): return render_frontmatter(fm_new) + "\n\n" + body.lstrip("\n") # Header renames first body2 = normalize_headers(body) is_tech = detect_tech(file_path.parent.name, fm_new.get("tags", []), body2) is_stub = len(body2.strip()) < 200 body3 = append_missing_sections(body2, is_tech, is_stub, fm_new) return render_frontmatter(fm_new) + "\n\n" + body3.lstrip("\n") def iter_knowledge_files() -> list[Path]: """All .md under Topics/, excluding operational paths (sessions, _agents, _company, etc) and non-content dirs.""" EXCLUDE_FRAG = ( "/sessions/", "/_agents/", "/_company/", "/memory/", "/Project_Logs/", "/Harness_Research_", "/docs/records/", "/_Archive_Orphans/", "/Post_Drafts/", "/UX_Scenarios/", ) SKIP_DIRS = {".obsidian", ".git", "__pycache__", "node_modules"} out = [] import os as _os for dirpath, dirs, files in _os.walk(TOPICS): dirs[:] = [d for d in dirs if d not in SKIP_DIRS] for f in files: if not f.endswith(".md"): continue p = Path(dirpath) / f rel = "/" + str(p.relative_to(ROOT)).replace("\\", "/") if any(x in rel for x in EXCLUDE_FRAG): continue out.append(p) return out def main() -> int: parser = argparse.ArgumentParser() parser.add_argument("path", nargs="?", help="Relative path under E:/Wiki/2nd/ or absolute") parser.add_argument("--out", help="Write normalized output to PATH instead of stdout") parser.add_argument("--batch", help="Path to a text file with one input path per line; outputs go under _tools/sample_normalized/") parser.add_argument("--apply-all", action="store_true", help="Normalize ALL knowledge files in-place (operational paths excluded). DESTRUCTIVE.") args = parser.parse_args() if args.apply_all: files = iter_knowledge_files() ok = err = 0 errors: list[tuple[str, str]] = [] for i, p in enumerate(files, 1): try: normalized = normalize_file(p) p.write_text(normalized, encoding="utf-8") ok += 1 except Exception as e: err += 1 errors.append((str(p), str(e))) if i % 200 == 0: print(f" ...{i}/{len(files)}", file=sys.stderr) print(f"DONE: {ok} OK, {err} errors out of {len(files)} files", file=sys.stderr) if errors: log = ROOT / "_tools" / "normalize_errors.log" log.write_text("\n".join(f"{p}\t{e}" for p, e in errors), encoding="utf-8") print(f" errors written to {log}", file=sys.stderr) return 0 if err == 0 else 1 if args.batch: listing = Path(args.batch).read_text(encoding="utf-8").splitlines() out_dir = ROOT / "_tools" / "sample_normalized" out_dir.mkdir(parents=True, exist_ok=True) results = [] for line in listing: line = line.strip() if not line or line.startswith("#"): continue p = Path(line) if not p.is_absolute(): p = ROOT / p if not p.exists(): print(f"SKIP missing: {p}", file=sys.stderr) continue try: normalized = normalize_file(p) except Exception as e: print(f"FAIL {p}: {e}", file=sys.stderr) continue out_path = out_dir / (p.stem + ".normalized.md") out_path.write_text(normalized, encoding="utf-8") results.append({"src": str(p.relative_to(ROOT)).replace("\\", "/"), "out": str(out_path.relative_to(ROOT)).replace("\\", "/"), "src_chars": len(p.read_text(encoding="utf-8", errors="replace")), "out_chars": len(normalized)}) (out_dir / "_manifest.json").write_text(json.dumps(results, ensure_ascii=False, indent=1), encoding="utf-8") print(f"Wrote {len(results)} normalized samples to {out_dir}", file=sys.stderr) return 0 if not args.path: parser.error("path or --batch required") p = Path(args.path) if not p.is_absolute(): p = ROOT / p if not p.exists(): print(f"FAIL: {p} does not exist", file=sys.stderr) return 2 out = normalize_file(p) if args.out: Path(args.out).write_text(out, encoding="utf-8") print(f"Wrote {args.out}", file=sys.stderr) else: sys.stdout.write(out) return 0 if __name__ == "__main__": sys.exit(main())