# 2nd/_tools/p_reinforce_normalize.py

"""
P-Reinforce Phase 1 — Template Normalizer (DRY-RUN by default)
==============================================================
Reads a wiki .md file and emits a normalized version that conforms to the
current P-Reinforce template (templates/wiki_document.md).

What it does mechanically (NO LLM calls):
    1. Cleans frontmatter:
       - Strips [[wiki-link]] decoration from id/category/tags
       - Adds missing fields: canonical_id, aliases, status,
         source_trust_level, raw_sources, duplicate_of, tech_stack (if tech)
       - Re-derives source_trust_level from confidence_score if missing
       - Preserves the original id in `aliases` (no separate `legacy_id` field is emitted)
    2. Renames legacy section headers to current template:
       - "Brief Summary"            -> "📌 한 줄 통찰 (The Karpathy Summary)"
       - "Core Content"             -> "📖 구조화된 지식 (Synthesized Content)"
       - "Trade-offs & Caveats"     -> renamed to "⚠️ 모순 및 업데이트 (Contradictions & Updates)" (header only; content is not merged)
       - "Knowledge Connections"    -> "🔗 지식 연결 (Graph)"
    3. Adds missing required sections as scaffold (with TODO markers)
       so LLM can fill them later. Tech docs get Code Patterns / Decision
       Criteria / Anti-Patterns scaffolds.
    4. For stubs (<200 body chars), inserts a `🤖 [AI 추론 보강 필요]`
       block — does NOT generate content (that step is interactive).

Usage:
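    python p_reinforce_normalize.py <relative_path>            # dry-run, prints the normalized document to stdout
    python p_reinforce_normalize.py <relative_path> --out PATH # write to PATH
    python p_reinforce_normalize.py --batch <input_listing>    # multi-file dry-run; samples go to _tools/sample_normalized/
    python p_reinforce_normalize.py --apply-all                # DESTRUCTIVE: rewrites every knowledge file in place

Default mode is DRY-RUN. Source files are modified only when --apply-all is given.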
"""

from __future__ import annotations

import argparse
import json
import re
import sys
import unicodedata
from datetime import date
from pathlib import Path

# Root of the wiki vault. Relative CLI paths are resolved against it and
# generated artifacts are written under ROOT / "_tools".
# NOTE(review): hard-coded absolute Windows path — the script only works on a
# machine where the vault lives at this location.
ROOT = Path(r"E:/Wiki/2nd")
# Directory tree walked by iter_knowledge_files() for --apply-all.
TOPICS = ROOT / "10_Wiki" / "Topics"

# Frontmatter block at the very start of the text: a `---` line, the block
# content (group 1), then a line beginning with `---`.
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL)
# Wiki link `[[target]]` or `[[target|alias]]`; group 1 captures the target.
WIKI_LINK_RE = re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]")
# Level-1 heading (`# Title`); group 1 is the title text.
# NOTE(review): not referenced by any function visible in this file.
H1_RE = re.compile(r"^# +(.+)$", re.MULTILINE)
# Heading of level 2-6: group 1 is the run of `#`, group 2 the heading text.
HEADING_RE = re.compile(r"^(#{2,6})\s+(.+?)\s*$", re.MULTILINE)
# A ``` fence immediately followed by an optional language tag (group 1) and a
# newline. Matches opening and closing fences alike.
CODE_FENCE_RE = re.compile(r"```([a-zA-Z0-9_-]*)\n", re.MULTILINE)

# Header rename map: legacy or partial heading text -> canonical template
# heading. Lookups are made with the heading text after leading
# emoji/punctuation has been stripped and the result lower-cased, so every key
# is lowercase.
# NOTE(review): normalize_headers() and find_section_starts() both drop a
# trailing "(...)" before looking a heading up, so the keys that end in a
# parenthetical appear to be unreachable through that lookup.
HEADER_RENAMES = {
    "brief summary": "📌 한 줄 통찰 (The Karpathy Summary)",
    "the karpathy summary": "📌 한 줄 통찰 (The Karpathy Summary)",
    "한 줄 통찰": "📌 한 줄 통찰 (The Karpathy Summary)",
    "한 줄 통찰 (the karpathy summary)": "📌 한 줄 통찰 (The Karpathy Summary)",
    "core content": "📖 구조화된 지식 (Synthesized Content)",
    "synthesized content": "📖 구조화된 지식 (Synthesized Content)",
    "구조화된 지식": "📖 구조화된 지식 (Synthesized Content)",
    "구조화된 지식 (synthesized content)": "📖 구조화된 지식 (Synthesized Content)",
    "trade-offs & caveats": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "tradeoffs & caveats": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "trade-offs and caveats": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "contradictions & rl update": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "contradictions & updates": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "rl update": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "모순 및 업데이트": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "모순 및 업데이트 (contradictions & updates)": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "모순 및 업데이트 (contradictions & rl update)": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "knowledge connections": "🔗 지식 연결 (Graph)",
    "graph": "🔗 지식 연결 (Graph)",
    "지식 연결": "🔗 지식 연결 (Graph)",
    "지식 연결 (graph)": "🔗 지식 연결 (Graph)",
    "code patterns": "💻 코드 패턴 (Code Patterns)",
    "코드 패턴": "💻 코드 패턴 (Code Patterns)",
    "decision criteria": "🤔 의사결정 기준 (Decision Criteria)",
    "의사결정 기준": "🤔 의사결정 기준 (Decision Criteria)",
    "anti-patterns": "❌ 안티패턴 (Anti-Patterns)",
    "antipatterns": "❌ 안티패턴 (Anti-Patterns)",
    "안티패턴": "❌ 안티패턴 (Anti-Patterns)",
    "how to use this knowledge": "🤖 LLM 활용 힌트 (How to Use This Knowledge)",
    "llm 활용 힌트": "🤖 LLM 활용 힌트 (How to Use This Knowledge)",
    "validation": "🧪 검증 상태 (Validation)",
    "검증 상태": "🧪 검증 상태 (Validation)",
    "duplicate check": "🧬 중복 검사 (Duplicate Check)",
    "중복 검사": "🧬 중복 검사 (Duplicate Check)",
    "changelog": "🕓 변경 이력 (Changelog)",
    "변경 이력": "🕓 변경 이력 (Changelog)",
}

# Leading run of characters that are neither word characters nor Hangul
# syllables (emoji, punctuation, whitespace). Digits are word characters, so a
# leading number is NOT removed by this pattern.
EMOJI_AT_START_RE = re.compile(r"^[^\w가-힣]*", re.UNICODE)

# folder -> category mapping (best-effort; not authoritative).
# Keyed by the name of a document's immediate parent folder. build_frontmatter()
# consults it only when the existing category is missing, "Unified", starts
# with "[[" or contains "💡", and falls back to "10_Wiki/Topics" for folders
# not listed here.
FOLDER_CATEGORY_HINTS = {
    "AI": "10_Wiki/Topics",
    "AI_and_ML": "10_Wiki/Topics",
    "Architecture": "10_Wiki/Topics",
    "Backend": "10_Wiki/Topics",
    "Frontend": "10_Wiki/Topics",
    "Frontend_Mastery": "10_Wiki/Topics",
    "DevOps_and_Security": "10_Wiki/Topics",
    "Computer_Science_and_Theory": "10_Wiki/Topics",
    "Programming & Language": "10_Wiki/Topics",
    "Programming & Tools": "10_Wiki/Topics",
    "Programming & Web": "10_Wiki/Topics",
    "Programming & Formal Methods": "10_Wiki/Topics",
    "Visual_Effects": "10_Wiki/Topics_Art",
    "Graphics & Performance": "10_Wiki/Topics_Art",
    "UI_UX_Assets": "10_Wiki/Topics_Art",
    "Design & Experience": "10_Wiki/Topics_Art",
    "Game Design": "10_Wiki/Topics_GD",
    "Game_Design": "10_Wiki/Topics_GD",
    "Level_Design": "10_Wiki/Topics_GD",
    "Balancing": "10_Wiki/Topics_GD",
    "Core_Systems": "10_Wiki/Topics_GD",
    "Skybound": "10_Wiki/Topics_GD",
    "Storytelling": "10_Wiki/Topics_GD",
    "Economics": "10_Wiki/Topics_Biz",
    "Economy": "10_Wiki/Topics_Biz",
    "Economics & Algorithms": "10_Wiki/Topics_Biz",
    "Business_Strategy": "10_Wiki/Topics_Biz",
    "Market_Research": "10_Wiki/Topics_Biz",
    "Partnerships": "10_Wiki/Topics_Biz",
    "Content_Strategy": "10_Wiki/Topics_Blog",
    "Post_Drafts": "10_Wiki/Topics_Blog",
    "External_Media": "10_Wiki/Topics_Blog",
}

# Lower-case keywords consulted by detect_tech() to decide whether a document
# is technical. "go " and "java " carry a trailing space — presumably so they
# do not fire on longer words such as "google" or "javascript".
TECH_KEYWORDS = {
    "architecture", "algorithm", "programming", "code", "frontend", "backend",
    "compiler", "interpreter", "runtime", "framework", "api", "rest", "graphql",
    "kubernetes", "docker", "kafka", "fastapi", "react", "vue", "svelte", "next",
    "typescript", "python", "rust", "javascript", "go ", "java ", "c++", "swift",
    "database", "sql", "postgres", "redis", "mongo", "cache", "distributed",
}


def parse_frontmatter(text: str) -> tuple[dict, str, str]:
    """Split *text* into ``(fm_dict, body, fm_raw)`` with a forgiving line parser.

    This is not a YAML parser; it understands just the shapes the wiki uses:

    * ``key: value`` scalars (surrounding quotes are dropped),
    * flow lists ``key: [a, b]``,
    * block lists (``- item`` lines, indented or not, under an empty key),
    * one level of nested mapping (indented ``sub: value`` lines under an
      empty key — the shape ``render_frontmatter`` emits for ``tech_stack``),
    * folded continuation lines, which are joined onto the previous scalar.

    Wiki-link decoration is stripped: a value that is exactly ``[[X]]`` or
    ``[[X|alias]]`` becomes the scalar ``"X"``, and a value made only of
    comma-separated links becomes the list of their targets. The single-link
    check runs before the flow-list check because ``[[X]]`` also starts with
    ``[`` and ends with ``]``.

    NOTE(review): the original docstring said the indexer uses the same
    parser. That code is not visible here; if it is a copy, it needs the same
    fixes.

    Args:
        text: Full document text.

    Returns:
        ``(fm_dict, body, fm_raw)``. When the text has no leading frontmatter
        block the result is ``({}, text, "")``.
    """
    m = FRONTMATTER_RE.match(text)
    if not m:
        return {}, text, ""
    raw = m.group(1)
    body = text[m.end():]

    def clean(value: str) -> str:
        """Trim whitespace and quotes; unwrap a value that is one wiki link."""
        value = value.strip().strip("'\"")
        wm = WIKI_LINK_RE.fullmatch(value)
        return wm.group(1) if wm else value

    fm: dict = {}
    current_key: str | None = None
    for line in raw.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        indented = line.startswith((" ", "\t"))
        is_item = stripped == "-" or stripped.startswith("- ")
        if current_key and (indented or is_item):
            existing = fm.get(current_key)
            # Block-sequence entry: only under a key whose value was empty
            # (or is already a list), so scalars are never reinterpreted.
            if is_item and (existing == "" or isinstance(existing, list)):
                items = existing if isinstance(existing, list) else []
                item = clean(stripped[1:])
                if item:
                    items.append(item)
                fm[current_key] = items
                continue
            if indented:
                if isinstance(existing, list):
                    # Continuation of the previous list entry.
                    if existing:
                        existing[-1] = f"{existing[-1]} {stripped}"
                    else:
                        existing.append(stripped)
                elif isinstance(existing, dict) or (existing == "" and ":" in stripped):
                    # One level of nested mapping; stray text inside it has
                    # no scalar to fold into and is ignored.
                    if ":" in stripped:
                        sub_key, _, sub_val = stripped.partition(":")
                        nested = existing if isinstance(existing, dict) else {}
                        nested[sub_key.strip()] = clean(sub_val)
                        fm[current_key] = nested
                else:
                    # Folded scalar: join onto the value collected so far.
                    fm[current_key] = f"{existing} {stripped}".strip() if existing else stripped
                continue
            # An unindented "- x" after a non-empty scalar falls through and
            # is treated like any other top-level line.

        if ":" not in line:
            continue
        key, _, val = line.partition(":")
        key = key.strip()
        val = val.strip()

        links = WIKI_LINK_RE.findall(val)
        if links and not WIKI_LINK_RE.sub("", val).strip(" ,"):
            # The value consists of wiki links only.
            targets = [target.strip() for target in links]
            fm[key] = targets[0] if len(targets) == 1 else targets
        elif val.startswith("[") and val.endswith("]"):
            inner = val[1:-1].strip()
            # Split on commas that are not inside a [[...]] link.
            pieces = (clean(piece) for piece in re.split(r",(?![^\[]*\])", inner))
            fm[key] = [piece for piece in pieces if piece]
        else:
            fm[key] = clean(val)
        current_key = key
    return fm, body, raw


def detect_tech(folder: str, tags: list[str], body: str) -> bool:
    """Heuristically decide whether a document is technical.

    A document counts as technical when its body contains a code fence, or
    when any ``TECH_KEYWORDS`` entry occurs as a substring of the lower-cased
    folder name, tags, or first 2000 characters of the body.
    """
    searchable = " ".join((folder, " ".join(tags), body[:2000])).lower()
    # A code fence anywhere in the body is conclusive on its own.
    if CODE_FENCE_RE.search(body):
        return True
    for keyword in TECH_KEYWORDS:
        if keyword in searchable:
            return True
    return False


def trust_from_confidence(conf_str: str | None) -> str:
    """Map a confidence score to a source trust grade ("A", "B" or "C").

    Per-user policy (2026-05-08): LLM-augmented entries are graded A (with
    `inferred_by` metadata for traceability), not C as the default P-Reinforce
    skill suggests, so the mapping is biased upward relative to the skill
    spec. A missing or unparseable score is graded "A"; otherwise 0.80 and
    above is "A", 0.65 and above is "B", anything lower is "C".
    """
    try:
        score = float(conf_str) if conf_str else None
    except (TypeError, ValueError):
        score = None
    if score is None:
        return "A"  # user policy: trust the model

    # Ordered from the highest floor down; the first floor reached wins.
    for floor, grade in ((0.80, "A"), (0.65, "B")):
        if score >= floor:
            return grade
    return "C"


# Markers in body text that indicate a redirect even without `redirect_to`
# in the frontmatter (e.g. older P-Reinforce passes left text-only redirects).
# Group 1 of every pattern is the link target. The second pattern accepts both
# forms of the Korean particle ("으로" and "로") between the link and
# "통합되었습니다".
TEXT_REDIRECT_PATTERNS = [
    re.compile(r"\*?Redirected to:\s*\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]\*?", re.IGNORECASE),
    re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]\s*(?:으로|로)?\s*통합되었습니다"),
    re.compile(r"통합되었습니다.*?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]"),
    re.compile(r"이 문서는.*?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\].*?로 통합"),
]


def detect_text_redirect(body: str) -> str | None:
    """Find a text-only redirect notice near the top of *body*.

    Looks for phrases such as "Redirected to: [[X]]", "[[X]]로 통합되었습니다"
    or "[[X]]으로 통합되었습니다" within the first 1500 characters, trying the
    patterns in list order.

    Returns:
        The link target of the first pattern that matches, stripped of
        surrounding whitespace, or None when no pattern matches.
    """
    body_top = body[:1500]  # only check the top of the doc
    for pat in TEXT_REDIRECT_PATTERNS:
        m = pat.search(body_top)
        if m:
            return m.group(1).strip()
    return None


def slugify_id(filename: str, today: str | None = None) -> str:
    """Build a ``wiki-YYYY-MMDD-<slug>`` id from a file name.

    Args:
        filename: File stem to slugify. It is NFKC-normalized and lower-cased;
            every run of characters other than a-z, 0-9 and Hangul syllables
            becomes a single hyphen.
        today: Date stamp as ``YYYYMMDD`` or ISO ``YYYY-MM-DD``; non-digits
            are ignored. Defaults to the current date.

    Returns:
        The id string. The slug part is cut to 32 characters, never ends in a
        hyphen, and is ``"doc"`` when nothing usable remains.
    """
    # Keep digits only so an ISO date works as well as the compact form.
    stamp = re.sub(r"\D", "", today) if today else date.today().strftime("%Y%m%d")
    slug = unicodedata.normalize("NFKC", filename).lower()
    slug = re.sub(r"[^a-z0-9가-힣]+", "-", slug).strip("-")
    # The cut can land right after a separator; do not leave it dangling.
    slug = slug[:32].rstrip("-") or "doc"
    return f"wiki-{stamp[:4]}-{stamp[4:8]}-{slug}"


def quote_yaml(s: str) -> str:
    """Render *s* as a YAML scalar, quoting it only when necessary.

    Args:
        s: Value to render. Non-strings are converted with ``str()``;
            ``None`` and the empty string render as ``""``.

    Returns:
        The plain text when it has no YAML-significant character. Otherwise a
        quoted scalar:

        * single-quoted when the text contains a backslash (e.g. a Windows
          path) and no single quote — backslashes are literal in that style,
          so the value also survives this file's quote-stripping parser;
        * double-quoted in all other cases, with backslash, double quote and
          newline escaped.
    """
    if s is None:
        return '""'
    s = str(s)
    if not s:
        # A bare empty value would be read back as null.
        return '""'
    special = (':', '#', '[', ']', '{', '}', ',', '&', '*', '!', '|', '>', "'", '"', '%', '@', '`')
    if not (any(ch in s for ch in special) or "\n" in s):
        return s
    if "\\" in s and "'" not in s and "\n" not in s:
        return "'" + s + "'"
    # Escape the backslash first so the escapes added after it stay intact.
    escaped = s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
    return '"' + escaped + '"'


def build_frontmatter(fm: dict, file_path: Path, body: str) -> dict:
    """Compute the normalized frontmatter from the old frontmatter plus path heuristics.

    Args:
        fm: Frontmatter mapping as returned by ``parse_frontmatter``.
        file_path: Path of the document. Its stem feeds the title and a newly
            generated id; its parent folder name feeds the category hint and
            tech detection.
        body: Document text after the frontmatter block.

    Returns:
        A new dict holding only the fields assigned below. ``fm`` itself is
        not mutated, and keys of ``fm`` that are not handled here are not
        copied into the result.
    """
    folder = file_path.parent.name
    filename = file_path.stem
    new_fm: dict = {}

    # Detect text-only redirect (no redirect_to field but body says so)
    text_redirect = detect_text_redirect(body) if not fm.get("redirect_to") else None

    # ID — preserve legacy id if it already looks like our slug
    # (starts with "wiki-" and contains no wiki link); otherwise derive a
    # fresh one from the file name, stamped with today's date.
    legacy_id = fm.get("id")
    legacy_id_str = str(legacy_id) if legacy_id else ""
    if legacy_id_str.startswith("wiki-") and not WIKI_LINK_RE.search(legacy_id_str):
        new_id = legacy_id_str
    else:
        new_id = slugify_id(filename)
    new_fm["id"] = new_id

    # title — always derived from the file name.
    # NOTE(review): an existing `title` in `fm` is not consulted, so it is
    # replaced on normalization — confirm this is intended.
    title = filename.replace("-", " ").replace("_", " ").strip()
    new_fm["title"] = title

    # category — strip emoji-decorated old categories, normalize wiki-link form
    cat = fm.get("category")
    if cat in (None, "", "Unified") or (isinstance(cat, str) and cat.startswith("[[")):
        cat = FOLDER_CATEGORY_HINTS.get(folder, "10_Wiki/Topics")
    elif isinstance(cat, str) and "💡" in cat:
        # "10_Wiki/💡 Topics/AI" -> hinted category
        cat = FOLDER_CATEGORY_HINTS.get(folder, "10_Wiki/Topics")
    new_fm["category"] = cat

    # If this file is a redirect, override status and add redirect_to.
    if fm.get("redirect_to") or text_redirect:
        target = fm.get("redirect_to") or text_redirect
        new_fm["redirect_to"] = target
        new_fm["status"] = "merged"
        new_fm["canonical_id"] = fm.get("canonical_id") or target  # best-effort
    else:
        # status: draft for stubs, needs_review otherwise; verified only if pre-tagged
        existing_status = fm.get("status")
        if existing_status in ("verified", "merged", "deprecated"):
            new_fm["status"] = existing_status
        else:
            # Same 200-character stub threshold that normalize_file() uses.
            new_fm["status"] = "draft" if len(body.strip()) < 200 else "needs_review"
        new_fm["canonical_id"] = fm.get("canonical_id") or "self"

    # aliases — preserve legacy id and any prior aliases
    # (the legacy id goes first and is added even when it is kept as the id).
    aliases = fm.get("aliases") or []
    if isinstance(aliases, str):
        aliases = [aliases]
    if legacy_id and isinstance(legacy_id, str):
        legacy_id_clean = WIKI_LINK_RE.sub(r"\1", legacy_id)
        if legacy_id_clean and legacy_id_clean not in aliases:
            aliases = [legacy_id_clean] + aliases
    new_fm["aliases"] = aliases

    # duplicate_of
    new_fm["duplicate_of"] = fm.get("duplicate_of") or "none"

    # source_trust_level — keep an explicit value, else derive it from the
    # confidence score.
    new_fm["source_trust_level"] = fm.get("source_trust_level") or trust_from_confidence(fm.get("confidence_score"))

    # confidence_score — preserve if present, else policy default 0.92
    # (an unparseable value also falls back to 0.92).
    cs = fm.get("confidence_score")
    if cs:
        try:
            new_fm["confidence_score"] = float(cs)
        except (TypeError, ValueError):
            new_fm["confidence_score"] = 0.92
    else:
        new_fm["confidence_score"] = 0.92

    # tags — strip wiki link decoration
    # (plus stray brackets, quotes and spaces); never left empty.
    tags = fm.get("tags") or []
    if isinstance(tags, str):
        tags = [tags]
    cleaned_tags = []
    for t in tags:
        if isinstance(t, str):
            t = WIKI_LINK_RE.sub(r"\1", t).strip("[]'\" ")
        if t:
            cleaned_tags.append(t)
    new_fm["tags"] = cleaned_tags or ["uncategorized"]

    # raw_sources — a single string is wrapped into a one-item list.
    rs = fm.get("raw_sources") or []
    if isinstance(rs, str):
        rs = [rs]
    new_fm["raw_sources"] = rs

    # last_reinforced — an existing date is kept; today's date is used only
    # when the field is missing.
    new_fm["last_reinforced"] = fm.get("last_reinforced") or date.today().isoformat()

    # github_commit
    new_fm["github_commit"] = fm.get("github_commit") or "pending"

    # inferred_by — traceability for LLM-augmented entries (per user policy)
    if fm.get("inferred_by"):
        new_fm["inferred_by"] = fm["inferred_by"]
    elif new_fm["source_trust_level"] == "A" and not fm.get("source_trust_level"):
        # we just promoted this to A; record provenance
        # NOTE(review): the model name and date in this stamp are hard-coded.
        new_fm["inferred_by"] = "Claude Opus 4.7 (auto-normalize 2026-05-08)"

    # tech_stack — only if detected
    # An existing value is kept only when it is a dict; anything else is
    # replaced by the "unspecified" placeholder.
    if detect_tech(folder, cleaned_tags, body):
        ts_old = fm.get("tech_stack")
        if isinstance(ts_old, dict):
            new_fm["tech_stack"] = ts_old
        else:
            new_fm["tech_stack"] = {"language": "unspecified", "framework": "unspecified"}

    return new_fm


def render_frontmatter(fm: dict) -> str:
    """Serialize *fm* as a ``---`` delimited frontmatter block.

    Fields are written in a fixed template order; keys outside that order are
    not written. Lists become flow sequences, dicts become one level of
    indented ``key: value`` lines, floats are written bare, and everything
    else goes through ``quote_yaml``. The result has no trailing newline.
    """
    field_order = (
        "id", "title", "category", "status", "redirect_to", "canonical_id", "aliases",
        "duplicate_of", "source_trust_level", "confidence_score",
        "tags", "raw_sources", "last_reinforced", "github_commit",
        "inferred_by", "tech_stack",
    )
    rendered = ["---"]
    for name in field_order:
        if name not in fm:
            continue
        value = fm[name]
        if isinstance(value, list):
            # An empty list joins to "" and therefore renders as "[]".
            inner = ", ".join(quote_yaml(item) for item in value)
            rendered.append(f"{name}: [{inner}]")
        elif isinstance(value, dict):
            rendered.append(f"{name}:")
            rendered.extend(f"  {sub}: {quote_yaml(val)}" for sub, val in value.items())
        elif isinstance(value, float):
            rendered.append(f"{name}: {value}")
        else:
            rendered.append(f"{name}: {quote_yaml(value)}")
    rendered.append("---")
    return "\n".join(rendered)


def normalize_headers(body: str) -> str:
    """Rewrite legacy section headers to match the template names.

    Every level 2-6 heading outside fenced code blocks is looked up in
    ``HEADER_RENAMES`` after stripping leading emoji/punctuation, leading
    numbering ("1. ", "2) ", "3.1 ") and a trailing parenthetical. A heading
    that matches is replaced by its canonical name at the same level; all
    other lines — including blank lines after a renamed heading and anything
    inside ``` or ~~~ fences — are returned unchanged.
    """
    def canonical_for(heading_text: str) -> str | None:
        """Return the canonical heading for *heading_text*, or None."""
        norm = EMOJI_AT_START_RE.sub("", heading_text).strip()
        # The emoji pattern stops at digits, so numbering is removed here.
        norm = re.sub(r"^\d+(?:\.\d+)*[.)]?\s+", "", norm)
        norm = re.sub(r"\s*\(.*?\)\s*$", "", norm).strip()  # drop trailing paren
        key = norm.lower()
        if key not in HEADER_RENAMES:
            # also try without punctuation
            key = re.sub(r"[^\w가-힣 -]", "", key).strip()
        return HEADER_RENAMES.get(key)

    out_lines = []
    open_fence = ""  # "```" or "~~~" while inside a fenced block
    # Work line by line: HEADING_RE's trailing `\s*` would otherwise be free
    # to swallow the newline that follows a heading.
    for raw_line in body.split("\n"):
        line = raw_line[:-1] if raw_line.endswith("\r") else raw_line
        eol = raw_line[len(line):]
        marker = line.lstrip()[:3]
        if marker in ("```", "~~~"):
            if not open_fence:
                open_fence = marker
            elif marker == open_fence:
                open_fence = ""
            out_lines.append(raw_line)
            continue
        if not open_fence:
            m = HEADING_RE.match(line)
            if m:
                target = canonical_for(m.group(2).strip())
                if target:
                    out_lines.append(f"{m.group(1)} {target}{eol}")
                    continue
        out_lines.append(raw_line)
    return "\n".join(out_lines)


def find_section_starts(body: str) -> dict[str, int]:
    """Locate the template sections already present in *body*.

    A level 2-6 heading counts when its text — with leading emoji/punctuation
    and a trailing parenthetical removed, lower-cased — is a key of
    ``HEADER_RENAMES``, or when the text equals a canonical heading exactly.
    Headings inside ``` or ~~~ fenced code blocks are ignored.

    Returns:
        ``{canonical_heading: offset}`` where *offset* is the character
        offset in *body* of the line holding the first such heading.
    """
    canonical_names = set(HEADER_RENAMES.values())
    found: dict[str, int] = {}
    offset = 0
    open_fence = ""  # "```" or "~~~" while inside a fenced block
    for raw_line in body.split("\n"):
        line_start = offset
        offset += len(raw_line) + 1  # +1 for the "\n" removed by split
        marker = raw_line.lstrip()[:3]
        if marker in ("```", "~~~"):
            if not open_fence:
                open_fence = marker
            elif marker == open_fence:
                open_fence = ""
            continue
        if open_fence:
            continue
        m = HEADING_RE.match(raw_line)
        if m is None:
            continue
        text = m.group(2).strip()
        norm = EMOJI_AT_START_RE.sub("", text).strip().lower()
        norm = re.sub(r"\s*\(.*?\)\s*$", "", norm).strip()
        by_key = HEADER_RENAMES.get(norm)
        if by_key is not None:
            found.setdefault(by_key, line_start)
        if text in canonical_names:
            found.setdefault(text, line_start)
    return found


# Canonical headings every normalized document must contain.
# append_missing_sections() adds a scaffold for each one that is absent, in
# this order.
REQUIRED_SECTIONS_BASE = [
    "📌 한 줄 통찰 (The Karpathy Summary)",
    "📖 구조화된 지식 (Synthesized Content)",
    "🤖 LLM 활용 힌트 (How to Use This Knowledge)",
    "🧪 검증 상태 (Validation)",
    "🧬 중복 검사 (Duplicate Check)",
    "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "🔗 지식 연결 (Graph)",
    "🕓 변경 이력 (Changelog)",
]
# Additional headings required only for documents detected as technical.
REQUIRED_SECTIONS_TECH_EXTRA = [
    "💻 코드 패턴 (Code Patterns)",
    "🤔 의사결정 기준 (Decision Criteria)",
    "❌ 안티패턴 (Anti-Patterns)",
]


def append_missing_sections(body: str, is_tech: bool, is_stub: bool, fm: dict) -> str:
    """Append a TODO scaffold for every required section missing from *body*.

    Args:
        body: Document body, normally after header normalization.
        is_tech: Also require the tech-only sections.
        is_stub: Insert the "AI 추론 보강 필요" notice before the scaffolds.
        fm: Normalized frontmatter; supplies the values quoted in the stub
            notice and in the scaffolds.

    Returns:
        *body* unchanged when nothing is missing and it is not a stub.
        Otherwise the right-stripped body followed by the notice and/or the
        scaffolds, ending in exactly one newline.
    """
    found = find_section_starts(body)
    required = REQUIRED_SECTIONS_BASE + (REQUIRED_SECTIONS_TECH_EXTRA if is_tech else [])
    missing = [s for s in required if s not in found]

    if not missing and not is_stub:
        return body

    out = [body.rstrip()]
    if is_stub:
        # Quote the trust level actually written to the frontmatter, so the
        # notice cannot disagree with it or with the validation scaffold.
        trust = fm.get("source_trust_level", "C")
        out.append("\n\n> 🤖 **[AI 추론 보강 필요]** — 본문이 200자 미만이라 P-Reinforce가 빈약 stub으로 분류했습니다.")
        out.append(f"> source_trust_level=`{trust}` (AI 보강분), confidence_score=`{fm.get('confidence_score', 0.7)}`로 표시되어 있습니다.")
        out.append("> 사용자 검증 후 trust_level 상향 조정 가능.\n")
    for s in missing:
        out.append(f"\n## {s}\n")
        out.append(_scaffold_for(s, fm))
    return "\n".join(out).rstrip("\n") + "\n"


def _scaffold_for(section: str, fm: dict) -> str:
    if section.startswith("📌"):
        return "> *(TODO: 한 문장으로 핵심 통찰을 작성. \"X는 Y 조건에서 Z 효과를 낸다\" 구조 권장.)*"
    if section.startswith("📖"):
        return "**추출된 패턴:**\n> *(TODO)*\n\n**세부 내용:**\n- *(TODO)*"
    if section.startswith("💻"):
        return "**패턴 1:** *(TODO: 이 프로젝트 컨벤션 반영한 구조 스켈레톤)*\n\n```text\n# TODO\n```"
    if section.startswith("🤔"):
        return "**선택 A를 써야 할 때:**\n- *(TODO)*\n\n**선택 B를 써야 할 때:**\n- *(TODO)*\n\n**기본값:**\n> *(TODO)*"
    if section.startswith("❌"):
        return "- **[안티패턴]:** *(TODO: 무엇을 하면 안 되는가 + 이유 + 대신 무엇을)*"
    if section.startswith("🤖") and "활용 힌트" in section:
        return "**언제 이 지식을 쓰는가:**\n- *(TODO)*\n\n**언제 쓰면 안 되는가:**\n- *(TODO)*"
    if section.startswith("🧪"):
        return (
            f"- **정보 상태:** {fm.get('status', 'draft')}\n"
            f"- **출처 신뢰도:** {fm.get('source_trust_level', 'C')}\n"
            f"- **검토 이유:** *(P-Reinforce Phase 1 자동 정규화. 본문 검증 필요.)*"
        )
    if section.startswith("🧬"):
        return (
            "- **기존 유사 문서:** *(TODO: 인덱서 클러스터 리포트 참조)*\n"
            "- **처리 방식:** UPDATE (자동 정규화)\n"
            "- **처리 이유:** Phase 1 정규화 — 옛 템플릿/누락 필드 보강."
        )
    if section.startswith("⚠️"):
        return "- **과거 데이터와의 충돌:** 없음\n- **정책 변화:** 없음"
    if section.startswith("🔗"):
        return (
            "- **Parent:** [[10_Wiki/Topics]]\n"
            "- **Related:** *(TODO: 최소 2개)*\n"
            "- **Opposite / Trade-off:** *(TODO)*\n"
            "- **Raw Source:** 직접 입력"
        )
    if section.startswith("🕓"):
        today = date.today().isoformat()
        trust = fm.get("source_trust_level", "C")
        return (
            "| 날짜 | 변경 내용 | 처리 방식 | 신뢰도 |\n"
            "|------|-----------|-----------|--------|\n"
            f"| {today} | P-Reinforce Phase 1 정규화 (frontmatter + 헤더 표준화) | UPDATE | {trust} |"
        )
    return "*(TODO)*"


def normalize_file(file_path: Path) -> str:
    """Read one wiki document and return its normalized text.

    The file itself is never written here. Redirect documents get the new
    frontmatter only; all other documents also get their headers renamed and
    any missing required sections scaffolded.

    Args:
        file_path: Document to read.

    Returns:
        New frontmatter, a blank line, then the (possibly extended) body.
    """
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM. With a BOM
    # left in place the frontmatter pattern cannot match at position 0, and a
    # second frontmatter block would be stacked on top of the existing one.
    text = file_path.read_text(encoding="utf-8-sig", errors="replace")
    fm_old, body, _raw = parse_frontmatter(text)
    fm_new = build_frontmatter(fm_old, file_path, body)
    header = render_frontmatter(fm_new)

    # Redirect documents stay minimal — don't add scaffold sections.
    if fm_new.get("redirect_to"):
        return header + "\n\n" + body.lstrip("\n")

    # Header renames first, so existing sections are recognised before the
    # missing ones are scaffolded.
    body2 = normalize_headers(body)
    is_tech = detect_tech(file_path.parent.name, fm_new.get("tags", []), body2)
    is_stub = len(body2.strip()) < 200
    body3 = append_missing_sections(body2, is_tech, is_stub, fm_new)

    return header + "\n\n" + body3.lstrip("\n")


def iter_knowledge_files() -> list[Path]:
    """Collect every ``.md`` file under ``TOPICS`` that holds knowledge content.

    Tool and VCS directories are pruned from the walk, and files whose path
    relative to ``ROOT`` contains an operational fragment (sessions, agents,
    company notes, logs, drafts, ...) are left out.
    """
    import os as _os

    excluded_fragments = (
        "/sessions/", "/_agents/", "/_company/", "/memory/",
        "/Project_Logs/", "/Harness_Research_", "/docs/records/",
        "/_Archive_Orphans/", "/Post_Drafts/", "/UX_Scenarios/",
    )
    pruned_dirs = {".obsidian", ".git", "__pycache__", "node_modules"}

    def is_operational(md_path: Path) -> bool:
        # The leading "/" lets a fragment match the first path component too.
        rel = "/" + str(md_path.relative_to(ROOT)).replace("\\", "/")
        return any(fragment in rel for fragment in excluded_fragments)

    collected = []
    for current_dir, subdirs, filenames in _os.walk(TOPICS):
        # Assign in place so os.walk does not descend into pruned directories.
        subdirs[:] = [name for name in subdirs if name not in pruned_dirs]
        candidates = (Path(current_dir) / name for name in filenames if name.endswith(".md"))
        collected.extend(path for path in candidates if not is_operational(path))
    return collected


def _resolve_input(arg: str) -> Path:
    """Interpret a CLI path: absolute paths as given, others relative to ROOT."""
    p = Path(arg)
    return p if p.is_absolute() else ROOT / p


def _display_path(p: Path) -> str:
    """Return *p* relative to ROOT with forward slashes, or whole if outside ROOT."""
    try:
        shown = p.relative_to(ROOT)
    except ValueError:
        shown = p
    return str(shown).replace("\\", "/")


def _run_apply_all() -> int:
    """Normalize every knowledge file in place; return 0 only if none failed."""
    files = iter_knowledge_files()
    ok = err = 0
    errors: list[tuple[str, str]] = []
    for i, p in enumerate(files, 1):
        try:
            # normalize_file() decodes leniently, which is fine for previews.
            # Writing that text back would permanently replace every
            # undecodable byte with U+FFFD, so a file that is not valid UTF-8
            # is reported as an error and left untouched.
            p.read_bytes().decode("utf-8")
            normalized = normalize_file(p)
            p.write_text(normalized, encoding="utf-8")
            ok += 1
        except Exception as e:  # per-file boundary: record it and carry on
            err += 1
            errors.append((str(p), str(e)))
        if i % 200 == 0:
            print(f"  ...{i}/{len(files)}", file=sys.stderr)
    print(f"DONE: {ok} OK, {err} errors out of {len(files)} files", file=sys.stderr)
    if errors:
        log = ROOT / "_tools" / "normalize_errors.log"
        log.parent.mkdir(parents=True, exist_ok=True)
        log.write_text("\n".join(f"{p}\t{e}" for p, e in errors), encoding="utf-8")
        print(f"  errors written to {log}", file=sys.stderr)
    return 0 if err == 0 else 1


def _run_batch(listing_path: str) -> int:
    """Write normalized samples and a manifest for the files in a listing."""
    # utf-8-sig: a BOM at the top of the listing must not become part of the
    # first path.
    listing = Path(listing_path).read_text(encoding="utf-8-sig").splitlines()
    out_dir = ROOT / "_tools" / "sample_normalized"
    out_dir.mkdir(parents=True, exist_ok=True)
    results = []
    used_names: set[str] = set()
    for line in listing:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        p = _resolve_input(line)
        if not p.exists():
            print(f"SKIP missing: {p}", file=sys.stderr)
            continue
        try:
            normalized = normalize_file(p)
        except Exception as e:  # keep going; one bad file must not end the batch
            print(f"FAIL {p}: {e}", file=sys.stderr)
            continue
        # Files from different folders can share a stem; number the later ones
        # instead of overwriting the earlier sample. Compared case-insensitively
        # because the target file system may be.
        out_name = p.stem + ".normalized.md"
        suffix = 2
        while out_name.lower() in used_names:
            out_name = f"{p.stem}__{suffix}.normalized.md"
            suffix += 1
        used_names.add(out_name.lower())
        out_path = out_dir / out_name
        out_path.write_text(normalized, encoding="utf-8")
        results.append({"src": _display_path(p),
                        "out": _display_path(out_path),
                        "src_chars": len(p.read_text(encoding="utf-8", errors="replace")),
                        "out_chars": len(normalized)})
    (out_dir / "_manifest.json").write_text(json.dumps(results, ensure_ascii=False, indent=1), encoding="utf-8")
    print(f"Wrote {len(results)} normalized samples to {out_dir}", file=sys.stderr)
    return 0


def _run_single(path_arg: str, out_arg: str | None) -> int:
    """Normalize one file to --out or stdout; return 2 if it does not exist."""
    p = _resolve_input(path_arg)
    if not p.exists():
        print(f"FAIL: {p} does not exist", file=sys.stderr)
        return 2
    out = normalize_file(p)
    if out_arg:
        Path(out_arg).write_text(out, encoding="utf-8")
        print(f"Wrote {out_arg}", file=sys.stderr)
    else:
        # The output carries emoji and Hangul. A redirected stdout using a
        # legacy locale encoding would raise UnicodeEncodeError, so switch the
        # stream to UTF-8 where the stream supports it.
        reconfigure = getattr(sys.stdout, "reconfigure", None)
        if callable(reconfigure):
            reconfigure(encoding="utf-8")
        sys.stdout.write(out)
    return 0


def main() -> int:
    """Command-line entry point.

    Modes, in order of precedence: ``--apply-all`` (rewrite every knowledge
    file in place), ``--batch LISTING`` (write samples under
    ``_tools/sample_normalized/``), or a single ``path`` (print to stdout, or
    write to ``--out``).

    Returns:
        Process exit status: 0 on success, 1 when ``--apply-all`` hit errors,
        2 when the single input file does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("path", nargs="?", help="Relative path under E:/Wiki/2nd/ or absolute")
    parser.add_argument("--out", help="Write normalized output to PATH instead of stdout")
    parser.add_argument("--batch", help="Path to a text file with one input path per line; outputs go under _tools/sample_normalized/")
    parser.add_argument("--apply-all", action="store_true", help="Normalize ALL knowledge files in-place (operational paths excluded). DESTRUCTIVE.")
    args = parser.parse_args()

    if args.apply_all:
        return _run_apply_all()
    if args.batch:
        return _run_batch(args.batch)
    if not args.path:
        parser.error("path or --batch required")
    return _run_single(args.path, args.out)


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())