Files
2nd/_tools/p_reinforce_normalize.py

632 lines
26 KiB
Python

"""
P-Reinforce Phase 1 — Template Normalizer (DRY-RUN by default)
==============================================================
Reads a wiki .md file and emits a normalized version that conforms to the
current P-Reinforce template (templates/wiki_document.md).
What it does mechanically (NO LLM calls):
1. Cleans frontmatter:
- Strips [[wiki-link]] decoration from id/category/tags
- Adds missing fields: canonical_id, aliases, status,
source_trust_level, raw_sources, duplicate_of, tech_stack (if tech)
- Re-derives source_trust_level from confidence_score if missing
- Preserves the original id (wiki-link decoration stripped) as an entry in `aliases`
2. Renames legacy section headers to current template:
- "Brief Summary" -> "📌 한 줄 통찰 (The Karpathy Summary)"
- "Core Content" -> "📖 구조화된 지식 (Synthesized Content)"
- "Trade-offs & Caveats" -> merged into "⚠️ 모순 및 업데이트"
- "Knowledge Connections" -> "🔗 지식 연결 (Graph)"
3. Adds missing required sections as scaffold (with TODO markers)
so LLM can fill them later. Tech docs get Code Patterns / Decision
Criteria / Anti-Patterns scaffolds.
4. For stubs (<200 body chars), inserts a `🤖 [AI 추론 보강 필요]`
block — does NOT generate content (that step is interactive).
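Usage:
python p_reinforce_normalize.py <relative_path> # dry-run, prints the normalized document to stdout
python p_reinforce_normalize.py <relative_path> --out PATH # write to PATH
python p_reinforce_normalize.py --batch <input_listing> # multi-file dry-run, samples go to _tools/sample_normalized/
python p_reinforce_normalize.py --apply-all # DESTRUCTIVE: rewrites every knowledge file in place
Default mode is DRY-RUN. Source files are modified only when --apply-all is given.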
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import unicodedata
from datetime import date
from pathlib import Path
# Root of the wiki vault; relative CLI paths are resolved against it.
ROOT = Path(r"E:/Wiki/2nd")
# Directory tree walked by iter_knowledge_files().
TOPICS = ROOT / "10_Wiki" / "Topics"
# Leading `---` ... `---` frontmatter block; group 1 is the text between the fences.
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL)
# [[target]] or [[target|alias]]; group 1 is the target without the alias.
WIKI_LINK_RE = re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]")
# Level-1 Markdown heading; group 1 is the heading text.
H1_RE = re.compile(r"^# +(.+)$", re.MULTILINE)
# Level 2-6 Markdown heading; group 1 is the run of '#', group 2 the heading text.
HEADING_RE = re.compile(r"^(#{2,6})\s+(.+?)\s*$", re.MULTILINE)
# Triple-backtick fence followed by a newline; group 1 is the (possibly empty) language tag.
CODE_FENCE_RE = re.compile(r"```([a-zA-Z0-9_-]*)\n", re.MULTILINE)
# Header rename map: matched against the heading text only (after stripping
# leading emoji/punctuation). Keys are normalized lowercase; values are the
# canonical template headings (emoji + Korean title + English gloss).
# NOTE(review): normalize_headers() and find_section_starts() both drop a
# trailing "(...)" from the heading before looking it up, so the keys below
# that end in a parenthetical look unreachable through those lookups — confirm
# before relying on them.
HEADER_RENAMES = {
"brief summary": "📌 한 줄 통찰 (The Karpathy Summary)",
"the karpathy summary": "📌 한 줄 통찰 (The Karpathy Summary)",
"한 줄 통찰": "📌 한 줄 통찰 (The Karpathy Summary)",
"한 줄 통찰 (the karpathy summary)": "📌 한 줄 통찰 (The Karpathy Summary)",
"core content": "📖 구조화된 지식 (Synthesized Content)",
"synthesized content": "📖 구조화된 지식 (Synthesized Content)",
"구조화된 지식": "📖 구조화된 지식 (Synthesized Content)",
"구조화된 지식 (synthesized content)": "📖 구조화된 지식 (Synthesized Content)",
"trade-offs & caveats": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"tradeoffs & caveats": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"trade-offs and caveats": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"contradictions & rl update": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"contradictions & updates": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"rl update": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"모순 및 업데이트": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"모순 및 업데이트 (contradictions & updates)": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"모순 및 업데이트 (contradictions & rl update)": "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"knowledge connections": "🔗 지식 연결 (Graph)",
"graph": "🔗 지식 연결 (Graph)",
"지식 연결": "🔗 지식 연결 (Graph)",
"지식 연결 (graph)": "🔗 지식 연결 (Graph)",
"code patterns": "💻 코드 패턴 (Code Patterns)",
"코드 패턴": "💻 코드 패턴 (Code Patterns)",
"decision criteria": "🤔 의사결정 기준 (Decision Criteria)",
"의사결정 기준": "🤔 의사결정 기준 (Decision Criteria)",
"anti-patterns": "❌ 안티패턴 (Anti-Patterns)",
"antipatterns": "❌ 안티패턴 (Anti-Patterns)",
"안티패턴": "❌ 안티패턴 (Anti-Patterns)",
"how to use this knowledge": "🤖 LLM 활용 힌트 (How to Use This Knowledge)",
"llm 활용 힌트": "🤖 LLM 활용 힌트 (How to Use This Knowledge)",
"validation": "🧪 검증 상태 (Validation)",
"검증 상태": "🧪 검증 상태 (Validation)",
"duplicate check": "🧬 중복 검사 (Duplicate Check)",
"중복 검사": "🧬 중복 검사 (Duplicate Check)",
"changelog": "🕓 변경 이력 (Changelog)",
"변경 이력": "🕓 변경 이력 (Changelog)",
}
# Matches the leading run of characters that are neither word characters nor
# Hangul syllables (emoji, punctuation, spaces) so it can be stripped from a
# heading before the rename-map lookup.
EMOJI_AT_START_RE = re.compile(r"^[^\w가-힣]*", re.UNICODE)
# folder -> category mapping (best-effort; not authoritative)
# Keys are the name of a document's immediate parent folder; build_frontmatter()
# consults this map only when the existing category is missing, "Unified",
# wiki-link decorated, or contains the 💡 emoji, and falls back to
# "10_Wiki/Topics" for folders not listed here.
FOLDER_CATEGORY_HINTS = {
"AI": "10_Wiki/Topics",
"AI_and_ML": "10_Wiki/Topics",
"Architecture": "10_Wiki/Topics",
"Backend": "10_Wiki/Topics",
"Frontend": "10_Wiki/Topics",
"Frontend_Mastery": "10_Wiki/Topics",
"DevOps_and_Security": "10_Wiki/Topics",
"Computer_Science_and_Theory": "10_Wiki/Topics",
"Programming & Language": "10_Wiki/Topics",
"Programming & Tools": "10_Wiki/Topics",
"Programming & Web": "10_Wiki/Topics",
"Programming & Formal Methods": "10_Wiki/Topics",
"Visual_Effects": "10_Wiki/Topics_Art",
"Graphics & Performance": "10_Wiki/Topics_Art",
"UI_UX_Assets": "10_Wiki/Topics_Art",
"Design & Experience": "10_Wiki/Topics_Art",
"Game Design": "10_Wiki/Topics_GD",
"Game_Design": "10_Wiki/Topics_GD",
"Level_Design": "10_Wiki/Topics_GD",
"Balancing": "10_Wiki/Topics_GD",
"Core_Systems": "10_Wiki/Topics_GD",
"Skybound": "10_Wiki/Topics_GD",
"Storytelling": "10_Wiki/Topics_GD",
"Economics": "10_Wiki/Topics_Biz",
"Economy": "10_Wiki/Topics_Biz",
"Economics & Algorithms": "10_Wiki/Topics_Biz",
"Business_Strategy": "10_Wiki/Topics_Biz",
"Market_Research": "10_Wiki/Topics_Biz",
"Partnerships": "10_Wiki/Topics_Biz",
"Content_Strategy": "10_Wiki/Topics_Blog",
"Post_Drafts": "10_Wiki/Topics_Blog",
"External_Media": "10_Wiki/Topics_Blog",
}
# Lower-case substrings that mark a document as technical. detect_tech()
# searches for them in "folder + tags + first 2000 body characters" using plain
# substring containment, so short entries such as "rest" or "api" also match
# inside longer words. "go " and "java " carry a trailing space — presumably to
# avoid matching words like "google" or "javascript"; confirm before editing.
TECH_KEYWORDS = {
"architecture", "algorithm", "programming", "code", "frontend", "backend",
"compiler", "interpreter", "runtime", "framework", "api", "rest", "graphql",
"kubernetes", "docker", "kafka", "fastapi", "react", "vue", "svelte", "next",
"typescript", "python", "rust", "javascript", "go ", "java ", "c++", "swift",
"database", "sql", "postgres", "redis", "mongo", "cache", "distributed",
}
def _unwrap_fm_scalar(value: str) -> str:
    """Strip surrounding quotes and reduce a lone ``[[target|alias]]`` to ``target``."""
    value = value.strip().strip("'\"")
    link = WIKI_LINK_RE.fullmatch(value)
    return link.group(1) if link else value


def parse_frontmatter(text: str) -> tuple[dict, str, str]:
    """Split a document into frontmatter and body with a forgiving, YAML-like parser.

    Args:
        text: Full document text.

    Returns:
        ``(fm_dict, body, fm_raw)``. When the text does not start with a
        ``---`` fenced block the result is ``({}, text, "")``.

    Supported value shapes:
        * ``key: scalar`` — quotes stripped; a lone ``[[X]]`` / ``[[X|alias]]``
          becomes ``X`` (also when unquoted).
        * ``key: [a, b]`` — inline list; wiki-link items are unwrapped.
        * ``key:`` followed by ``- item`` lines — block list.
        * ``key:`` followed by indented ``sub: value`` lines — nested dict
          (this is the shape ``render_frontmatter`` writes for ``tech_stack``).
        * any other indented line — appended to the current scalar with a space.
    """
    m = FRONTMATTER_RE.match(text)
    if not m:
        return {}, text, ""
    raw = m.group(1)
    body = text[m.end():]
    fm: dict = {}
    current_key: str | None = None
    for line in raw.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        indented = line.startswith((" ", "\t"))
        if current_key:
            existing = fm.get(current_key)
            is_item = stripped == "-" or stripped.startswith("- ")
            # Block sequence entry under a key that has no scalar value yet.
            if is_item and (existing == "" or isinstance(existing, list)):
                items = existing if isinstance(existing, list) else []
                item = _unwrap_fm_scalar(stripped[1:])
                if item:
                    items.append(item)
                fm[current_key] = items
                continue
            if indented:
                if isinstance(existing, list):
                    # Stray continuation under a list: ignore instead of
                    # turning the whole list into a string.
                    continue
                sub_key, sep, sub_val = stripped.partition(":")
                # Require "key:" or "key: value" so that plain scalars that
                # merely contain a colon (e.g. URLs) are not read as mappings.
                is_pair = bool(sep) and not is_item and (not sub_val or sub_val[0] in " \t")
                if isinstance(existing, dict) or (existing == "" and is_pair):
                    if is_pair:
                        nested = existing if isinstance(existing, dict) else {}
                        nested[sub_key.strip()] = _unwrap_fm_scalar(sub_val)
                        fm[current_key] = nested
                    continue
                # Folded continuation of a scalar value.
                fm[current_key] = (str(existing) + " " + stripped).strip() if existing else stripped
                continue
        if ":" not in line:
            continue
        key, _, val = line.partition(":")
        key = key.strip()
        val = val.strip()
        # A lone wiki link also starts with "[" and ends with "]", so it must be
        # recognised before the inline-list branch or it would be parsed as the
        # one-element list ["[target]"].
        link = WIKI_LINK_RE.fullmatch(val.strip("'\"")) if val else None
        if link:
            fm[key] = link.group(1).strip("'\"")
        elif val.startswith("[") and val.endswith("]"):
            inner = val[1:-1].strip()
            # Split on commas that are not inside a [[...]] pair.
            candidates = (_unwrap_fm_scalar(part) for part in re.split(r",(?![^\[]*\])", inner))
            fm[key] = [item for item in candidates if item]
        else:
            fm[key] = val.strip("'\"")
        current_key = key
    return fm, body, raw
def detect_tech(folder: str, tags: list[str], body: str) -> bool:
    """Heuristically decide whether a document is technical.

    A document counts as technical when its body contains a code fence, or
    when any TECH_KEYWORDS entry occurs as a substring of the lower-cased
    folder name, tags and first 2000 body characters.
    """
    probe = " ".join([folder, " ".join(tags), body[:2000]]).lower()
    if CODE_FENCE_RE.search(body) is not None:
        return True
    for keyword in TECH_KEYWORDS:
        if keyword in probe:
            return True
    return False
def trust_from_confidence(conf_str: str | None) -> str:
    """Map a confidence score to a source trust grade ("A", "B" or "C").

    Per-user policy (2026-05-08): LLM-augmented entries are graded A (with
    `inferred_by` metadata for traceability), not C as the default P-Reinforce
    skill suggests, so the mapping is deliberately biased upward: a missing
    or unparseable score yields "A".

    Thresholds: >= 0.80 -> "A", >= 0.65 -> "B", anything else -> "C".
    """
    if not conf_str:
        return "A"  # user policy: trust the model
    try:
        score = float(conf_str)
    except (TypeError, ValueError):
        return "A"
    if score >= 0.80:
        return "A"
    return "B" if score >= 0.65 else "C"
# Markers in body text that indicate a redirect even without `redirect_to`
# in the frontmatter (e.g. older P-Reinforce passes left text-only redirects).
# Group 1 of every pattern is the wiki-link target.
TEXT_REDIRECT_PATTERNS = [
    # "Redirected to: [[X]]", optionally wrapped in *emphasis*.
    re.compile(r"\*?Redirected to:\s*\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]\*?", re.IGNORECASE),
    # "[[X]]로 통합되었습니다" / "[[X]]으로 통합되었습니다" (particle optional).
    # The two-syllable particle must be listed before its one-syllable prefix.
    re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]\s*(?:으로|으|로)?\s*통합되었습니다"),
    # "통합되었습니다 ... [[X]]" on the same line.
    re.compile(r"통합되었습니다.*?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]"),
    # "이 문서는 ... [[X]] ... 로 통합" on the same line.
    re.compile(r"이 문서는.*?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\].*?로 통합"),
]


def detect_text_redirect(body: str) -> str | None:
    """Find a text-only redirect notice near the top of a document.

    Only the first 1500 characters of *body* are inspected. Patterns are tried
    in the order listed in TEXT_REDIRECT_PATTERNS and the first hit wins.

    Returns:
        The redirect target (wiki-link target without alias, whitespace
        trimmed), or None when no notice is found.
    """
    body_top = body[:1500]  # only check the top of the doc
    for pattern in TEXT_REDIRECT_PATTERNS:
        hit = pattern.search(body_top)
        if hit:
            return hit.group(1).strip()
    return None
def slugify_id(filename: str, today: str | None = None) -> str:
    """Build a ``wiki-YYYY-MMDD-<slug>`` id from a file name.

    Args:
        filename: File stem to slugify. It is NFKC-normalized and lower-cased;
            every run of characters other than ``a-z``, ``0-9`` and Hangul
            syllables becomes a single hyphen.
        today: Date stamp as ``YYYYMMDD`` or ISO ``YYYY-MM-DD``. Defaults to
            the current date.

    Returns:
        The id string. The slug part is at most 32 characters, never starts or
        ends with a hyphen, and is ``doc`` when nothing usable remains.
    """
    # Normalize the stamp whether it was supplied or generated, so an ISO date
    # passed by the caller is not sliced through its hyphens.
    stamp = (today or date.today().isoformat()).replace("-", "")[:8]
    slug = unicodedata.normalize("NFKC", filename).lower()
    slug = re.sub(r"[^a-z0-9가-힣]+", "-", slug).strip("-")
    # Truncation can cut right after a separator; trim it again.
    slug = slug[:32].rstrip("-") or "doc"
    return f"wiki-{stamp[:4]}-{stamp[4:8]}-{slug}"
def quote_yaml(s: str) -> str:
    """Render a scalar for the frontmatter, double-quoting it when needed.

    ``None`` becomes ``""`` (an explicit empty string). Any other value is
    converted with ``str()`` and wrapped in double quotes — with embedded
    double quotes backslash-escaped — when it contains a newline or one of
    the characters ``: # [ ] { } , & * ! | > ' " % @ ` ``. Everything else is
    returned unquoted.
    """
    if s is None:
        return '""'
    text = str(s)
    needs_quotes = "\n" in text or any(ch in text for ch in ":#[]{},&*!|>'\"%@`")
    if not needs_quotes:
        return text
    return '"' + text.replace('"', '\\"') + '"'
def build_frontmatter(fm: dict, file_path: Path, body: str) -> dict:
    """Derive the normalized frontmatter from the old one plus path heuristics.

    Args:
        fm: Frontmatter as returned by parse_frontmatter().
        file_path: Location of the document; its stem supplies the title and
            (when needed) the id, its parent folder name the category hint.
        body: Document text after the frontmatter.

    Returns:
        A new dict; *fm* is not modified.
    """
    folder = file_path.parent.name
    stem = file_path.stem
    hinted_category = FOLDER_CATEGORY_HINTS.get(folder, "10_Wiki/Topics")
    result: dict = {}

    # A body-level redirect notice is only consulted when the frontmatter
    # does not already declare one.
    declared_redirect = fm.get("redirect_to")
    text_redirect = None if declared_redirect else detect_text_redirect(body)

    # id: keep an existing "wiki-..." id unless it still carries [[...]] markup.
    legacy_id = fm.get("id")
    legacy_text = str(legacy_id) if legacy_id else ""
    keep_legacy = legacy_text.startswith("wiki-") and not WIKI_LINK_RE.search(legacy_text)
    result["id"] = legacy_text if keep_legacy else slugify_id(stem)

    # title: always derived from the file name.
    result["title"] = stem.replace("-", " ").replace("_", " ").strip()

    # category: replace missing, "Unified", wiki-link or 💡-decorated values
    # (e.g. "10_Wiki/💡 Topics/AI") with the folder hint.
    category = fm.get("category")
    replace_category = category in (None, "", "Unified") or (
        isinstance(category, str) and (category.startswith("[[") or "💡" in category)
    )
    result["category"] = hinted_category if replace_category else category

    # status / canonical_id: redirects are always "merged".
    redirect_target = declared_redirect or text_redirect
    if redirect_target:
        result["redirect_to"] = redirect_target
        result["status"] = "merged"
        result["canonical_id"] = fm.get("canonical_id") or redirect_target  # best-effort
    else:
        prior_status = fm.get("status")
        if prior_status in ("verified", "merged", "deprecated"):
            result["status"] = prior_status
        elif len(body.strip()) < 200:
            result["status"] = "draft"
        else:
            result["status"] = "needs_review"
        result["canonical_id"] = fm.get("canonical_id") or "self"

    # aliases: prior aliases, with the undecorated legacy id put first.
    aliases = fm.get("aliases") or []
    if isinstance(aliases, str):
        aliases = [aliases]
    if isinstance(legacy_id, str) and legacy_id:
        undecorated = WIKI_LINK_RE.sub(r"\1", legacy_id)
        if undecorated and undecorated not in aliases:
            aliases = [undecorated] + aliases
    result["aliases"] = aliases

    result["duplicate_of"] = fm.get("duplicate_of") or "none"

    raw_confidence = fm.get("confidence_score")
    result["source_trust_level"] = fm.get("source_trust_level") or trust_from_confidence(raw_confidence)

    # confidence_score: keep a parseable value, otherwise the policy default.
    try:
        confidence = float(raw_confidence) if raw_confidence else 0.92
    except (TypeError, ValueError):
        confidence = 0.92
    result["confidence_score"] = confidence

    # tags: strip wiki-link decoration and stray brackets/quotes.
    tags = fm.get("tags") or []
    if isinstance(tags, str):
        tags = [tags]
    stripped_tags = (
        WIKI_LINK_RE.sub(r"\1", tag).strip("[]'\" ") if isinstance(tag, str) else tag
        for tag in tags
    )
    cleaned_tags = [tag for tag in stripped_tags if tag]
    result["tags"] = cleaned_tags or ["uncategorized"]

    raw_sources = fm.get("raw_sources") or []
    result["raw_sources"] = [raw_sources] if isinstance(raw_sources, str) else raw_sources

    result["last_reinforced"] = fm.get("last_reinforced") or date.today().isoformat()
    result["github_commit"] = fm.get("github_commit") or "pending"

    # inferred_by: traceability for entries this pass promoted to "A".
    if fm.get("inferred_by"):
        result["inferred_by"] = fm["inferred_by"]
    elif result["source_trust_level"] == "A" and not fm.get("source_trust_level"):
        result["inferred_by"] = "Claude Opus 4.7 (auto-normalize 2026-05-08)"

    # tech_stack: only for documents detected as technical.
    if detect_tech(folder, cleaned_tags, body):
        previous_stack = fm.get("tech_stack")
        if isinstance(previous_stack, dict):
            result["tech_stack"] = previous_stack
        else:
            result["tech_stack"] = {"language": "unspecified", "framework": "unspecified"}
    return result
def render_frontmatter(fm: dict) -> str:
    """Serialize *fm* as a ``---`` fenced frontmatter block.

    Only the known fields are written, in a fixed order; fields missing from
    *fm* are skipped. Lists are written inline, dicts as one nested level,
    floats verbatim, and everything else through quote_yaml().
    """
    field_order = (
        "id", "title", "category", "status", "redirect_to", "canonical_id", "aliases",
        "duplicate_of", "source_trust_level", "confidence_score",
        "tags", "raw_sources", "last_reinforced", "github_commit",
        "inferred_by", "tech_stack",
    )
    rendered = ["---"]
    for field in field_order:
        if field not in fm:
            continue
        value = fm[field]
        if isinstance(value, list):
            # An empty list joins to "" and therefore renders as "[]".
            inline = ", ".join(quote_yaml(entry) for entry in value)
            rendered.append(f"{field}: [{inline}]")
        elif isinstance(value, dict):
            rendered.append(f"{field}:")
            rendered.extend(f" {sub}: {quote_yaml(sub_value)}" for sub, sub_value in value.items())
        elif isinstance(value, float):
            rendered.append(f"{field}: {value}")
        else:
            rendered.append(f"{field}: {quote_yaml(value)}")
    rendered.append("---")
    return "\n".join(rendered)
def normalize_headers(body: str) -> str:
    """Rewrite legacy section headers (levels 2-6) to the template names.

    For each heading the leading emoji/punctuation and a trailing "(...)" are
    dropped, the rest is lower-cased and looked up in HEADER_RENAMES; if that
    fails, a second lookup is tried with remaining punctuation removed.
    Headings without a match are left exactly as they were.

    Returns:
        The body with matching headings replaced; the heading level and the
        line breaks after the heading are preserved.
    """
    def rename(match: re.Match) -> str:
        level = match.group(1)
        heading = match.group(2).strip()
        # Strip leading emoji/punctuation/numbering, then a trailing "(...)".
        bare = EMOJI_AT_START_RE.sub("", heading).strip()
        bare = re.sub(r"\s*\(.*?\)\s*$", "", bare).strip()
        lookup = bare.lower()
        if lookup not in HEADER_RENAMES:
            without_punct = re.sub(r"[^\w가-힣 -]", "", lookup).strip()
            if without_punct in HEADER_RENAMES:
                lookup = without_punct
        replacement = HEADER_RENAMES.get(lookup)
        if replacement is None:
            return match.group(0)
        # HEADING_RE ends in "\s*$", which can swallow the newline in front of
        # a following blank line. Put those newlines back so the blank line
        # after a renamed heading is not lost (trailing spaces stay dropped).
        swallowed = match.group(0)[match.end(2) - match.start():]
        return f"{level} {replacement}" + "\n" * swallowed.count("\n")

    return HEADING_RE.sub(rename, body)
def find_section_starts(body: str) -> dict[str, int]:
    """Locate the template sections present in *body*.

    Returns:
        ``{canonical_section_name: character_offset}`` where the offset is the
        start of the first heading (levels 2-6) that maps to that section,
        either through a HEADER_RENAMES key or by matching the canonical
        heading text exactly.
    """
    found: dict[str, int] = {}
    for heading in HEADING_RE.finditer(body):
        raw_text = heading.group(2).strip()
        lowered = EMOJI_AT_START_RE.sub("", raw_text).strip().lower()
        lowered = re.sub(r"\s*\(.*?\)\s*$", "", lowered).strip()
        for alias, canonical in HEADER_RENAMES.items():
            if lowered != alias.lower() and raw_text != canonical:
                continue
            # Keep the first occurrence only.
            found.setdefault(canonical, heading.start())
    return found
# Canonical headings every non-redirect document must contain;
# append_missing_sections() appends a scaffold for each one that is absent,
# in the order listed here.
REQUIRED_SECTIONS_BASE = [
"📌 한 줄 통찰 (The Karpathy Summary)",
"📖 구조화된 지식 (Synthesized Content)",
"🤖 LLM 활용 힌트 (How to Use This Knowledge)",
"🧪 검증 상태 (Validation)",
"🧬 중복 검사 (Duplicate Check)",
"⚠️ 모순 및 업데이트 (Contradictions & Updates)",
"🔗 지식 연결 (Graph)",
"🕓 변경 이력 (Changelog)",
]
# Additional required headings for documents that detect_tech() classifies as
# technical; checked after the base list.
REQUIRED_SECTIONS_TECH_EXTRA = [
"💻 코드 패턴 (Code Patterns)",
"🤔 의사결정 기준 (Decision Criteria)",
"❌ 안티패턴 (Anti-Patterns)",
]
def append_missing_sections(body: str, is_tech: bool, is_stub: bool, fm: dict) -> str:
    """Append a TODO scaffold for every required section missing from *body*.

    Args:
        body: Document body, with headers already normalized.
        is_tech: Also require the tech-only sections.
        is_stub: Insert the "AI augmentation needed" notice before the scaffolds.
        fm: Normalized frontmatter; supplies the values quoted in the notice
            and in the scaffolds.

    Returns:
        *body* unchanged when nothing is missing and it is not a stub;
        otherwise the right-stripped body followed by the notice and scaffolds.
    """
    found = find_section_starts(body)
    required = REQUIRED_SECTIONS_BASE + (REQUIRED_SECTIONS_TECH_EXTRA if is_tech else [])
    missing = [section for section in required if section not in found]
    if not missing and not is_stub:
        return body
    out = [body.rstrip()]
    if is_stub:
        out.append("\n\n> 🤖 **[AI 추론 보강 필요]** — 본문이 200자 미만이라 P-Reinforce가 빈약 stub으로 분류했습니다.")
        # Quote the trust level actually recorded in the frontmatter; a fixed
        # "C" contradicts documents that were graded "A" or "B".
        out.append(
            f"> source_trust_level=`{fm.get('source_trust_level', 'C')}` (AI 보강분), "
            f"confidence_score=`{fm.get('confidence_score', 0.7)}`로 표시되어 있습니다."
        )
        out.append("> 사용자 검증 후 trust_level 상향 조정 가능.\n")
    for section in missing:
        out.append(f"\n## {section}\n")
        out.append(_scaffold_for(section, fm))
    return "\n".join(out)
def _scaffold_for(section: str, fm: dict) -> str:
    """Return the placeholder Markdown for one canonical section.

    The section is identified by the emoji its heading starts with.

    Args:
        section: Canonical heading text, e.g. "🧪 검증 상태 (Validation)".
        fm: Normalized frontmatter; `status` and `source_trust_level` are
            quoted in the Validation and Changelog scaffolds.

    Returns:
        The scaffold text, or "*(TODO)*" for an unrecognized section.
    """
    if section.startswith("📌"):
        return "> *(TODO: 한 문장으로 핵심 통찰을 작성. \"X는 Y 조건에서 Z 효과를 낸다\" 구조 권장.)*"
    if section.startswith("📖"):
        return "**추출된 패턴:**\n> *(TODO)*\n\n**세부 내용:**\n- *(TODO)*"
    if section.startswith("💻"):
        return "**패턴 1:** *(TODO: 이 프로젝트 컨벤션 반영한 구조 스켈레톤)*\n\n```text\n# TODO\n```"
    if section.startswith("🤔"):
        return "**선택 A를 써야 할 때:**\n- *(TODO)*\n\n**선택 B를 써야 할 때:**\n- *(TODO)*\n\n**기본값:**\n> *(TODO)*"
    # The prefix must be the Anti-Patterns emoji: an empty prefix matches every
    # string and would make all the branches below unreachable.
    if section.startswith("❌"):
        return "- **[안티패턴]:** *(TODO: 무엇을 하면 안 되는가 + 이유 + 대신 무엇을)*"
    if section.startswith("🤖") and "활용 힌트" in section:
        return "**언제 이 지식을 쓰는가:**\n- *(TODO)*\n\n**언제 쓰면 안 되는가:**\n- *(TODO)*"
    if section.startswith("🧪"):
        return (
            f"- **정보 상태:** {fm.get('status', 'draft')}\n"
            f"- **출처 신뢰도:** {fm.get('source_trust_level', 'C')}\n"
            f"- **검토 이유:** *(P-Reinforce Phase 1 자동 정규화. 본문 검증 필요.)*"
        )
    if section.startswith("🧬"):
        return (
            "- **기존 유사 문서:** *(TODO: 인덱서 클러스터 리포트 참조)*\n"
            "- **처리 방식:** UPDATE (자동 정규화)\n"
            "- **처리 이유:** Phase 1 정규화 — 옛 템플릿/누락 필드 보강."
        )
    if section.startswith("⚠️"):
        return "- **과거 데이터와의 충돌:** 없음\n- **정책 변화:** 없음"
    if section.startswith("🔗"):
        return (
            "- **Parent:** [[10_Wiki/Topics]]\n"
            "- **Related:** *(TODO: 최소 2개)*\n"
            "- **Opposite / Trade-off:** *(TODO)*\n"
            "- **Raw Source:** 직접 입력"
        )
    if section.startswith("🕓"):
        today = date.today().isoformat()
        trust = fm.get("source_trust_level", "C")
        return (
            "| 날짜 | 변경 내용 | 처리 방식 | 신뢰도 |\n"
            "|------|-----------|-----------|--------|\n"
            f"| {today} | P-Reinforce Phase 1 정규화 (frontmatter + 헤더 표준화) | UPDATE | {trust} |"
        )
    return "*(TODO)*"
def normalize_file(file_path: Path) -> str:
    """Return the normalized text of one wiki document without writing anything.

    Steps: parse the frontmatter, rebuild it, rename legacy headers, then
    append scaffolds for missing sections. Redirect documents only get the new
    frontmatter; their body is kept as is.

    Args:
        file_path: Document to read.

    Returns:
        New frontmatter, a blank line, and the processed body.
    """
    # "utf-8-sig" reads plain UTF-8 too but drops a leading byte-order mark.
    # With the BOM left in place FRONTMATTER_RE cannot match at position 0, so
    # the existing frontmatter would be treated as body text and a second
    # frontmatter block would be stacked on top of it.
    # NOTE: errors="replace" substitutes undecodable bytes with U+FFFD.
    text = file_path.read_text(encoding="utf-8-sig", errors="replace")
    fm_old, body, _raw = parse_frontmatter(text)
    fm_new = build_frontmatter(fm_old, file_path, body)
    header = render_frontmatter(fm_new) + "\n\n"
    # Redirect documents stay minimal — don't add scaffold sections.
    if fm_new.get("redirect_to"):
        return header + body.lstrip("\n")
    # Header renames first, so section detection sees canonical names.
    renamed = normalize_headers(body)
    is_tech = detect_tech(file_path.parent.name, fm_new.get("tags", []), renamed)
    is_stub = len(renamed.strip()) < 200
    completed = append_missing_sections(renamed, is_tech, is_stub, fm_new)
    return header + completed.lstrip("\n")
def iter_knowledge_files() -> list[Path]:
    """Collect every ``.md`` file under TOPICS that counts as knowledge content.

    Directories named in the skip set are not descended into, and files whose
    ROOT-relative path (forward slashes, leading "/") contains one of the
    excluded fragments — operational paths such as sessions, _agents or
    _company — are left out.
    """
    import os as _os

    excluded_fragments = (
        "/sessions/", "/_agents/", "/_company/", "/memory/",
        "/Project_Logs/", "/Harness_Research_", "/docs/records/",
        "/_Archive_Orphans/", "/Post_Drafts/", "/UX_Scenarios/",
    )
    pruned_dirs = {".obsidian", ".git", "__pycache__", "node_modules"}
    collected = []
    for dirpath, dirs, files in _os.walk(TOPICS):
        # In-place assignment so os.walk does not descend into pruned folders.
        dirs[:] = [name for name in dirs if name not in pruned_dirs]
        base = Path(dirpath)
        for name in files:
            if not name.endswith(".md"):
                continue
            candidate = base / name
            rel = "/" + str(candidate.relative_to(ROOT)).replace("\\", "/")
            if not any(fragment in rel for fragment in excluded_fragments):
                collected.append(candidate)
    return collected
def _display_path(p: Path) -> str:
    """Return *p* relative to ROOT with forward slashes, or *p* itself when outside ROOT."""
    try:
        shown = p.relative_to(ROOT)
    except ValueError:
        # Absolute inputs may live outside the vault; report them as given.
        shown = p
    return str(shown).replace("\\", "/")


def _run_apply_all() -> int:
    """Normalize every knowledge file in place; return 0 if all succeeded, else 1."""
    files = iter_knowledge_files()
    ok = err = 0
    errors: list[tuple[str, str]] = []
    for i, p in enumerate(files, 1):
        try:
            normalized = normalize_file(p)
            p.write_text(normalized, encoding="utf-8")
            ok += 1
        except Exception as e:
            # Top-level boundary: record the failure and keep going.
            err += 1
            errors.append((str(p), str(e)))
        if i % 200 == 0:
            print(f" ...{i}/{len(files)}", file=sys.stderr)
    print(f"DONE: {ok} OK, {err} errors out of {len(files)} files", file=sys.stderr)
    if errors:
        log = ROOT / "_tools" / "normalize_errors.log"
        log.write_text("\n".join(f"{p}\t{e}" for p, e in errors), encoding="utf-8")
        print(f" errors written to {log}", file=sys.stderr)
    return 0 if err == 0 else 1


def _run_batch(listing_path: str) -> int:
    """Write normalized samples for each path in the listing file; sources are untouched."""
    listing = Path(listing_path).read_text(encoding="utf-8").splitlines()
    out_dir = ROOT / "_tools" / "sample_normalized"
    out_dir.mkdir(parents=True, exist_ok=True)
    results = []
    used_names: set[str] = set()
    for line in listing:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        p = Path(line)
        if not p.is_absolute():
            p = ROOT / p
        if not p.exists():
            print(f"SKIP missing: {p}", file=sys.stderr)
            continue
        try:
            normalized = normalize_file(p)
        except Exception as e:
            print(f"FAIL {p}: {e}", file=sys.stderr)
            continue
        # Files with the same stem in different folders would otherwise
        # overwrite each other's sample; add a numeric suffix on collision
        # (compared case-insensitively, as on Windows file systems).
        out_name = p.stem + ".normalized.md"
        suffix = 2
        while out_name.lower() in used_names:
            out_name = f"{p.stem}-{suffix}.normalized.md"
            suffix += 1
        used_names.add(out_name.lower())
        out_path = out_dir / out_name
        out_path.write_text(normalized, encoding="utf-8")
        results.append({"src": _display_path(p),
                        "out": _display_path(out_path),
                        "src_chars": len(p.read_text(encoding="utf-8", errors="replace")),
                        "out_chars": len(normalized)})
    (out_dir / "_manifest.json").write_text(json.dumps(results, ensure_ascii=False, indent=1), encoding="utf-8")
    print(f"Wrote {len(results)} normalized samples to {out_dir}", file=sys.stderr)
    return 0


def _run_single(path_arg: str, out_arg: str | None) -> int:
    """Normalize one file to stdout or to *out_arg*; return 2 if the input is missing."""
    p = Path(path_arg)
    if not p.is_absolute():
        p = ROOT / p
    if not p.exists():
        print(f"FAIL: {p} does not exist", file=sys.stderr)
        return 2
    out = normalize_file(p)
    if out_arg:
        Path(out_arg).write_text(out, encoding="utf-8")
        print(f"Wrote {out_arg}", file=sys.stderr)
    else:
        sys.stdout.write(out)
    return 0


def main() -> int:
    """Command-line entry point.

    Modes, in order of precedence: ``--apply-all`` (in-place, destructive),
    ``--batch LISTING`` (samples under _tools/sample_normalized/), or a single
    ``path`` (stdout, or ``--out PATH``).

    Returns:
        Process exit code: 0 on success, 1 when ``--apply-all`` had failures,
        2 when the single input path does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("path", nargs="?", help="Relative path under E:/Wiki/2nd/ or absolute")
    parser.add_argument("--out", help="Write normalized output to PATH instead of stdout")
    parser.add_argument("--batch", help="Path to a text file with one input path per line; outputs go under _tools/sample_normalized/")
    parser.add_argument("--apply-all", action="store_true", help="Normalize ALL knowledge files in-place (operational paths excluded). DESTRUCTIVE.")
    args = parser.parse_args()
    if args.apply_all:
        return _run_apply_all()
    if args.batch:
        return _run_batch(args.batch)
    if not args.path:
        parser.error("path or --batch required")
    return _run_single(args.path, args.out)


if __name__ == "__main__":
    sys.exit(main())