feat: Wiki 지식 자산 업데이트 - UX Scenarios, Frontend, Game Design, Topics 추가 [2026-05-08]
This commit is contained in:
@@ -0,0 +1,631 @@
|
||||
"""
|
||||
P-Reinforce Phase 1 — Template Normalizer (DRY-RUN by default)
|
||||
==============================================================
|
||||
Reads a wiki .md file and emits a normalized version that conforms to the
|
||||
current P-Reinforce template (templates/wiki_document.md).
|
||||
|
||||
What it does mechanically (NO LLM calls):
|
||||
1. Cleans frontmatter:
|
||||
- Strips [[wiki-link]] decoration from id/category/tags
|
||||
- Adds missing fields: canonical_id, aliases, status,
|
||||
source_trust_level, raw_sources, duplicate_of, tech_stack (if tech)
|
||||
- Re-derives source_trust_level from confidence_score if missing
|
||||
     - Preserves the original id as an entry in `aliases`
|
||||
2. Renames legacy section headers to current template:
|
||||
- "Brief Summary" -> "📌 한 줄 통찰 (The Karpathy Summary)"
|
||||
- "Core Content" -> "📖 구조화된 지식 (Synthesized Content)"
|
||||
- "Trade-offs & Caveats" -> merged into "⚠️ 모순 및 업데이트"
|
||||
- "Knowledge Connections" -> "🔗 지식 연결 (Graph)"
|
||||
3. Adds missing required sections as scaffold (with TODO markers)
|
||||
so LLM can fill them later. Tech docs get Code Patterns / Decision
|
||||
Criteria / Anti-Patterns scaffolds.
|
||||
4. For stubs (<200 body chars), inserts a `🤖 [AI 추론 보강 필요]`
|
||||
block — does NOT generate content (that step is interactive).
|
||||
|
||||
Usage:
|
||||
    python p_reinforce_normalize.py <relative_path>              # dry-run, prints the normalized document to stdout
|
||||
python p_reinforce_normalize.py <relative_path> --out PATH # write to PATH
|
||||
python p_reinforce_normalize.py --batch <input_listing> # multi-file dry-run
|
||||
|
||||
Default mode is DRY-RUN. No source files are modified unless --apply-all is given (DESTRUCTIVE: rewrites knowledge files in place).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import unicodedata
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
# Vault root, and the knowledge subtree walked by iter_knowledge_files().
ROOT = Path(r"E:/Wiki/2nd")
TOPICS = ROOT / "10_Wiki" / "Topics"

# Leading `---` ... `---` block; group 1 is the raw frontmatter text.
FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n?", re.DOTALL)
# [[target]] or [[target|alias]]; group 1 is the link target.
WIKI_LINK_RE = re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]")
# Level-1 heading; group 1 is the title text.
H1_RE = re.compile(r"^# +(.+)$", re.MULTILINE)
# Level 2-6 heading; group 1 is the hash run, group 2 the heading text.
# `[^\S\n]` is "whitespace except newline": with plain `\s` the match could
# run across line breaks, so a heading swallowed the newline after it and a
# bare `##` line captured the following paragraph as its text.
HEADING_RE = re.compile(r"^(#{2,6})[^\S\n]+(.+?)[^\S\n]*$", re.MULTILINE)
# Opening of a fenced code block; group 1 is the (possibly empty) info string.
CODE_FENCE_RE = re.compile(r"```([a-zA-Z0-9_-]*)\n", re.MULTILINE)
|
||||
|
||||
# Header rename table, grouped by canonical template title. Each alias is the
# lowercase heading text a legacy document may use; lookups are done against
# the heading text after leading emoji/punctuation has been removed.
_HEADER_ALIASES = (
    ("📌 한 줄 통찰 (The Karpathy Summary)", (
        "brief summary",
        "the karpathy summary",
        "한 줄 통찰",
        "한 줄 통찰 (the karpathy summary)",
    )),
    ("📖 구조화된 지식 (Synthesized Content)", (
        "core content",
        "synthesized content",
        "구조화된 지식",
        "구조화된 지식 (synthesized content)",
    )),
    ("⚠️ 모순 및 업데이트 (Contradictions & Updates)", (
        "trade-offs & caveats",
        "tradeoffs & caveats",
        "trade-offs and caveats",
        "contradictions & rl update",
        "contradictions & updates",
        "rl update",
        "모순 및 업데이트",
        "모순 및 업데이트 (contradictions & updates)",
        "모순 및 업데이트 (contradictions & rl update)",
    )),
    ("🔗 지식 연결 (Graph)", (
        "knowledge connections",
        "graph",
        "지식 연결",
        "지식 연결 (graph)",
    )),
    ("💻 코드 패턴 (Code Patterns)", (
        "code patterns",
        "코드 패턴",
    )),
    ("🤔 의사결정 기준 (Decision Criteria)", (
        "decision criteria",
        "의사결정 기준",
    )),
    ("❌ 안티패턴 (Anti-Patterns)", (
        "anti-patterns",
        "antipatterns",
        "안티패턴",
    )),
    ("🤖 LLM 활용 힌트 (How to Use This Knowledge)", (
        "how to use this knowledge",
        "llm 활용 힌트",
    )),
    ("🧪 검증 상태 (Validation)", (
        "validation",
        "검증 상태",
    )),
    ("🧬 중복 검사 (Duplicate Check)", (
        "duplicate check",
        "중복 검사",
    )),
    ("🕓 변경 이력 (Changelog)", (
        "changelog",
        "변경 이력",
    )),
)

# alias -> canonical title, in the same insertion order as the groups above.
HEADER_RENAMES = {
    alias: canonical
    for canonical, aliases in _HEADER_ALIASES
    for alias in aliases
}

# Matches the run of characters at the start of a string that are neither
# word characters nor Hangul syllables (emoji, punctuation, spaces).
EMOJI_AT_START_RE = re.compile(r"^[^\w가-힣]*", re.UNICODE)
|
||||
|
||||
# Parent-folder name -> wiki category. Best-effort hints only; folders not
# listed here are resolved by the caller's own default.
_CATEGORY_FOLDERS = (
    ("10_Wiki/Topics", (
        "AI",
        "AI_and_ML",
        "Architecture",
        "Backend",
        "Frontend",
        "Frontend_Mastery",
        "DevOps_and_Security",
        "Computer_Science_and_Theory",
        "Programming & Language",
        "Programming & Tools",
        "Programming & Web",
        "Programming & Formal Methods",
    )),
    ("10_Wiki/Topics_Art", (
        "Visual_Effects",
        "Graphics & Performance",
        "UI_UX_Assets",
        "Design & Experience",
    )),
    ("10_Wiki/Topics_GD", (
        "Game Design",
        "Game_Design",
        "Level_Design",
        "Balancing",
        "Core_Systems",
        "Skybound",
        "Storytelling",
    )),
    ("10_Wiki/Topics_Biz", (
        "Economics",
        "Economy",
        "Economics & Algorithms",
        "Business_Strategy",
        "Market_Research",
        "Partnerships",
    )),
    ("10_Wiki/Topics_Blog", (
        "Content_Strategy",
        "Post_Drafts",
        "External_Media",
    )),
)

FOLDER_CATEGORY_HINTS = {
    folder: category
    for category, folders in _CATEGORY_FOLDERS
    for folder in folders
}
|
||||
|
||||
# Lowercase substrings whose presence marks a document as technical.
# "go " and "java " carry a trailing space on purpose: they are matched as
# plain substrings, and the space keeps them from matching inside longer
# words such as "google" or "javascript".
TECH_KEYWORDS = {
    # general engineering vocabulary
    "architecture",
    "algorithm",
    "programming",
    "code",
    "frontend",
    "backend",
    # language tooling and API styles
    "compiler",
    "interpreter",
    "runtime",
    "framework",
    "api",
    "rest",
    "graphql",
    # infrastructure and frameworks
    "kubernetes",
    "docker",
    "kafka",
    "fastapi",
    "react",
    "vue",
    "svelte",
    "next",
    # languages
    "typescript",
    "python",
    "rust",
    "javascript",
    "go ",
    "java ",
    "c++",
    "swift",
    # data stores and distribution
    "database",
    "sql",
    "postgres",
    "redis",
    "mongo",
    "cache",
    "distributed",
}
|
||||
|
||||
|
||||
def parse_frontmatter(text: str) -> tuple[dict, str, str]:
    """Split *text* into parsed frontmatter, body and raw frontmatter.

    This is a forgiving, line-based reader rather than a YAML parser.

    Supported value shapes:
      * ``key: value``            -> str (surrounding quotes removed)
      * ``key: [[Target|alias]]`` -> str ``"Target"``; a value that is exactly
        one wiki-link is a scalar, not a list
      * ``key: [a, b, [[C]]]``    -> list of str, wiki-links unwrapped
      * ``key:`` followed by indented ``- item`` lines -> list of str
      * any other indented line is appended to the previous key's value,
        separated by a single space

    Returns:
        ``(fm_dict, body, fm_raw)``. When *text* has no leading frontmatter
        block the result is ``({}, text, "")``.
    """
    m = FRONTMATTER_RE.match(text)
    if not m:
        return {}, text, ""
    raw = m.group(1)
    body = text[m.end():]

    def clean_item(item: str) -> str:
        # Trim, drop surrounding quotes, unwrap a [[wiki-link]] item.
        item = item.strip().strip("'\"")
        link = WIKI_LINK_RE.fullmatch(item)
        return link.group(1) if link else item

    fm: dict = {}
    current_key: str | None = None
    for line in raw.splitlines():
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue

        if line.startswith((" ", "\t")) and current_key:
            existing = fm.get(current_key)
            is_seq_item = stripped == "-" or stripped.startswith("- ")
            if is_seq_item and (existing == "" or isinstance(existing, list)):
                # Block sequence under `key:`. Previously these lines were
                # glued into one string such as "- a - b".
                items = existing if isinstance(existing, list) else []
                item = clean_item(stripped[1:])
                if item:
                    items.append(item)
                fm[current_key] = items
            elif existing:
                fm[current_key] = (str(existing) + " " + stripped).strip()
            else:
                fm[current_key] = stripped
            continue

        if ":" not in line:
            continue
        key, _, val = line.partition(":")
        key = key.strip()
        val = val.strip()

        # Check for a lone wiki-link BEFORE the list test: "[[X]]" also starts
        # with "[" and ends with "]", and used to be parsed as ["[X]"].
        scalar_link = WIKI_LINK_RE.fullmatch(val.strip("'\"")) if val else None
        if scalar_link:
            fm[key] = scalar_link.group(1).strip("'\"")
        elif val.startswith("[") and val.endswith("]"):
            inner = val[1:-1].strip()
            # Split on commas that are not inside a nested [...] group.
            parts = re.split(r",(?![^\[]*\])", inner)
            fm[key] = [item for item in map(clean_item, parts) if item]
        else:
            fm[key] = val.strip("'\"")
        current_key = key
    return fm, body, raw
|
||||
|
||||
|
||||
def detect_tech(folder: str, tags: list[str], body: str) -> bool:
    """Heuristically decide whether a document is a technical one.

    A document counts as technical when its body contains a fenced code block
    opener, or when any TECH_KEYWORDS entry occurs as a substring of the
    lowercased folder name, tags and first 2000 body characters.
    """
    searchable = " ".join([folder, " ".join(tags), body[:2000]]).lower()
    if CODE_FENCE_RE.search(body) is not None:
        return True
    for keyword in TECH_KEYWORDS:
        if keyword in searchable:
            return True
    return False
|
||||
|
||||
|
||||
def trust_from_confidence(conf_str: str | None) -> str:
|
||||
"""Per-user policy (2026-05-08): LLM-augmented entries are graded A
|
||||
(with `inferred_by` metadata for traceability), not C as the default
|
||||
P-Reinforce skill suggests. The mapping below is therefore biased upward
|
||||
relative to the skill spec.
|
||||
"""
|
||||
if not conf_str:
|
||||
return "A" # user policy: trust the model
|
||||
try:
|
||||
c = float(conf_str)
|
||||
except (TypeError, ValueError):
|
||||
return "A"
|
||||
if c >= 0.95:
|
||||
return "A"
|
||||
if c >= 0.80:
|
||||
return "A"
|
||||
if c >= 0.65:
|
||||
return "B"
|
||||
return "C"
|
||||
|
||||
|
||||
# Markers in body text that indicate a redirect even without `redirect_to`
|
||||
# in the frontmatter (e.g. older P-Reinforce passes left text-only redirects).
|
||||
TEXT_REDIRECT_PATTERNS = [
|
||||
re.compile(r"\*?Redirected to:\s*\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]\*?", re.IGNORECASE),
|
||||
re.compile(r"\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]\s*(?:으|로|로)?\s*통합되었습니다"),
|
||||
re.compile(r"통합되었습니다.*?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\]"),
|
||||
re.compile(r"이 문서는.*?\[\[([^\]|]+?)(?:\|[^\]]+)?\]\].*?로 통합"),
|
||||
]
|
||||
|
||||
|
||||
def detect_text_redirect(body: str) -> str | None:
|
||||
"""Look for "Redirected to: [[X]]" or "[[X]]로 통합되었습니다" in body.
|
||||
Returns the canonical target name if found, else None."""
|
||||
body_top = body[:1500] # only check the top of the doc
|
||||
for pat in TEXT_REDIRECT_PATTERNS:
|
||||
m = pat.search(body_top)
|
||||
if m:
|
||||
return m.group(1).strip()
|
||||
return None
|
||||
|
||||
|
||||
def slugify_id(filename: str, today: str | None = None) -> str:
|
||||
today = today or date.today().isoformat().replace("-", "")[:8]
|
||||
slug = unicodedata.normalize("NFKC", filename).lower()
|
||||
slug = re.sub(r"[^a-z0-9가-힣]+", "-", slug).strip("-")
|
||||
if not slug:
|
||||
slug = "doc"
|
||||
return f"wiki-{today[:4]}-{today[4:8]}-{slug[:32]}"
|
||||
|
||||
|
||||
def quote_yaml(s: str) -> str:
    """Render *s* as a YAML scalar, double-quoting it only when needed.

    None becomes an empty quoted string. Any other value is converted with
    ``str()`` and wrapped in double quotes, with embedded double quotes
    backslash-escaped, when it contains a newline or one of the characters
    ``: # [ ] { } , & * ! | > ' " % @ `` ` ``. Otherwise the text is returned
    bare.
    """
    if s is None:
        return '""'
    text = str(s)
    needs_quotes = "\n" in text or not set(text).isdisjoint(":#[]{},&*!|>'\"%@`")
    if not needs_quotes:
        return text
    escaped = text.replace('"', '\\"')
    return f'"{escaped}"'
|
||||
|
||||
|
||||
def build_frontmatter(fm: dict, file_path: Path, body: str) -> dict:
    """Derive the normalized frontmatter for one document.

    Args:
        fm: Frontmatter parsed from the existing file (may be empty).
        file_path: Path of the document; its parent folder name and stem feed
            the category, id and title heuristics.
        body: Document text after the frontmatter block.

    Returns:
        A new dict holding only the template fields. Existing values are kept
        where present and usable; everything else is filled with defaults.
    """
    folder = file_path.parent.name
    stem = file_path.stem
    hinted_category = FOLDER_CATEGORY_HINTS.get(folder, "10_Wiki/Topics")
    out: dict = {}

    # A redirect may be declared in frontmatter or only stated in the body;
    # the body is scanned only when the frontmatter does not declare one.
    declared_redirect = fm.get("redirect_to")
    text_redirect = None if declared_redirect else detect_text_redirect(body)

    # id: keep an existing id that already has the "wiki-" form and carries
    # no wiki-link decoration; otherwise derive one from the file stem.
    legacy_id = fm.get("id")
    legacy_text = str(legacy_id) if legacy_id else ""
    keep_legacy = legacy_text.startswith("wiki-") and not WIKI_LINK_RE.search(legacy_text)
    out["id"] = legacy_text if keep_legacy else slugify_id(stem)

    # title: always derived from the file stem.
    out["title"] = stem.replace("-", " ").replace("_", " ").strip()

    # category: replace missing, "Unified", wiki-link-decorated or
    # emoji-decorated ("10_Wiki/💡 Topics/AI") values with the folder hint.
    category = fm.get("category")
    if isinstance(category, str):
        use_hint = (
            category in ("", "Unified")
            or category.startswith("[[")
            or "💡" in category
        )
    else:
        use_hint = category is None
    out["category"] = hinted_category if use_hint else category

    # status / canonical_id: redirects are always "merged" and point at
    # their target (best-effort); other documents keep a terminal status or
    # are graded by body length.
    target = declared_redirect or text_redirect
    if target:
        out["redirect_to"] = target
        out["status"] = "merged"
        out["canonical_id"] = fm.get("canonical_id") or target
    else:
        prior_status = fm.get("status")
        if prior_status in ("verified", "merged", "deprecated"):
            out["status"] = prior_status
        elif len(body.strip()) < 200:
            out["status"] = "draft"
        else:
            out["status"] = "needs_review"
        out["canonical_id"] = fm.get("canonical_id") or "self"

    # aliases: prior aliases, with the undecorated legacy id put first.
    aliases = fm.get("aliases") or []
    if isinstance(aliases, str):
        aliases = [aliases]
    if legacy_id and isinstance(legacy_id, str):
        bare_legacy = WIKI_LINK_RE.sub(r"\1", legacy_id)
        if bare_legacy and bare_legacy not in aliases:
            aliases = [bare_legacy] + aliases
    out["aliases"] = aliases

    out["duplicate_of"] = fm.get("duplicate_of") or "none"

    # source_trust_level: an explicit value wins, else derive from confidence.
    raw_confidence = fm.get("confidence_score")
    out["source_trust_level"] = (
        fm.get("source_trust_level") or trust_from_confidence(raw_confidence)
    )

    # confidence_score: keep a parseable value, else the policy default 0.92.
    confidence = 0.92
    if raw_confidence:
        try:
            confidence = float(raw_confidence)
        except (TypeError, ValueError):
            confidence = 0.92
    out["confidence_score"] = confidence

    # tags: remove wiki-link decoration and stray brackets/quotes.
    tags = fm.get("tags") or []
    if isinstance(tags, str):
        tags = [tags]
    cleaned_tags = []
    for tag in tags:
        if isinstance(tag, str):
            tag = WIKI_LINK_RE.sub(r"\1", tag).strip("[]'\" ")
        if tag:
            cleaned_tags.append(tag)
    out["tags"] = cleaned_tags or ["uncategorized"]

    raw_sources = fm.get("raw_sources") or []
    if isinstance(raw_sources, str):
        raw_sources = [raw_sources]
    out["raw_sources"] = raw_sources

    out["last_reinforced"] = fm.get("last_reinforced") or date.today().isoformat()
    out["github_commit"] = fm.get("github_commit") or "pending"

    # inferred_by: traceability for LLM-augmented entries (per user policy).
    # Recorded only when this run assigned grade A itself.
    if fm.get("inferred_by"):
        out["inferred_by"] = fm["inferred_by"]
    elif out["source_trust_level"] == "A" and not fm.get("source_trust_level"):
        out["inferred_by"] = "Claude Opus 4.7 (auto-normalize 2026-05-08)"

    # tech_stack: only for documents detected as technical.
    if detect_tech(folder, cleaned_tags, body):
        prior_stack = fm.get("tech_stack")
        if isinstance(prior_stack, dict):
            out["tech_stack"] = prior_stack
        else:
            out["tech_stack"] = {"language": "unspecified", "framework": "unspecified"}

    return out
|
||||
|
||||
|
||||
def render_frontmatter(fm: dict) -> str:
    """Serialize *fm* as a ``---`` delimited frontmatter block.

    Fields are written in a fixed order and keys outside that order are not
    emitted. Lists become inline ``[a, b]`` sequences, dicts become one nested
    level of ``key: value`` lines, floats are written bare, and every other
    value goes through quote_yaml(). The result has no trailing newline.
    """
    field_order = (
        "id", "title", "category", "status", "redirect_to", "canonical_id", "aliases",
        "duplicate_of", "source_trust_level", "confidence_score",
        "tags", "raw_sources", "last_reinforced", "github_commit",
        "inferred_by", "tech_stack",
    )

    def render_field(key: str, value) -> list[str]:
        # One logical field may expand to several output lines (dict case).
        if isinstance(value, list):
            if not value:
                return [f"{key}: []"]
            joined = ", ".join(quote_yaml(item) for item in value)
            return [f"{key}: [{joined}]"]
        if isinstance(value, dict):
            nested = [f" {sub}: {quote_yaml(val)}" for sub, val in value.items()]
            return [f"{key}:"] + nested
        if isinstance(value, float):
            return [f"{key}: {value}"]
        return [f"{key}: {quote_yaml(value)}"]

    rendered = ["---"]
    for key in field_order:
        if key in fm:
            rendered.extend(render_field(key, fm[key]))
    rendered.append("---")
    return "\n".join(rendered)
|
||||
|
||||
|
||||
def normalize_headers(body: str) -> str:
    """Rewrite legacy section headers to the current template titles.

    Each level 2-6 heading is looked up in HEADER_RENAMES after removing
    leading emoji/punctuation and a trailing parenthesized suffix, then
    lowercasing. Headings with no entry are left exactly as they were. The
    heading level and the line breaks following a renamed heading are kept.
    """
    def repl(m: re.Match) -> str:
        whole = m.group(0)
        base = m.start()
        # HEADING_RE separates hashes from text with `\s+`, which can span a
        # line break. Such a match is not a real single-line heading.
        if "\n" in whole[m.end(1) - base:m.start(2) - base]:
            return whole

        hashes = m.group(1)
        text = m.group(2).strip()
        # Strip leading emoji/punctuation for matching. Digits are word
        # characters, so numbered headings ("1. Foo") are not stripped.
        norm = EMOJI_AT_START_RE.sub("", text).strip()
        norm = re.sub(r"\s*\(.*?\)\s*$", "", norm).strip()  # drop trailing paren
        key = norm.lower()
        # Also try without punctuation.
        if key not in HEADER_RENAMES:
            key2 = re.sub(r"[^\w가-힣 -]", "", key).strip()
            if key2 in HEADER_RENAMES:
                key = key2

        canonical = HEADER_RENAMES.get(key)
        if canonical is None:
            return whole
        # The trailing `\s*` of HEADING_RE may have consumed line breaks after
        # the heading; re-emit them so the blank line below is not lost.
        trailing = whole[m.end(2) - base:]
        return f"{hashes} {canonical}" + "\n" * trailing.count("\n")

    return HEADING_RE.sub(repl, body)
|
||||
|
||||
|
||||
def find_section_starts(body: str) -> dict[str, int]:
    """Locate the template sections already present in *body*.

    Returns:
        ``{canonical_section_title: offset}`` where offset is the character
        index (``match.start()``) of the first heading recognized as that
        section, either through a HEADER_RENAMES alias or because the heading
        text already equals the canonical title.
    """
    offsets: dict[str, int] = {}
    for match in HEADING_RE.finditer(body):
        heading = match.group(2).strip()
        bare = EMOJI_AT_START_RE.sub("", heading).strip().lower()
        bare = re.sub(r"\s*\(.*?\)\s*$", "", bare).strip()
        recognized = (
            canonical
            for alias, canonical in HEADER_RENAMES.items()
            if bare == alias.lower() or heading == canonical
        )
        for canonical in recognized:
            # Keep the first occurrence only.
            offsets.setdefault(canonical, match.start())
    return offsets
|
||||
|
||||
|
||||
# Canonical section titles every normalized document must contain, in the
# order missing ones are appended.
REQUIRED_SECTIONS_BASE = [
    "📌 한 줄 통찰 (The Karpathy Summary)",
    "📖 구조화된 지식 (Synthesized Content)",
    "🤖 LLM 활용 힌트 (How to Use This Knowledge)",
    "🧪 검증 상태 (Validation)",
    "🧬 중복 검사 (Duplicate Check)",
    "⚠️ 모순 및 업데이트 (Contradictions & Updates)",
    "🔗 지식 연결 (Graph)",
    "🕓 변경 이력 (Changelog)",
]

# Additional sections required only for technical documents.
REQUIRED_SECTIONS_TECH_EXTRA = [
    "💻 코드 패턴 (Code Patterns)",
    "🤔 의사결정 기준 (Decision Criteria)",
    "❌ 안티패턴 (Anti-Patterns)",
]
|
||||
|
||||
|
||||
def append_missing_sections(body: str, is_tech: bool, is_stub: bool, fm: dict) -> str:
    """Append a TODO scaffold for every required section missing from *body*.

    Args:
        body: Document body whose headers are already normalized.
        is_tech: When true, the tech-only sections are required as well.
        is_stub: When true, an "AI augmentation needed" notice is inserted
            ahead of the scaffolds.
        fm: Normalized frontmatter; supplies the values quoted in the notice
            and in the scaffolds.

    Returns:
        *body* unchanged when nothing is missing and it is not a stub;
        otherwise the right-stripped body followed by the notice and/or the
        scaffold sections.
    """
    found = find_section_starts(body)
    required = REQUIRED_SECTIONS_BASE + (REQUIRED_SECTIONS_TECH_EXTRA if is_tech else [])
    missing = [section for section in required if section not in found]

    if not missing and not is_stub:
        return body

    out = [body.rstrip()]
    if is_stub:
        # Report the trust level actually written to the frontmatter. The
        # notice used to hard-code `C`, contradicting both the frontmatter
        # and the Validation scaffold when the level was graded A.
        trust = fm.get("source_trust_level", "C")
        out.append("\n\n> 🤖 **[AI 추론 보강 필요]** — 본문이 200자 미만이라 P-Reinforce가 빈약 stub으로 분류했습니다.")
        out.append(f"> source_trust_level=`{trust}` (AI 보강분), confidence_score=`{fm.get('confidence_score', 0.7)}`로 표시되어 있습니다.")
        out.append("> 사용자 검증 후 trust_level 상향 조정 가능.\n")
    for section in missing:
        out.append(f"\n## {section}\n")
        out.append(_scaffold_for(section, fm))
    return "\n".join(out)
|
||||
|
||||
|
||||
def _scaffold_for(section: str, fm: dict) -> str:
    """Return the placeholder body for one canonical section title.

    The section is identified by its leading emoji. The Validation and
    Changelog scaffolds read `status` / `source_trust_level` from *fm*;
    unknown sections get a bare TODO marker.
    """
    def validation() -> str:
        return (
            f"- **정보 상태:** {fm.get('status', 'draft')}\n"
            f"- **출처 신뢰도:** {fm.get('source_trust_level', 'C')}\n"
            f"- **검토 이유:** *(P-Reinforce Phase 1 자동 정규화. 본문 검증 필요.)*"
        )

    def changelog() -> str:
        today = date.today().isoformat()
        trust = fm.get("source_trust_level", "C")
        return (
            "| 날짜 | 변경 내용 | 처리 방식 | 신뢰도 |\n"
            "|------|-----------|-----------|--------|\n"
            f"| {today} | P-Reinforce Phase 1 정규화 (frontmatter + 헤더 표준화) | UPDATE | {trust} |"
        )

    # The robot emoji alone is not enough: only the "LLM usage hints" section
    # has a scaffold.
    if section.startswith("🤖") and "활용 힌트" in section:
        return "**언제 이 지식을 쓰는가:**\n- *(TODO)*\n\n**언제 쓰면 안 되는가:**\n- *(TODO)*"

    # (emoji prefix, scaffold text or zero-argument builder)
    scaffolds = (
        ("📌", "> *(TODO: 한 문장으로 핵심 통찰을 작성. \"X는 Y 조건에서 Z 효과를 낸다\" 구조 권장.)*"),
        ("📖", "**추출된 패턴:**\n> *(TODO)*\n\n**세부 내용:**\n- *(TODO)*"),
        ("💻", "**패턴 1:** *(TODO: 이 프로젝트 컨벤션 반영한 구조 스켈레톤)*\n\n```text\n# TODO\n```"),
        ("🤔", "**선택 A를 써야 할 때:**\n- *(TODO)*\n\n**선택 B를 써야 할 때:**\n- *(TODO)*\n\n**기본값:**\n> *(TODO)*"),
        ("❌", "- **[안티패턴]:** *(TODO: 무엇을 하면 안 되는가 + 이유 + 대신 무엇을)*"),
        ("🧪", validation),
        ("🧬", (
            "- **기존 유사 문서:** *(TODO: 인덱서 클러스터 리포트 참조)*\n"
            "- **처리 방식:** UPDATE (자동 정규화)\n"
            "- **처리 이유:** Phase 1 정규화 — 옛 템플릿/누락 필드 보강."
        )),
        ("⚠️", "- **과거 데이터와의 충돌:** 없음\n- **정책 변화:** 없음"),
        ("🔗", (
            "- **Parent:** [[10_Wiki/Topics]]\n"
            "- **Related:** *(TODO: 최소 2개)*\n"
            "- **Opposite / Trade-off:** *(TODO)*\n"
            "- **Raw Source:** 직접 입력"
        )),
        ("🕓", changelog),
    )
    for prefix, scaffold in scaffolds:
        if section.startswith(prefix):
            return scaffold() if callable(scaffold) else scaffold
    return "*(TODO)*"
|
||||
|
||||
|
||||
def normalize_file(file_path: Path) -> str:
    """Read one wiki document and return its normalized text.

    The file itself is not modified. Undecodable bytes are replaced rather
    than raising. Redirect documents get new frontmatter only; all other
    documents also get their headers renamed and missing sections scaffolded.
    """
    # "utf-8-sig" drops a leading BOM. With plain "utf-8" the BOM stays in the
    # text, FRONTMATTER_RE (anchored at `---`) no longer matches, and the old
    # frontmatter ends up inside the body under a second frontmatter block.
    text = file_path.read_text(encoding="utf-8-sig", errors="replace")
    fm_old, body, _raw = parse_frontmatter(text)
    fm_new = build_frontmatter(fm_old, file_path, body)
    header = render_frontmatter(fm_new)

    # Redirect documents stay minimal — don't add scaffold sections.
    if fm_new.get("redirect_to"):
        return header + "\n\n" + body.lstrip("\n")

    # Rename headers first so section detection sees canonical titles.
    renamed = normalize_headers(body)
    is_tech = detect_tech(file_path.parent.name, fm_new.get("tags", []), renamed)
    is_stub = len(renamed.strip()) < 200
    completed = append_missing_sections(renamed, is_tech, is_stub, fm_new)

    return header + "\n\n" + completed.lstrip("\n")
|
||||
|
||||
|
||||
def iter_knowledge_files() -> list[Path]:
    """Collect every ``.md`` file under TOPICS that counts as knowledge.

    Tool/VCS directories are pruned during the walk. A file is then dropped
    when its ROOT-relative, forward-slash path (with a leading "/") contains
    any of the operational path fragments below.
    """
    import os as _os

    excluded_fragments = (
        "/sessions/", "/_agents/", "/_company/", "/memory/",
        "/Project_Logs/", "/Harness_Research_", "/docs/records/",
        "/_Archive_Orphans/", "/Post_Drafts/", "/UX_Scenarios/",
    )
    pruned_dirs = {".obsidian", ".git", "__pycache__", "node_modules"}

    collected = []
    for dirpath, dirs, files in _os.walk(TOPICS):
        # In-place assignment so os.walk does not descend into pruned dirs.
        dirs[:] = [name for name in dirs if name not in pruned_dirs]
        for filename in files:
            if not filename.endswith(".md"):
                continue
            candidate = Path(dirpath) / filename
            rel = "/" + str(candidate.relative_to(ROOT)).replace("\\", "/")
            if not any(fragment in rel for fragment in excluded_fragments):
                collected.append(candidate)
    return collected
|
||||
|
||||
|
||||
def _resolve_input(raw_path: str) -> Path:
    """Return *raw_path* as a Path, resolved against ROOT when relative."""
    p = Path(raw_path)
    return p if p.is_absolute() else ROOT / p


def _display_path(p: Path) -> str:
    """Return *p* relative to ROOT with forward slashes.

    Falls back to the full path when *p* lies outside ROOT, where
    ``relative_to`` raises ValueError.
    """
    try:
        shown = p.relative_to(ROOT)
    except ValueError:
        shown = p
    return str(shown).replace("\\", "/")


def _apply_all() -> int:
    """Normalize every knowledge file in place; return 0 when none failed."""
    files = iter_knowledge_files()
    ok = err = 0
    errors: list[tuple[str, str]] = []
    for i, p in enumerate(files, 1):
        try:
            normalized = normalize_file(p)
            p.write_text(normalized, encoding="utf-8")
            ok += 1
        except Exception as e:
            # Batch boundary: one bad file must not abort the run. Failures
            # are collected and written to the log below.
            err += 1
            errors.append((str(p), str(e)))
        if i % 200 == 0:
            print(f" ...{i}/{len(files)}", file=sys.stderr)
    print(f"DONE: {ok} OK, {err} errors out of {len(files)} files", file=sys.stderr)
    if errors:
        log = ROOT / "_tools" / "normalize_errors.log"
        # The directory may not exist yet; without it the write would raise
        # after the whole run and the collected errors would be lost.
        log.parent.mkdir(parents=True, exist_ok=True)
        log.write_text("\n".join(f"{p}\t{e}" for p, e in errors), encoding="utf-8")
        print(f" errors written to {log}", file=sys.stderr)
    return 0 if err == 0 else 1


def _run_batch(listing_path: str) -> int:
    """Normalize each path listed in *listing_path* into the sample folder.

    Blank lines and lines starting with "#" are skipped. Sources are left
    untouched; results and a `_manifest.json` go under
    ``ROOT/_tools/sample_normalized``.
    """
    listing = Path(listing_path).read_text(encoding="utf-8").splitlines()
    out_dir = ROOT / "_tools" / "sample_normalized"
    out_dir.mkdir(parents=True, exist_ok=True)
    results = []
    for line in listing:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        p = _resolve_input(line)
        if not p.exists():
            print(f"SKIP missing: {p}", file=sys.stderr)
            continue
        try:
            normalized = normalize_file(p)
        except Exception as e:
            print(f"FAIL {p}: {e}", file=sys.stderr)
            continue
        out_path = out_dir / (p.stem + ".normalized.md")
        out_path.write_text(normalized, encoding="utf-8")
        results.append({
            # _display_path tolerates absolute inputs outside ROOT, which
            # previously raised ValueError and aborted the batch.
            "src": _display_path(p),
            "out": _display_path(out_path),
            "src_chars": len(p.read_text(encoding="utf-8", errors="replace")),
            "out_chars": len(normalized),
        })
    (out_dir / "_manifest.json").write_text(json.dumps(results, ensure_ascii=False, indent=1), encoding="utf-8")
    print(f"Wrote {len(results)} normalized samples to {out_dir}", file=sys.stderr)
    return 0


def main() -> int:
    """Command-line entry point.

    Modes, checked in this order: ``--apply-all`` (rewrite all knowledge
    files in place), ``--batch`` (write samples for a listing), or a single
    path (print to stdout, or write to ``--out``).

    Returns:
        Process exit code: 0 on success, 1 when ``--apply-all`` had failures,
        2 when the single input path does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("path", nargs="?", help="Relative path under E:/Wiki/2nd/ or absolute")
    parser.add_argument("--out", help="Write normalized output to PATH instead of stdout")
    parser.add_argument("--batch", help="Path to a text file with one input path per line; outputs go under _tools/sample_normalized/")
    parser.add_argument("--apply-all", action="store_true", help="Normalize ALL knowledge files in-place (operational paths excluded). DESTRUCTIVE.")
    args = parser.parse_args()

    if args.apply_all:
        return _apply_all()
    if args.batch:
        return _run_batch(args.batch)

    if not args.path:
        parser.error("path or --batch required")
    p = _resolve_input(args.path)
    if not p.exists():
        print(f"FAIL: {p} does not exist", file=sys.stderr)
        return 2
    out = normalize_file(p)
    if args.out:
        Path(args.out).write_text(out, encoding="utf-8")
        print(f"Wrote {args.out}", file=sys.stderr)
    else:
        sys.stdout.write(out)
    return 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user