"""Batch-convert raw Markdown notes into categorized wiki pages.

Each ``.md`` file in the raw directory is assigned a category by keyword
matching, a few known sections are extracted from it, and a wiki page with
YAML front matter is written under the topics directory of the vault.
"""
import io
import os
import re
import shutil
import sys
import uuid
from datetime import datetime


def _ensure_utf8_stdout():
    """Switch ``sys.stdout`` to UTF-8 when it is using another encoding."""
    current = (getattr(sys.stdout, "encoding", None) or "").lower()
    # Normalize spellings such as "UTF-8" / "utf_8" before comparing.
    if current.replace("-", "").replace("_", "") == "utf8":
        return
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if callable(reconfigure):
        reconfigure(encoding="utf-8")
    elif hasattr(sys.stdout, "buffer"):
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")


# UTF-8 Output support
_ensure_utf8_stdout()

MD_SUFFIX = ".md"

# NOTE(review): the non-ASCII literals in this file (directory name, section
# markers, page headings, fallback texts) look mis-encoded. They are kept
# verbatim because they are runtime data (paths, regexes, page content);
# confirm against the files on disk before changing them.
TOPICS_DIR_NAME = "πŸ’‘ Topics"

base_dir = r"e:\Wiki\2nd"
raw_dir = os.path.join(base_dir, "00_Raw", "2026-04-20")
wiki_base = os.path.join(base_dir, "10_Wiki", TOPICS_DIR_NAME)

# Simple keyword-based categorizer. Dict order matters: the first category
# with a matching keyword wins.
CATEGORY_KEYWORDS = {
    "AI": ["AI", "Artificial Intelligence", "LLM", "GPT", "Neural", "Deep Learning", "Machine Learning", "Adversarial"],
    "Graphics & Performance": ["Graphics", "Rendering", "Three.js", "WebGL", "WebGPU", "Shader", "3D", "Gaussian Splatting", "Splat", "Frame"],
    "Design & Experience": ["UX", "UI", "Design", "Accessibility", "A11y", "Interface", "HCI", "Cognitive", "Gamification"],
    "Programming & Language": ["TypeScript", "JS", "C++", "Java", "Python", "Syntax", "AST", "Type", "Core", "Pattern", "Compiler"],
    "Software Architecture": ["Architecture", "Microservices", "DDD", "API", "Contract", "System Design", "Cloud", "Distributed"],
    "Psychology & Behavior": ["Psychology", "Behavior", "ABA", "Neuroscience", "Mind", "Cognitive", "Emotion", "Addiction"],
    "Game Design": ["Game", "Level", "Narrative", "Player", "Quest", "Mechanic", "Simulation"],
    "Health & Science": ["Health", "Medical", "Biomedical", "Biology", "Clinical", "Injury", "ACL", "Performance Optimization"],
    "Security": ["Security", "OWASP", "Encryption", "Auth", "Hack", "Attack", "Malware", "Privacy"],
}

# Section extractors, compiled once. Each captures the text after the heading
# up to the next line starting with "##" or the end of the document.
# NOTE(review): "\n##" also matches deeper headings ("###"), so a section is
# cut at its first sub-heading -- confirm that this is intended.
_SUMMARY_RE = re.compile(r'##?\s*πŸ“Œ\s*Brief Summary\n(.*?)(?=\n##|$)', re.S)
_CORE_RE = re.compile(r'##?\s*πŸ“–\s*Core Content\n(.*?)(?=\n##|$)', re.S)
_CONNECTIONS_RE = re.compile(r'##?\s*πŸ”—\s*Knowledge Connections\n(.*?)(?=\n##|$)', re.S)

# Everything except word characters, whitespace, (), [] and "-" is dropped
# from a title.
_UNSAFE_TITLE_CHARS = re.compile(r'[^\w\s\(\)\[\]-]')


def get_category(filename, content):
    """Return the first category whose keyword occurs in the name or content.

    Matching is a case-insensitive substring test against the file name and
    the first 500 characters of ``content``. Categories are tried in the
    order they appear in ``CATEGORY_KEYWORDS``.

    Args:
        filename: Name of the raw file.
        content: Full text of the raw file.

    Returns:
        The matching category name, or ``"General Knowledge"`` if no keyword
        matches.
    """
    # NOTE(review): substring matching means short keywords hit inside longer
    # words (e.g. "AI" matches "plain"); kept as-is to preserve classification.
    haystacks = (filename.lower(), content[:500].lower())
    for category, keywords in CATEGORY_KEYWORDS.items():
        for keyword in keywords:
            needle = keyword.lower()
            if any(needle in text for text in haystacks):
                return category
    return "General Knowledge"


def _strip_md_suffix(name):
    """Return ``name`` without a trailing ``.md``; other names are unchanged."""
    return name[:-len(MD_SUFFIX)] if name.endswith(MD_SUFFIX) else name


def _extract_section(pattern, content, fallback):
    """Return the stripped first capture of ``pattern`` in ``content``, else ``fallback``."""
    match = pattern.search(content)
    return match.group(1).strip() if match else fallback


def _collect_existing_titles(topics_root):
    """Return the extension-less names of all files below ``topics_root``."""
    return {
        _strip_md_suffix(name)
        for _root, _dirs, names in os.walk(topics_root)
        for name in names
    }


def _vault_relative(path, vault_dir):
    """Return ``path`` relative to ``vault_dir`` using forward slashes."""
    try:
        relative = os.path.relpath(path, vault_dir)
    except ValueError:
        # os.path.relpath raises ValueError on Windows when the two paths are
        # on different drives; fall back to the path as given.
        relative = path
    return relative.replace(os.sep, "/")


def _render_wiki_page(doc_id, category, category_path, today, safe_title,
                      summary, core, conn, raw_link):
    """Return the full text of a wiki page (front matter plus body)."""
    return f"""---
id: {doc_id}
category: "[[{category_path}]]"
confidence_score: 0.90
tags: [auto-reinforced]
last_reinforced: {today}
github_commit: "[P-Reinforce] Continuous Worker - {safe_title}"
---

# [[{safe_title}]]

## πŸ“Œ ν•œ 쀄 톡찰 (The Karpathy Summary)
> {summary}

## πŸ“– κ΅¬μ‘°ν™”λœ 지식 (Synthesized Content)
{core}

## ⚠️ λͺ¨μˆœ 및 μ—…λ°μ΄νŠΈ (Contradictions & RL Update)
- **κ³Όκ±° λ°μ΄ν„°μ™€μ˜ 좩돌:** μžλ™ν™” 엔진에 μ˜ν•΄ λ§€ν•‘λœ μ§€μ‹μœΌλ‘œ, μΆ”ν›„ μ •λ°€ 검증 ν•„μš”.
- **μ •μ±… λ³€ν™”:** {category} λΆ„μ•Όμ˜ μžλ™ μžμ‚°ν™” μˆ˜ν–‰.

## πŸ”— 지식 μ—°κ²° (Graph)
{conn}
- Raw Source: [[{raw_link}]]

---
"""


def process_batch(limit=200, *, source_dir=None, vault_dir=None):
    """Create wiki pages for raw ``.md`` files that have no page yet.

    Files are handled in sorted name order. A file is skipped when its
    sanitized title is empty, when a file with that title already exists
    anywhere below the topics directory (or was written earlier in this
    run), or when it cannot be read.

    Args:
        limit: Maximum number of pages to write in this call.
        source_dir: Directory holding the raw files. Defaults to the
            module-level ``raw_dir``.
        vault_dir: Vault root under which ``10_Wiki/<topics>/<category>`` is
            created. Defaults to the module-level ``base_dir`` (with
            ``wiki_base`` used for the existing-title scan).

    Returns:
        The number of pages written.

    Raises:
        OSError: If ``source_dir`` cannot be listed or a page cannot be
            written.
    """
    if vault_dir is None:
        vault_dir = base_dir
        topics_root = wiki_base
    else:
        topics_root = os.path.join(vault_dir, "10_Wiki", TOPICS_DIR_NAME)
    if source_dir is None:
        source_dir = raw_dir

    # With the default directories this is "00_Raw/2026-04-20".
    raw_link_dir = _vault_relative(source_dir, vault_dir)

    # Get existing wiki titles to skip
    known_titles = _collect_existing_titles(topics_root)

    processed_count = 0
    # Sorted so that the files chosen under ``limit`` are reproducible.
    for filename in sorted(os.listdir(source_dir)):
        if processed_count >= limit:
            break
        if not filename.endswith(MD_SUFFIX):
            continue

        title_raw = _strip_md_suffix(filename)
        safe_title = _UNSAFE_TITLE_CHARS.sub('', title_raw).strip()
        # An empty title would produce a page literally named ".md".
        if not safe_title or safe_title in known_titles:
            continue

        raw_path = os.path.join(source_dir, filename)
        try:
            # errors="ignore" drops undecodable bytes, so only OS-level
            # failures can occur here.
            with open(raw_path, "r", encoding="utf-8", errors="ignore") as handle:
                content = handle.read()
        except OSError as err:
            print(f"Skipped (unreadable): {filename}: {err}", file=sys.stderr)
            continue

        category = get_category(filename, content)
        category_path = f"10_Wiki/{TOPICS_DIR_NAME}/{category}"

        # Parse basic info
        summary = _extract_section(_SUMMARY_RE, content, "지식 μš”μ•½ 정보 μΆ”μΆœ 쀑...")
        core = _extract_section(_CORE_RE, content, "λ³Έλ¬Έ ꡬ쑰화 μž‘μ—… 쀑...")
        conn = _extract_section(_CONNECTIONS_RE, content, "")

        doc_id = f"P-REINFORCE-AUTO-{uuid.uuid4().hex[:6].upper()}"
        today = datetime.now().strftime("%Y-%m-%d")

        wiki_content = _render_wiki_page(
            doc_id, category, category_path, today, safe_title,
            summary, core, conn, f"{raw_link_dir}/{filename}",
        )

        target_dir = os.path.join(vault_dir, "10_Wiki", TOPICS_DIR_NAME, category)
        os.makedirs(target_dir, exist_ok=True)

        target_path = os.path.join(target_dir, f"{safe_title}.md")
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(wiki_content)

        # Remember the title so that a later raw file that sanitizes to the
        # same title does not overwrite this page within the same run.
        known_titles.add(safe_title)

        print(f"[{processed_count+1}] Processed: {safe_title}")
        processed_count += 1

    return processed_count


if __name__ == "__main__":
    count = process_batch(2000)  # Process ALL remaining files
    print(f"Total processed in this session: {count}")