"""Consolidate near-duplicate Markdown topic notes into one file per topic.

Files in ``topics_dir`` whose names reduce to the same ASCII "core" (or to
the same set of ASCII words) are treated as duplicates. Each duplicate group
is merged into a single canonical file with a fixed section layout, and the
remaining files of the group are deleted.
"""
import os
import re
import sys
from collections import defaultdict

topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

HANGUL_RE = re.compile(r'[가-힣]')

# Separator placed between bodies that were merged into the same section.
SECTION_SEPARATOR = "\n\n---\n\n"

# (section key, header keywords) pairs, checked in this order; the first
# pair with a keyword contained in the lower-cased header wins.
SECTION_KEYWORDS = (
    ('summary', ('summary', '통찰')),
    ('core', ('core', '구조화된', 'content')),
    ('tradeoffs', ('trade-off', 'tradeoff', '모순', 'caveat')),
    ('connections', ('connection', '연결')),
)

MERGED_KEYS = ('summary', 'core', 'tradeoffs', 'connections', 'other')


def strip_md_suffix(filename):
    """Return *filename* without a trailing ``.md`` (unchanged if absent)."""
    return filename[:-3] if filename.endswith('.md') else filename


def extract_core_name(filename):
    """Return the lower-cased ASCII letters and digits of the file stem.

    Every other character (Hangul, spaces, punctuation) is dropped, so
    ``"Foo_Bar 한글.md"`` becomes ``"foobar"``. The result may be empty.
    """
    return re.sub(r'[^a-zA-Z0-9]', '', strip_md_suffix(filename)).lower()


def get_words(filename):
    """Return the frozenset of lower-cased ASCII words in the file stem.

    Hangul syllables are removed *before* splitting, so ASCII runs separated
    only by Hangul fuse into one word; any other non-alphanumeric character
    acts as a word separator.
    """
    stem = HANGUL_RE.sub('', strip_md_suffix(filename))
    return frozenset(re.findall(r'[a-zA-Z0-9]+', stem.lower()))


def build_groups(files):
    """Partition *files* into disjoint groups of likely duplicates.

    Two files are linked when they share a non-empty core name or a
    non-empty word set. Links are closed transitively with a union-find, so
    a file never appears in more than one group.

    Returns a list of sets, each holding at least two filenames, ordered by
    the first appearance of a member in *files*.
    """
    parent = {}

    def find(name):
        parent.setdefault(name, name)
        while parent[name] != name:
            # Path halving keeps the trees shallow.
            parent[name] = parent[parent[name]]
            name = parent[name]
        return name

    def union(left, right):
        root_left, root_right = find(left), find(right)
        if root_left != root_right:
            parent[root_right] = root_left

    by_core = defaultdict(list)
    by_words = defaultdict(list)
    for name in files:
        core = extract_core_name(name)
        if core:
            by_core[core].append(name)
        words = get_words(name)
        if words:
            by_words[words].append(name)

    for buckets in (by_core, by_words):
        for bucket in buckets.values():
            for other in bucket[1:]:
                union(bucket[0], other)

    clusters = defaultdict(set)
    for name in parent:
        clusters[find(name)].add(name)
    return [members for members in clusters.values() if len(members) > 1]


def parse_markdown(content):
    """Split a Markdown note into front matter, title and classified sections.

    Returns a dict with keys ``yaml`` (the raw leading ``---`` block, or
    ``''``), ``title`` (text of the first ``# `` heading, or ``''``) and the
    lists ``summary``, ``core``, ``tradeoffs``, ``connections`` and ``other``.
    Each ``## `` section with a non-empty body is classified by keywords in
    its header; unmatched sections are kept whole (header included) under
    ``other``.

    Text that precedes the first ``## `` heading is not captured, apart from
    the title.
    """
    sections = {
        'yaml': '',
        'title': '',
        'summary': [],
        'core': [],
        'tradeoffs': [],
        'connections': [],
        'other': [],
    }

    yaml_match = re.search(r'^---\n(.*?)\n---\n', content, re.DOTALL)
    if yaml_match:
        sections['yaml'] = yaml_match.group(0)
        content = content[yaml_match.end():]

    # The gap after '#' must stay on the heading's own line; a plain \s+
    # would cross the newline and capture the following line as the title.
    title_match = re.search(r'^#[^\S\n]+(.*)', content, re.MULTILINE)
    if title_match:
        sections['title'] = title_match.group(1).strip()

    # The leading '\n' lets a '## ' heading on the very first line split too.
    parts = re.split(r'\n##\s+', '\n' + content)
    for part in parts[1:]:
        header_line, _, rest = part.partition('\n')
        header = header_line.lower()
        body = rest.strip()
        if not body:
            continue
        for key, keywords in SECTION_KEYWORDS:
            if any(keyword in header for keyword in keywords):
                sections[key].append(body)
                break
        else:
            sections['other'].append(f"## {part.strip()}")

    return sections


def score_filename(f):
    """Rank a filename for canonical selection; lower is better.

    The score is the name length, plus a penalty of 100 when the name
    contains Hangul syllables.
    """
    score = len(f)
    if HANGUL_RE.search(f):
        score += 100
    return score


def fallback_title(filename):
    """Derive a title from *filename*: drop ``.md``, turn ``_`` into spaces."""
    return strip_md_suffix(filename).replace('_', ' ')


def render_document(merged, last_updated="2026-05-02"):
    """Render the consolidated Markdown document for *merged*.

    *merged* holds ``title`` plus the section lists named in ``MERGED_KEYS``.
    Empty sections get a placeholder sentence; ``other`` sections are
    appended verbatim after the fixed layout.
    """
    def joined(key, placeholder):
        return SECTION_SEPARATOR.join(merged[key]) if merged[key] else placeholder

    title = merged['title']
    # NOTE(review): the title is written into the front matter unquoted; a
    # title containing ':' may not parse as YAML - confirm with consumers.
    document = (
        "---\n"
        "category: Unified\n"
        "tags: [auto-consolidated, technical-documentation]\n"
        f"title: {title}\n"
        f"last_updated: {last_updated}\n"
        "---\n"
        "\n"
        f"# {title}\n"
        "\n"
        "## 📌 Brief Summary\n"
        f"{joined('summary', 'No summary available.')}\n"
        "\n"
        "## 📖 Core Content\n"
        f"{joined('core', 'No core content available.')}\n"
        "\n"
        "## ⚖️ Trade-offs & Caveats\n"
        f"{joined('tradeoffs', 'No trade-offs available.')}\n"
        "\n"
        "## 🔗 Knowledge Connections\n"
        f"{joined('connections', 'No connections available.')}\n"
    )
    if merged['other']:
        document += "\n\n" + "\n\n".join(merged['other'])
    return document


def consolidate_group(group, directory):
    """Merge the files of *group* (inside *directory*) into one document.

    The canonical name is the lowest-scoring filename (ties broken by sort
    order) with spaces replaced by underscores. Files that no longer exist
    are skipped. After the merged document is written, every other file
    that was read is deleted; a failed deletion is reported on stderr and
    does not abort the run.

    Returns ``(canonical_filename, files_merged)``, or ``None`` when none of
    the files could be read.
    """
    ordered = sorted(group)
    canonical = min(ordered, key=score_filename)
    safe_canonical = canonical.replace(' ', '_')

    merged = {'title': ''}
    for key in MERGED_KEYS:
        merged[key] = []

    valid_group = []
    for name in ordered:
        try:
            with open(os.path.join(directory, name), 'r', encoding='utf-8') as handle:
                content = handle.read()
        except FileNotFoundError:
            continue
        parsed = parse_markdown(content)
        # The first non-empty title encountered (in sorted order) wins.
        if not merged['title'] and parsed['title']:
            merged['title'] = parsed['title']
        for key in MERGED_KEYS:
            merged[key].extend(parsed[key])
        valid_group.append(name)

    if not valid_group:
        return None

    if not merged['title']:
        merged['title'] = fallback_title(safe_canonical)

    with open(os.path.join(directory, safe_canonical), 'w', encoding='utf-8') as handle:
        handle.write(render_document(merged))

    for name in valid_group:
        if name == safe_canonical:
            continue
        try:
            os.remove(os.path.join(directory, name))
        except OSError as exc:
            # Best effort: keep going, but do not hide the leftover file.
            print(f"Warning: could not remove {name}: {exc}", file=sys.stderr)

    return safe_canonical, len(valid_group)


def main():
    """Consolidate every duplicate group found in ``topics_dir``."""
    files = [f for f in os.listdir(topics_dir) if f.endswith(".md")]

    count = 0
    for group in build_groups(files):
        result = consolidate_group(group, topics_dir)
        if result is None:
            continue
        target, merged_count = result
        count += 1
        print(f"Consolidated {merged_count} files into {target}")

    print(f"Successfully consolidated {count} groups.")


if __name__ == "__main__":
    main()