Wikify: Auto-consolidate redundant/similar knowledge base files

2026-05-02 23:59:27 +09:00
parent 9981d83a4d
commit 303b01b228
1369 changed files with 33533 additions and 33429 deletions
@@ -0,0 +1,194 @@
+import os
+import re
+from collections import defaultdict
+
+# Directory that holds the wiki topic pages to scan for duplicates.
+topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"
+
+# Only markdown pages sitting directly in that directory are considered.
+directory_entries = os.listdir(topics_dir)
+files = [entry for entry in directory_entries if entry.endswith(".md")]
+
+# Maps an alphanumeric core name to every filename that reduces to it.
+groups = defaultdict(list)
+
+def extract_core_name(filename):
+    """Reduce a ``.md`` filename to its lowercase ASCII alphanumeric core.
+
+    The last three characters (the ``.md`` suffix) are cut off, Hangul
+    syllables are removed, and then everything that is not an ASCII letter
+    or digit is removed. The result may be an empty string.
+    """
+    stem = filename[:-3]
+    without_hangul = re.sub(r'[가-힣]', '', stem)
+    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', without_hangul)
+    return alphanumeric_only.lower()
+
+# Bucket every file under its core name; names that reduce to an empty
+# core are left out.
+for f in files:
+    core = extract_core_name(f)
+    if not core:
+        continue
+    groups[core].append(f)
+
+def get_words(filename):
+    """Return the set of lowercase ASCII alphanumeric words in a filename.
+
+    The last three characters (the ``.md`` suffix) are cut off and Hangul
+    syllables are removed before the remaining text is lowercased and split
+    into runs of ASCII letters and digits. Word order and repetition are
+    discarded because the result is a ``frozenset``.
+    """
+    stem = re.sub(r'[가-힣]', '', filename[:-3])
+    lowered = stem.lower()
+    return frozenset(re.findall(r'[a-zA-Z0-9]+', lowered))
+
+# Second grouping pass: bucket files by the *set* of ASCII words in their
+# names, so names that differ only in word order or separators end up together.
+word_groups = defaultdict(list)
+for f in files:
+    words = get_words(f)
+    if words:
+        word_groups[words].append(f)
+
+final_groups = []
+used_files = set()
+
+# Seed the result with every core-name bucket that holds more than one file.
+# A file has exactly one core name, so these seed groups never overlap.
+for core, flist in groups.items():
+    if len(flist) > 1:
+        group = set(flist)
+        final_groups.append(group)
+        used_files.update(group)
+
+# Fold the word-set buckets in. A single bucket can touch several existing
+# groups (e.g. "a b.md" and "b a.md" have different cores but the same word
+# set), so every overlapping group is merged into one. Stopping at the first
+# overlap would leave the same file in two groups.
+for words, flist in word_groups.items():
+    if len(flist) > 1:
+        group = set(flist)
+        overlapping = [g for g in final_groups if g & group]
+        if overlapping:
+            # Grow the earliest overlapping group in place so the order of
+            # the remaining groups is unchanged.
+            target = overlapping[0]
+            target.update(group)
+            absorbed_groups = overlapping[1:]
+            for absorbed in absorbed_groups:
+                target.update(absorbed)
+            # Drop the absorbed groups by identity, not by equality.
+            final_groups = [
+                g for g in final_groups
+                if not any(g is absorbed for absorbed in absorbed_groups)
+            ]
+        else:
+            # Nothing overlaps, so none of these files has been grouped yet.
+            final_groups.append(group)
+        used_files.update(group)
+
+def parse_markdown(content):
+    """Split a wiki page into front matter, title and categorised sections.
+
+    Args:
+        content: Full text of one markdown page.
+
+    Returns:
+        A dict with these keys:
+          - 'yaml': the leading ``---`` front-matter block including its
+            delimiters, or '' when the page does not start with one.
+          - 'title': text of the first ``# `` heading, or '' if there is none.
+          - 'summary', 'core', 'tradeoffs', 'connections': lists of section
+            bodies whose ``## `` heading contains a matching keyword.
+          - 'other': unrecognised sections, each kept with its ``## `` heading.
+
+    Sections with an empty body are dropped, and text located before the
+    first ``## `` heading (apart from the title) is not captured.
+    """
+    sections = {
+        'yaml': '',
+        'title': '',
+        'summary': [],
+        'core': [],
+        'tradeoffs': [],
+        'connections': [],
+        'other': []
+    }
+
+    # Heading keywords per bucket, checked in this order; first match wins.
+    section_keywords = (
+        ('summary', ('summary', '통찰')),
+        ('core', ('core', '구조화된', 'content')),
+        ('tradeoffs', ('trade-off', 'tradeoff', '모순', 'caveat')),
+        ('connections', ('connection', '연결')),
+    )
+
+    # Normalise line endings so the "\n"-anchored patterns below also work on
+    # CRLF files and no stray "\r" ends up in the title or in headings.
+    content = content.replace('\r\n', '\n').replace('\r', '\n')
+
+    # Without re.MULTILINE, "^" only matches at the very start of the text.
+    yaml_match = re.search(r'^---\n(.*?)\n---\n', content, re.DOTALL)
+    if yaml_match:
+        sections['yaml'] = yaml_match.group(0)
+        content = content[yaml_match.end():]
+
+    # Use [ \t]+ instead of \s+ so the match cannot run past the end of the
+    # heading line and pick up the following line as the title.
+    title_match = re.search(r'^#[ \t]+(.*)', content, re.MULTILINE)
+    if title_match:
+        sections['title'] = title_match.group(1).strip()
+
+    # The leading "\n" lets a "## " heading on the very first line split too.
+    parts = re.split(r'\n##\s+', '\n' + content)
+
+    # parts[0] is whatever precedes the first "## " heading; it is skipped.
+    for part in parts[1:]:
+        header_end = part.find('\n')
+        if header_end == -1:
+            header_end = len(part)
+        header = part[:header_end].lower()
+        body = part[header_end:].strip()
+
+        if not body:
+            continue
+
+        bucket = next(
+            (
+                key for key, keywords in section_keywords
+                if any(keyword in header for keyword in keywords)
+            ),
+            None,
+        )
+        if bucket is None:
+            sections['other'].append(f"## {part.strip()}")
+        else:
+            sections[bucket].append(body)
+
+    return sections
+
+def score_filename(f):
+    """Rank a filename as a canonical-name candidate; the lowest score wins.
+
+    A name containing Hangul syllables is penalised by 100 and every
+    character adds one, so the shortest Hangul-free name is preferred.
+    """
+    score = 0
+    if re.search(r'[가-힣]', f):
+        score += 100
+    score += len(f)
+    return score
+
+
+def _unique(items):
+    """Return *items* without exact duplicates, keeping first-seen order."""
+    return list(dict.fromkeys(items))
+
+
+# Merge every group of duplicate pages into one page and delete the sources.
+count = 0
+for group in final_groups:
+    # Sorting makes the canonical choice deterministic when scores tie.
+    group = sorted(group)
+
+    canonical = min(group, key=score_filename)
+    # The merged page is written under the canonical name with spaces
+    # replaced by underscores.
+    safe_canonical = canonical.replace(' ', '_')
+
+    merged = {
+        'title': '',
+        'summary': [],
+        'core': [],
+        'tradeoffs': [],
+        'connections': [],
+        'other': []
+    }
+
+    valid_group = []
+
+    for f in group:
+        try:
+            with open(os.path.join(topics_dir, f), 'r', encoding='utf-8') as file:
+                content = file.read()
+        except FileNotFoundError:
+            # The file is no longer on disk; skip it.
+            continue
+
+        parsed = parse_markdown(content)
+
+        # The first page (in sorted order) that has a title supplies it.
+        if not merged['title'] and parsed['title']:
+            merged['title'] = parsed['title']
+
+        for key in ('summary', 'core', 'tradeoffs', 'connections', 'other'):
+            merged[key].extend(parsed[key])
+
+        valid_group.append(f)
+
+    # With fewer than two readable pages there is nothing to consolidate.
+    # Rewriting a lone page through the template would only discard its
+    # front matter, so leave it untouched.
+    if len(valid_group) < 2:
+        continue
+
+    if not merged['title']:
+        # Fall back to the file name without its extension.
+        merged['title'] = os.path.splitext(safe_canonical)[0].replace('_', ' ')
+
+    # Duplicate pages often carry identical section bodies; keep one copy.
+    for key in ('summary', 'core', 'tradeoffs', 'connections', 'other'):
+        merged[key] = _unique(merged[key])
+
+    summary_text = "\n\n---\n\n".join(merged['summary']) if merged['summary'] else 'No summary available.'
+    core_text = "\n\n---\n\n".join(merged['core']) if merged['core'] else 'No core content available.'
+    tradeoffs_text = "\n\n---\n\n".join(merged['tradeoffs']) if merged['tradeoffs'] else 'No trade-offs available.'
+    connections_text = "\n\n---\n\n".join(merged['connections']) if merged['connections'] else 'No connections available.'
+
+    # NOTE(review): the title is written into the front matter unquoted; a
+    # title containing ": " may not be valid YAML - confirm whether consumers
+    # of these pages parse the front matter.
+    final_content = f"""---
+category: Unified
+tags: [auto-consolidated, technical-documentation]
+title: {merged['title']}
+last_updated: 2026-05-02
+---
+
+# {merged['title']}
+
+## 📌 Brief Summary
+{summary_text}
+
+## 📖 Core Content
+{core_text}
+
+## ⚖️ Trade-offs & Caveats
+{tradeoffs_text}
+
+## 🔗 Knowledge Connections
+{connections_text}
+"""
+    if merged['other']:
+        final_content += "\n\n" + "\n\n".join(merged['other'])
+
+    # Write the merged page first so a failed write never follows a delete.
+    with open(os.path.join(topics_dir, safe_canonical), 'w', encoding='utf-8') as file:
+        file.write(final_content)
+
+    for f in valid_group:
+        if f != safe_canonical:
+            try:
+                os.remove(os.path.join(topics_dir, f))
+            except OSError as exc:
+                # Deletion stays best-effort, but a leftover source page is
+                # reported instead of being silently ignored.
+                print(f"Warning: could not remove {f}: {exc}")
+
+    count += 1
+    print(f"Consolidated {len(valid_group)} files into {safe_canonical}")
+
+print(f"Successfully consolidated {count} groups.")
@@ -0,0 +1,68 @@
+import os
+import re
+from collections import defaultdict
+
+# Directory that holds the wiki topic pages to scan for duplicates.
+topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"
+
+# Only markdown pages sitting directly in that directory are considered.
+directory_entries = os.listdir(topics_dir)
+files = [entry for entry in directory_entries if entry.endswith(".md")]
+
+# Maps an alphanumeric core name to every filename that reduces to it.
+groups = defaultdict(list)
+
+def extract_core_name(filename):
+    """Reduce a ``.md`` filename to its lowercase ASCII alphanumeric core.
+
+    The last three characters (the ``.md`` suffix) are cut off, Hangul
+    syllables are removed, and then everything that is not an ASCII letter
+    or digit is removed. The result may be an empty string.
+
+    Acronyms are not expanded: "ddd" and "domaindrivendesign" stay distinct
+    cores, since matching them would need a lookup table.
+    """
+    stem = filename[:-3]
+    without_hangul = re.sub(r'[가-힣]', '', stem)
+    alphanumeric_only = re.sub(r'[^a-zA-Z0-9]', '', without_hangul)
+    return alphanumeric_only.lower()
+
+# Bucket every file under its core name. A name that reduces to an empty
+# core (for example a purely Korean file name) is left out.
+for f in files:
+    core = extract_core_name(f)
+    if not core:
+        continue
+    groups[core].append(f)
+
+# Also let's try to group things that contain the exact same words
+def get_words(filename):
+    """Return the set of lowercase ASCII alphanumeric words in a filename.
+
+    The last three characters (the ``.md`` suffix) are cut off and Hangul
+    syllables are removed before the remaining text is lowercased and split
+    into runs of ASCII letters and digits. Word order and repetition are
+    discarded because the result is a ``frozenset``.
+    """
+    stem = re.sub(r'[가-힣]', '', filename[:-3])
+    lowered = stem.lower()
+    return frozenset(re.findall(r'[a-zA-Z0-9]+', lowered))
+
+# Maps each word set to the files whose names produce exactly that set.
+# Files whose names yield no ASCII words are left out.
+word_groups = defaultdict(list)
+for f in files:
+    words = get_words(f)
+    if not words:
+        continue
+    word_groups[words].append(f)
+
+# Write a markdown report listing the duplicate candidates found above.
+report_path = "/Volumes/Data/project/Antigravity/Wiki/scratch/duplicate_candidates.md"
+os.makedirs(os.path.dirname(report_path), exist_ok=True)
+
+with open(report_path, "w", encoding="utf-8") as out:
+    out.write("# Duplicate Candidates Report\n\n")
+
+    # Section 1: files whose names reduce to the same alphanumeric core,
+    # listed in alphabetical order of the core.
+    out.write("## Exact Alphanumeric Matches\n")
+    for core, flist in sorted(groups.items()):
+        if len(flist) > 1:
+            out.write(f"- **{core}**\n")
+            for f in flist:
+                out.write(f"  - {f}\n")
+            out.write("\n")
+
+    # Section 2: files sharing the same word set, largest buckets first.
+    out.write("## Exact Word Set Matches\n")
+    for words, flist in sorted(word_groups.items(), key=lambda x: len(x[1]), reverse=True):
+        if len(flist) > 1:
+            # Skip buckets already reported above: if all files share one
+            # alphanumeric core, section 1 has listed them.
+            cores = {extract_core_name(f) for f in flist}
+            if len(cores) > 1:
+                # A frozenset of strings has no stable iteration order from
+                # one run to the next, so sort the words to keep the label
+                # reproducible.
+                out.write(f"- **{', '.join(sorted(words))}**\n")
+                for f in flist:
+                    out.write(f"  - {f}\n")
+                out.write("\n")
+
+print(f"Report generated at {report_path}")