Wikify: Auto-consolidate redundant/similar knowledge base files

2026-05-02 23:59:27 +09:00
parent 9981d83a4d
commit 303b01b228
1369 changed files with 33533 additions and 33429 deletions
@@ -0,0 +1,68 @@
+import os
+import re
+from collections import defaultdict
+
+# Directory holding the wiki topic pages that are scanned for duplicates.
+topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"
+# os.listdir() returns entries in arbitrary order; sort them so the groups
+# built from this list (and the report) are stable from run to run.
+files = sorted(f for f in os.listdir(topics_dir) if f.endswith(".md"))
+
+# Maps an alphanumeric core name to the filenames that reduce to it.
+groups = defaultdict(list)
+
+def extract_core_name(filename):
+    """Reduce a topic filename to a lowercase ASCII-alphanumeric key.
+
+    Filenames that differ only in case, spacing, punctuation or non-ASCII
+    text (such as Korean) produce the same key, which is used to group
+    likely duplicates.
+
+    Args:
+        filename: Name of a topic file, normally ending in ".md".
+
+    Returns:
+        The ASCII letters and digits of the name, lowercased and
+        concatenated. Empty when the name contains none.
+    """
+    # Strip the extension only when it is really there, so a name without
+    # ".md" does not lose its last three characters.
+    name = filename[:-3] if filename.endswith(".md") else filename
+    # Keeping only ASCII letters and digits also removes Hangul, spaces and
+    # punctuation in a single pass.
+    name = re.sub(r'[^a-zA-Z0-9]', '', name).lower()
+
+    # Acronyms are not expanded (e.g. "ddd" vs "domaindrivendesign"); that
+    # would need a lookup table, so only exact alphanumeric matches group.
+    return name
+
+# Bucket every topic file under its alphanumeric core name.
+for filename in files:
+    core_name = extract_core_name(filename)
+    if not core_name:
+        # Nothing alphanumeric survived (e.g. a purely Korean title).
+        continue
+    groups[core_name].append(filename)
+
+# Also let's try to group things that contain the exact same words
+def get_words(filename):
+    """Return the set of lowercase ASCII words found in a topic filename.
+
+    Word order and repetition are ignored, so names built from the same
+    words in a different order yield equal sets.
+
+    Args:
+        filename: Name of a topic file, normally ending in ".md".
+
+    Returns:
+        A frozenset of lowercase runs of ASCII letters and digits. Empty
+        when the name contains none.
+    """
+    # Strip the extension only when it is really there, so a name without
+    # ".md" does not lose its last three characters.
+    name = filename[:-3] if filename.endswith(".md") else filename
+    # Hangul is deleted rather than treated as a separator, so ASCII runs on
+    # either side of it join into a single word.
+    name = re.sub(r'[가-힣]', '', name)
+    return frozenset(re.findall(r'[a-zA-Z0-9]+', name.lower()))
+
+# Bucket files by their unordered word set; names made of the same words in a
+# different order land in the same bucket.
+word_groups = defaultdict(list)
+for filename in files:
+    word_set = get_words(filename)
+    if not word_set:
+        # No ASCII words in this name; nothing to compare on.
+        continue
+    word_groups[word_set].append(filename)
+
+# Write report
+report_path = "/Volumes/Data/project/Antigravity/Wiki/scratch/duplicate_candidates.md"
+os.makedirs(os.path.dirname(report_path), exist_ok=True)
+
+with open(report_path, "w", encoding="utf-8") as out:
+    out.write("# Duplicate Candidates Report\n\n")
+
+    # Section 1: files whose names collapse to the same alphanumeric string.
+    out.write("## Exact Alphanumeric Matches\n")
+    for core, flist in sorted(groups.items()):
+        if len(flist) > 1:
+            out.write(f"- **{core}**\n")
+            # Sorted so the listing does not depend on directory order.
+            for f in sorted(flist):
+                out.write(f"  - {f}\n")
+            out.write("\n")
+
+    # Section 2: files that share the same set of words in any order.
+    # Largest groups come first; ties are broken by the sorted word list so
+    # the output is the same on every run.
+    out.write("## Exact Word Set Matches\n")
+    ranked = sorted(
+        word_groups.items(),
+        key=lambda item: (-len(item[1]), sorted(item[0])),
+    )
+    for words, flist in ranked:
+        if len(flist) > 1:
+            # Skip groups already reported in section 1, i.e. those whose
+            # files all share one alphanumeric core.
+            cores = {extract_core_name(f) for f in flist}
+            if len(cores) > 1:
+                # A frozenset iterates in hash order, which varies between
+                # interpreter runs; sort the words for a stable heading.
+                out.write(f"- **{', '.join(sorted(words))}**\n")
+                for f in sorted(flist):
+                    out.write(f"  - {f}\n")
+                out.write("\n")
+
+print(f"Report generated at {report_path}")