Wikify: Auto-consolidate redundant/similar knowledge base files
This commit is contained in:
@@ -0,0 +1,68 @@
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
# Directory holding the wiki topic notes that are scanned for duplicates.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# os.listdir() returns entries in arbitrary order; sort them so that group
# membership order (and therefore the generated report) is reproducible.
files = sorted(f for f in os.listdir(topics_dir) if f.endswith(".md"))

# Maps a normalized alphanumeric key to the list of filenames sharing it.
groups = defaultdict(list)
|
||||
|
||||
def extract_core_name(filename):
    """Return the lowercase ASCII-alphanumeric core of a topic filename.

    A trailing ``.md`` is removed when present, then every character that is
    not an ASCII letter or digit (spaces, punctuation, Korean text, ...) is
    dropped and the remainder is lowercased.

    Args:
        filename: File name of a topic note, normally ending in ``.md``.

    Returns:
        The normalized key, or an empty string when nothing ASCII-alphanumeric
        remains (for example a purely Korean title).
    """
    # Strip the extension only if it is really there; a blind ``[:-3]`` slice
    # would eat three characters of a name that has no ``.md`` suffix.
    name = filename[:-3] if filename.endswith(".md") else filename

    # One pass is enough: Hangul syllables are non-alphanumeric here too, so
    # they are removed together with spaces and special characters.
    name = re.sub(r'[^a-zA-Z0-9]', '', name).lower()

    # NOTE: acronyms are not matched to their full names
    # (e.g. domaindrivendesign vs. ddd); that would need a lookup dict.
    # Grouping is by exact alphanumeric match only.
    return name
|
||||
|
||||
# Bucket every topic file under its normalized alphanumeric key.
for f in files:
    core = extract_core_name(f)
    if not core:
        # Nothing alphanumeric survived (e.g. a purely Korean file name).
        continue
    groups[core].append(f)
|
||||
|
||||
# Also let's try to group things that contain the exact same words
|
||||
def get_words(filename):
    """Return the set of lowercase ASCII-alphanumeric words in a filename.

    A trailing ``.md`` is removed when present and Hangul syllables are
    deleted (not treated as separators, so text on either side of them is
    joined). The remaining name is lowercased and split into maximal runs of
    ASCII letters and digits.

    Args:
        filename: File name of a topic note, normally ending in ``.md``.

    Returns:
        A frozenset of the words found; empty when the name has none.
    """
    # Strip the extension only if it is really there; a blind ``[:-3]`` slice
    # would eat three characters of a name that has no ``.md`` suffix.
    name = filename[:-3] if filename.endswith(".md") else filename
    name = re.sub(r'[가-힣]', '', name)
    words = re.findall(r'[a-zA-Z0-9]+', name.lower())
    # A frozenset ignores word order and repetition and is hashable, so it
    # can be used directly as a dictionary key.
    return frozenset(words)
|
||||
|
||||
# Maps a set of words to every filename made up of exactly those words.
word_groups = defaultdict(list)
for f in files:
    words = get_words(f)
    if not words:
        # No alphanumeric words in this name; nothing to group on.
        continue
    word_groups[words].append(f)
|
||||
|
||||
# Write the duplicate-candidate report as a Markdown file.
report_path = "/Volumes/Data/project/Antigravity/Wiki/scratch/duplicate_candidates.md"
os.makedirs(os.path.dirname(report_path), exist_ok=True)

with open(report_path, "w", encoding="utf-8") as out:
    out.write("# Duplicate Candidates Report\n\n")

    # Section 1: files whose names collapse to the same alphanumeric key.
    out.write("## Exact Alphanumeric Matches\n")
    for core, flist in sorted(groups.items()):
        if len(flist) > 1:
            out.write(f"- **{core}**\n")
            # Sorted so the listing does not depend on directory order.
            for f in sorted(flist):
                out.write(f" - {f}\n")
            out.write("\n")

    # Section 2: files built from the same words in any order. Largest
    # groups come first; ties are broken by the sorted word list so the
    # order does not depend on dict insertion order.
    out.write("## Exact Word Set Matches\n")
    for words, flist in sorted(
        word_groups.items(),
        key=lambda x: (-len(x[1]), sorted(x[0])),
    ):
        if len(flist) > 1:
            # Skip groups already reported above, i.e. those where every
            # file shares one alphanumeric core.
            cores = {extract_core_name(f) for f in flist}
            if len(cores) > 1:
                # Iterating a frozenset of str follows hash order, which
                # changes between interpreter runs; sort for stable output.
                out.write(f"- **{', '.join(sorted(words))}**\n")
                for f in sorted(flist):
                    out.write(f" - {f}\n")
                out.write("\n")

print(f"Report generated at {report_path}")
|
||||
Reference in New Issue
Block a user