# 2nd/scratch/find_duplicates.py

import os
import re
from collections import defaultdict

# Directory that holds the wiki topic pages to scan.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so
# sort the names: everything derived from this list (group membership order,
# the generated report) then stays reproducible from run to run.
files = sorted(f for f in os.listdir(topics_dir) if f.endswith(".md"))

# Maps a normalized alphanumeric core name to the filenames that reduce to it.
groups = defaultdict(list)

def extract_core_name(filename: str) -> str:
    """Reduce a topic filename to a lowercase ASCII-alphanumeric key.

    A trailing ``.md`` extension (matched case-insensitively) is dropped, then
    every character other than ``a-z``, ``A-Z`` and ``0-9`` is removed and the
    remainder is lowercased.  Korean text, spaces and punctuation all vanish,
    so ``"Domain-Driven Design (도메인).md"`` and ``"DomainDrivenDesign.md"``
    yield the same key.

    Acronyms are not expanded: ``"DDD.md"`` does not match
    ``"Domain Driven Design.md"``; doing that would need a lookup table.

    Args:
        filename: Bare file name, normally ending in ``.md``.

    Returns:
        The normalized key, or an empty string when the name contains no
        ASCII letters or digits (e.g. a purely Korean title).
    """
    # Strip the extension only when it is really present; blindly slicing off
    # the last three characters would mangle a name such as "README".
    stem = filename[:-3] if filename[-3:].lower() == ".md" else filename
    # Removing every non-alphanumeric character already covers Hangul, so no
    # separate Korean-removal pass is required.
    return re.sub(r'[^a-zA-Z0-9]', '', stem).lower()

# Bucket every file under its core name.  A name that reduces to the empty
# string (for instance a purely Korean title) has no usable key and is skipped.
for filename in files:
    key = extract_core_name(filename)
    if not key:
        continue
    groups[key].append(filename)

# Also let's try to group things that contain the exact same words
def get_words(filename: str) -> frozenset:
    """Return the set of lowercase ASCII-alphanumeric words in a filename.

    A trailing ``.md`` extension (matched case-insensitively) is dropped,
    Hangul syllables are deleted, the rest is lowercased, and each maximal
    run of ``a-z`` / ``0-9`` becomes one word.  Word order and repetition are
    discarded, so ``"Design Driven Domain.md"`` and
    ``"Domain-Driven_Design.md"`` produce equal sets.

    Args:
        filename: Bare file name, normally ending in ``.md``.

    Returns:
        A frozenset of words (hashable, so usable as a dict key); empty when
        the name contains no ASCII letters or digits.
    """
    # Strip the extension only when it is really present; blindly slicing off
    # the last three characters would mangle a name such as "README".
    stem = filename[:-3] if filename[-3:].lower() == ".md" else filename
    # Hangul is deleted rather than treated as a separator, so ASCII runs on
    # either side of Korean text fuse into a single word ("A한B" -> "ab").
    stem = re.sub(r'[가-힣]', '', stem)
    return frozenset(re.findall(r'[a-zA-Z0-9]+', stem.lower()))

# Maps each distinct word set to the filenames that produce it.  Files with
# no ASCII words at all yield an empty set and are not recorded.
word_groups = defaultdict(list)
for filename in files:
    word_set = get_words(filename)
    if not word_set:
        continue
    word_groups[word_set].append(filename)

# Write the Markdown report.  Every level of the output is sorted explicitly
# so that two runs over the same directory produce byte-identical reports.
report_path = "/Volumes/Data/project/Antigravity/Wiki/scratch/duplicate_candidates.md"
os.makedirs(os.path.dirname(report_path), exist_ok=True)

with open(report_path, "w", encoding="utf-8") as out:
    out.write("# Duplicate Candidates Report\n\n")

    # Section 1: files whose names collapse to the same alphanumeric core.
    out.write("## Exact Alphanumeric Matches\n")
    for core, flist in sorted(groups.items()):
        if len(flist) > 1:
            out.write(f"- **{core}**\n")
            for f in sorted(flist):
                out.write(f"  - {f}\n")
            out.write("\n")

    # Section 2: files sharing the same word set, largest groups first.
    # Ties are broken by the alphabetized word list; relying on dict
    # insertion order would expose the arbitrary directory listing order.
    out.write("## Exact Word Set Matches\n")
    for words, flist in sorted(
        word_groups.items(),
        key=lambda item: (-len(item[1]), sorted(item[0])),
    ):
        if len(flist) > 1:
            # A group whose files all share one alphanumeric core was already
            # listed in section 1, so only report groups spanning several cores.
            cores = {extract_core_name(f) for f in flist}
            if len(cores) > 1:
                # Iteration order of a frozenset of strings changes between
                # interpreter runs (hash randomization); sort for a stable heading.
                out.write(f"- **{', '.join(sorted(words))}**\n")
                for f in sorted(flist):
                    out.write(f"  - {f}\n")
                out.write("\n")

print(f"Report generated at {report_path}")