"""Report wiki topic files whose names look like duplicates of each other.

Two heuristics are applied to the ``.md`` filenames found in ``topics_dir``:

* exact alphanumeric match -- names that are identical once everything except
  ASCII letters and digits is removed and the rest is lowercased;
* exact word-set match -- names made of the same set of ASCII words,
  regardless of word order or repetition.

The result is written as a Markdown report to ``report_path``.
"""
import os
import re
from collections import defaultdict

# Kept as lowercase module-level names so existing references keep working.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"
report_path = "/Volumes/Data/project/Antigravity/Wiki/scratch/duplicate_candidates.md"

_MD_SUFFIX = ".md"
_HANGUL_RE = re.compile(r'[가-힣]')
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9]')
_ALNUM_RUN_RE = re.compile(r'[a-zA-Z0-9]+')

# A child bullet must be indented to the parent's content column ("- " is two
# characters wide); with less indentation Markdown renders it as a sibling.
_CHILD_BULLET = "  - "


def _strip_md_suffix(filename: str) -> str:
    """Return *filename* without a trailing ``.md``, if it has one."""
    if filename.endswith(_MD_SUFFIX):
        return filename[:-len(_MD_SUFFIX)]
    return filename


def extract_core_name(filename: str) -> str:
    """Return the lowercase ASCII-alphanumeric core of a topic filename.

    The ``.md`` suffix is dropped, then every character that is not an ASCII
    letter or digit (Korean text, spaces, punctuation) is removed and the
    remainder is lowercased. A name with no ASCII alphanumerics yields ``""``.

    NOTE: acronyms are not matched to their expansions (``ddd`` is not
    ``domaindrivendesign``); that would need a lookup dictionary.
    """
    name = _strip_md_suffix(filename)
    # Hangul is not ASCII-alphanumeric, so this one filter removes it too.
    return _NON_ALNUM_RE.sub('', name).lower()


def get_words(filename: str) -> frozenset:
    """Return the set of lowercase ASCII words in a topic filename.

    Hangul syllables are deleted *before* splitting, so ASCII fragments that
    were separated only by Korean text merge into one word.
    """
    name = _HANGUL_RE.sub('', _strip_md_suffix(filename))
    return frozenset(_ALNUM_RUN_RE.findall(name.lower()))


def _group_by(filenames, key_func):
    """Group *filenames* by ``key_func(name)``, skipping names with an empty key."""
    grouped = defaultdict(list)
    for name in filenames:
        key = key_func(name)
        if key:  # e.g. a purely Korean filename has nothing to compare
            grouped[key].append(name)
    return grouped


def _format_group(label, filenames):
    """Return the report lines for one group, followed by a blank line."""
    lines = [f"- **{label}**\n"]
    lines.extend(f"{_CHILD_BULLET}{name}\n" for name in filenames)
    lines.append("\n")
    return lines


def build_report(files) -> str:
    """Return the Markdown duplicate-candidates report for *files*.

    Args:
        files: iterable of topic filenames (normally ending in ``.md``).

    Returns:
        The full report text. The output depends only on the set of names,
        not on the order in which they are supplied.
    """
    # Directory listing order is arbitrary; sort once so the report is stable.
    ordered = sorted(files)
    core_groups = _group_by(ordered, extract_core_name)
    word_groups = _group_by(ordered, get_words)

    lines = ["# Duplicate Candidates Report\n\n", "## Exact Alphanumeric Matches\n"]
    for core, members in sorted(core_groups.items()):
        if len(members) > 1:
            lines.extend(_format_group(core, members))

    lines.append("## Exact Word Set Matches\n")
    # Largest groups first; the stable sort keeps ties in filename order.
    by_size = sorted(word_groups.items(), key=lambda item: len(item[1]), reverse=True)
    for words, members in by_size:
        if len(members) < 2:
            continue
        # A group whose files all share one alphanumeric core is already
        # listed in the previous section, so only mixed-core groups are shown.
        if len({extract_core_name(name) for name in members}) > 1:
            # A frozenset has no defined order; sort for a reproducible label.
            lines.extend(_format_group(', '.join(sorted(words)), members))

    return "".join(lines)


def main() -> None:
    """Scan ``topics_dir`` and write the report to ``report_path``."""
    files = [f for f in os.listdir(topics_dir) if f.endswith(_MD_SUFFIX)]
    report = build_report(files)

    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    with open(report_path, "w", encoding="utf-8") as out:
        out.write(report)

    print(f"Report generated at {report_path}")


if __name__ == "__main__":
    main()