# File: 2nd/scratch/find_duplicates.py (Python; listed as 69 lines, 2.3 KiB)

import os
import re
from collections import defaultdict
# Directory holding the Markdown topic notes that are scanned for duplicates.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# file names inside each report group stable from run to run.
files = sorted(f for f in os.listdir(topics_dir) if f.endswith(".md"))
# Maps a normalized alphanumeric core name -> file names that share it.
groups = defaultdict(list)
def extract_core_name(filename):
    """Return the lowercase ASCII-alphanumeric core of a topic file name.

    A trailing ``.md`` extension is dropped, then every character outside
    ``[a-zA-Z0-9]`` (Korean text, spaces, punctuation) is removed and the
    result is lowercased, e.g. ``"Clean-Code (클린 코드).md"`` -> ``"cleancode"``.

    Acronyms are not expanded: ``ddd`` and ``domaindrivendesign`` remain
    distinct, because matching them would need a lookup table. Grouping is
    by exact alphanumeric match only.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        The normalized name, or an empty string when no ASCII letter or
        digit remains (for example a purely Korean title).
    """
    name = filename
    # Strip the extension only when it is really present; a blind [:-3]
    # slice would eat three characters of a name without ".md".
    if name.endswith(".md"):
        name = name[:-3]
    # A single pass is enough: the negated class already removes Hangul
    # along with spaces and punctuation.
    return re.sub(r'[^a-zA-Z0-9]', '', name).lower()
# Bucket every topic file under its normalized alphanumeric core name.
for topic_file in files:
    core_key = extract_core_name(topic_file)
    if not core_key:
        # Nothing ASCII-alphanumeric is left (e.g. a purely Korean title),
        # so there is no key to group this file under.
        continue
    groups[core_key].append(topic_file)
# Second strategy: group file names that consist of exactly the same words,
# regardless of word order or separators.
def get_words(filename):
    """Return the set of lowercase ASCII words found in a topic file name.

    A trailing ``.md`` extension is dropped, Hangul syllables are deleted,
    the rest is lowercased, and every maximal run of ``[a-zA-Z0-9]`` becomes
    one word. Order and repetition are discarded, so ``"Clean Code.md"`` and
    ``"Code_Clean.md"`` produce the same set.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        A ``frozenset`` of words (hashable, so usable as a dict key); empty
        when the name holds no ASCII letters or digits.
    """
    name = filename
    # Strip the extension only when it is really present; a blind [:-3]
    # slice would eat three characters of a name without ".md".
    if name.endswith(".md"):
        name = name[:-3]
    # Hangul is deleted rather than treated as a separator, so ASCII runs on
    # either side of Korean text merge into one word ("DDD도메인Design" yields
    # "ddddesign").
    name = re.sub(r'[가-힣]', '', name)
    return frozenset(re.findall(r'[a-zA-Z0-9]+', name.lower()))
# Maps an unordered set of ASCII words -> file names made of exactly those words.
word_groups = defaultdict(list)
for topic_file in files:
    word_key = get_words(topic_file)
    if not word_key:
        # No ASCII words at all in this name; nothing to group it by.
        continue
    word_groups[word_key].append(topic_file)
# Write the Markdown report of duplicate candidates.
report_path = "/Volumes/Data/project/Antigravity/Wiki/scratch/duplicate_candidates.md"
# Create the output directory on first use; exist_ok keeps reruns quiet.
os.makedirs(os.path.dirname(report_path), exist_ok=True)

with open(report_path, "w", encoding="utf-8") as out:
    out.write("# Duplicate Candidates Report\n\n")

    # Section 1: files whose names collapse to the same alphanumeric core.
    # NOTE(review): the one-space indent before "- {f}" is kept as found; in
    # CommonMark it does not nest under the parent bullet - confirm whether
    # two spaces were intended.
    out.write("## Exact Alphanumeric Matches\n")
    for core, flist in sorted(groups.items()):
        if len(flist) > 1:
            out.write(f"- **{core}**\n")
            for f in flist:
                out.write(f" - {f}\n")
            # Blank line after each group.
            out.write("\n")

    # Section 2: files built from the same word set. Largest groups come
    # first; the sorted word list breaks ties so equal-sized groups appear in
    # the same order on every run.
    out.write("## Exact Word Set Matches\n")
    for words, flist in sorted(
        word_groups.items(),
        key=lambda item: (-len(item[1]), sorted(item[0])),
    ):
        if len(flist) > 1:
            # Skip groups already reported in section 1, i.e. those where
            # every file shares a single alphanumeric core.
            cores = {extract_core_name(f) for f in flist}
            if len(cores) > 1:
                # frozenset iteration order for strings varies between runs
                # (hash randomization), so sort before joining.
                out.write(f"- **{', '.join(sorted(words))}**\n")
                for f in flist:
                    out.write(f" - {f}\n")
                out.write("\n")

print(f"Report generated at {report_path}")