69 lines
2.3 KiB
Python
69 lines
2.3 KiB
Python
import os
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
# Directory whose ".md" files are scanned for duplicate candidates.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# os.listdir() yields entries in arbitrary, filesystem-dependent order; sort
# them so every list built from `files` comes out in a stable order.
files = sorted(f for f in os.listdir(topics_dir) if f.endswith(".md"))

# Buckets of filenames keyed by a normalized string; populated further down.
groups = defaultdict(list)
|
|
|
|
def extract_core_name(filename):
    """Reduce a filename to its lowercase ASCII-alphanumeric core.

    A trailing ``.md`` extension is dropped, then every character that is not
    an ASCII letter or digit (Korean text, spaces, punctuation) is removed and
    the remainder is lowercased. ``"Domain-Driven Design.md"`` and
    ``"domain driven design.md"`` therefore both yield
    ``"domaindrivendesign"``.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        The normalized name, or an empty string when no ASCII letter or digit
        is left (for example a purely Korean title).
    """
    # Strip the extension only when it is really there; blindly slicing off
    # three characters would eat part of a name that has no ".md" suffix.
    name = filename[:-3] if filename.endswith(".md") else filename

    # A single pass is enough: the negated class already discards Hangul
    # syllables together with whitespace and punctuation.
    # NOTE: acronyms are not expanded ("ddd" != "domaindrivendesign"); that
    # would need a lookup table, so only exact alphanumeric matches collide.
    return re.sub(r'[^a-zA-Z0-9]', '', name).lower()
|
|
|
|
# Bucket every file under its normalized alphanumeric core name.
for topic_file in files:
    core_name = extract_core_name(topic_file)
    if not core_name:
        # Nothing alphanumeric survived (e.g. the file name was purely Korean).
        continue
    groups[core_name].append(topic_file)
|
|
|
|
# Also let's try to group things that contain the exact same words
|
|
def get_words(filename):
    """Return the set of lowercase ASCII words contained in a filename.

    A trailing ``.md`` extension is dropped, Hangul syllables are deleted, the
    rest is lowercased and split into maximal runs of ASCII letters/digits.
    Word order and repetition are ignored, so ``"Design Patterns.md"`` and
    ``"patterns-design.md"`` produce the same set.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        A ``frozenset`` of words (hashable, so usable as a dict key); empty
        when the name contains no ASCII letters or digits.
    """
    # Strip the extension only when it is really there; blindly slicing off
    # three characters would eat part of a name that has no ".md" suffix.
    name = filename[:-3] if filename.endswith(".md") else filename

    # Hangul is deleted rather than treated as a separator, so ASCII runs on
    # either side of Korean text merge into a single word.
    name = re.sub(r'[가-힣]', '', name)

    return frozenset(re.findall(r'[a-zA-Z0-9]+', name.lower()))
|
|
|
|
# Bucket files by the set of words in their name, so titles that use the same
# words in a different order or with different separators land together.
word_groups = defaultdict(list)
for topic_file in files:
    word_set = get_words(topic_file)
    if not word_set:
        # No ASCII words at all; nothing to compare on.
        continue
    word_groups[word_set].append(topic_file)
|
|
|
|
# Write report
report_path = "/Volumes/Data/project/Antigravity/Wiki/scratch/duplicate_candidates.md"
os.makedirs(os.path.dirname(report_path), exist_ok=True)


def _write_group(out, label, filenames):
    """Write one bold bullet *label* followed by *filenames* as nested bullets."""
    out.write(f"- **{label}**\n")
    # Sorted so the listing does not depend on directory enumeration order.
    for filename in sorted(filenames):
        # Two-space indent: with a single space Markdown parses the line as a
        # sibling of the label bullet instead of a child of it.
        out.write(f"  - {filename}\n")
    out.write("\n")


with open(report_path, "w", encoding="utf-8") as out:
    out.write("# Duplicate Candidates Report\n\n")

    # Section 1: files whose names collapse to the same alphanumeric core.
    out.write("## Exact Alphanumeric Matches\n")
    for core, flist in sorted(groups.items()):
        if len(flist) > 1:
            _write_group(out, core, flist)

    # Section 2: files whose names consist of the same set of words.
    out.write("## Exact Word Set Matches\n")
    # Largest groups first. Ties are broken by the sorted word list so the
    # order is reproducible from run to run.
    ranked = sorted(
        word_groups.items(),
        key=lambda item: (-len(item[1]), sorted(item[0])),
    )
    for words, flist in ranked:
        if len(flist) < 2:
            continue
        # A group whose files all share one alphanumeric core was already
        # reported in the first section; only list genuinely new matches.
        cores = {extract_core_name(f) for f in flist}
        if len(cores) > 1:
            # `words` is a frozenset of strings: its iteration order changes
            # between interpreter runs, so sort before building the label.
            _write_group(out, ", ".join(sorted(words)), flist)

print(f"Report generated at {report_path}")
|