Files
2nd/scratch/deduplicate_topics.py
T

58 lines
2.0 KiB
Python

import os
from collections import defaultdict
# Root of the wiki topic tree that is scanned for duplicate Markdown files.
root_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"


def collect_markdown_files(top):
    """Group every ``.md`` file found under *top* by its basename.

    Args:
        top: Directory to walk recursively.

    Returns:
        A ``defaultdict(list)`` mapping each basename to the list of full
        paths carrying that name, in ``os.walk`` order.
    """
    found = defaultdict(list)
    for root, _dirs, files in os.walk(top):
        for file in files:
            if file.endswith(".md"):
                found[file].append(os.path.join(root, file))
    return found


def _describe_file(path):
    """Return the size, line count and richness score of *path*.

    Raises:
        OSError: If the file cannot be stat'ed or opened.
    """
    size = os.path.getsize(path)
    # Undecodable bytes are ignored: only the number of lines matters here.
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = sum(1 for _ in f)
    return {
        'path': path,
        'size': size,
        'lines': lines,
        # Heuristic "richness": byte size dominates, line count nudges it.
        # Path depth is NOT part of the score.
        'score': size * 0.7 + lines * 0.3,
    }


def remove_duplicates(file_map):
    """Keep the richest file of every duplicate group and delete the rest.

    Files are considered duplicates purely because they share a basename;
    their contents are never compared. Within a group the file with the
    highest score (see ``_describe_file``) is kept; on a tie the path that
    appears first in the group wins. Files that cannot be read are reported
    and left untouched.

    Args:
        file_map: Mapping of basename to a list of full paths.

    Returns:
        A ``(duplicate_groups, deleted_files)`` tuple of counts.
    """
    groups = 0
    removed = 0
    for filename, paths in file_map.items():
        if len(paths) <= 1:
            continue
        groups += 1

        candidates = []
        for p in paths:
            try:
                candidates.append(_describe_file(p))
            except OSError as e:
                print(f"Error reading {p}: {e}")

        print(f"\nDuplicate: {filename}")
        if not candidates:
            # Nothing could be read, so there is no winner to pick. Skip the
            # group instead of failing on an empty list and aborting the run.
            print(" SKIPPED: no candidate could be read")
            continue

        # Stable sort: equally scored files keep their discovery order.
        candidates.sort(key=lambda info: info['score'], reverse=True)
        winner, *losers = candidates
        print(f" KEEP: {winner['path']} (Size: {winner['size']}, Lines: {winner['lines']})")
        for loser in losers:
            try:
                os.remove(loser['path'])
            except OSError as e:
                print(f" FAILED to delete {loser['path']}: {e}")
            else:
                print(f" DELETE: {loser['path']} (Size: {loser['size']}, Lines: {loser['lines']})")
                removed += 1
    return groups, removed


# Walk the tree, then prune duplicates and report the totals.
file_map = collect_markdown_files(root_dir)
duplicates_found, deleted_count = remove_duplicates(file_map)
print(f"\nTotal duplicate groups: {duplicates_found}")
print(f"Total files deleted: {deleted_count}")