# Listing metadata (as extracted from the file viewer): 58 lines, 2.0 KiB, Python
import os
from collections import defaultdict

# Directory tree that is scanned for Markdown files sharing a basename.
root_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"


def collect_markdown_files(top):
    """Map every ``.md`` basename found under *top* to all paths carrying it.

    Args:
        top: Directory to walk recursively.

    Returns:
        A ``defaultdict(list)`` keyed by basename; each value lists the full
        paths in the order ``os.walk`` discovered them.
    """
    found = defaultdict(list)
    for root, _dirs, files in os.walk(top):
        for file in files:
            if file.endswith(".md"):
                found[file].append(os.path.join(root, file))
    return found


def score_file(path):
    """Measure one file and compute its "richness" score.

    Args:
        path: File to inspect.

    Returns:
        A dict with keys ``'path'``, ``'size'`` (bytes), ``'lines'`` (text
        lines, undecodable bytes ignored) and ``'score'``.

    Raises:
        OSError: If the file cannot be stat'ed or opened.
    """
    size = os.path.getsize(path)
    # Stream the file: only the line count is needed, not the content.
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = sum(1 for _ in f)
    return {
        'path': path,
        'size': size,
        'lines': lines,
        'score': size * 0.7 + lines * 0.3,  # Heuristic score
    }


def remove_duplicates(file_map):
    """Delete all but the highest-scoring copy of each duplicated basename.

    Copies are ranked by ``score_file``'s heuristic only. The sort is stable,
    so equally scored copies keep their discovery order and the first one
    found wins; path depth is not taken into account.

    Args:
        file_map: Mapping of basename to a list of full paths.

    Returns:
        ``(duplicate_groups, deleted_files)``: the number of basenames with
        more than one path, and the number of files actually removed.
    """
    groups = 0
    deleted = 0

    for filename, paths in file_map.items():
        if len(paths) < 2:
            continue
        groups += 1

        candidates = []
        for p in paths:
            try:
                candidates.append(score_file(p))
            except OSError as e:
                # An unreadable copy is neither kept as the winner nor deleted.
                print(f"Error reading {p}: {e}")

        print(f"\nDuplicate: {filename}")

        if not candidates:
            # Every copy failed to read: nothing can be ranked, so delete
            # nothing instead of crashing on an empty list.
            print(" SKIP: no readable copies")
            continue

        # Sort by score descending
        candidates.sort(key=lambda info: info['score'], reverse=True)
        winner, losers = candidates[0], candidates[1:]

        print(f" KEEP: {winner['path']} (Size: {winner['size']}, Lines: {winner['lines']})")

        for loser in losers:
            try:
                os.remove(loser['path'])
            except OSError as e:
                print(f" FAILED to delete {loser['path']}: {e}")
            else:
                print(f" DELETE: {loser['path']} (Size: {loser['size']}, Lines: {loser['lines']})")
                deleted += 1

    return groups, deleted


# Walk and map basenames to full paths
file_map = collect_markdown_files(root_dir)

duplicates_found, deleted_count = remove_duplicates(file_map)

print(f"\nTotal duplicate groups: {duplicates_found}")
print(f"Total files deleted: {deleted_count}")