"""Delete duplicate Markdown files that share a basename under ``root_dir``.

Files are grouped by basename only; their contents are never compared.  Within
each group the copy with the highest "richness" score (70% byte size, 30% line
count) is kept and every other readable copy is removed from disk.
"""
import os
from collections import defaultdict

root_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"


def _collect_markdown_files(top):
    """Return a mapping of ``.md`` basename -> list of full paths under *top*."""
    file_map = defaultdict(list)
    for root, _dirs, files in os.walk(top):
        for file in files:
            if file.endswith(".md"):
                file_map[file].append(os.path.join(root, file))
    return file_map


def _describe(path):
    """Return the size, line count and heuristic score for *path*.

    Raises:
        OSError: if the file cannot be stat'ed or opened.
    """
    size = os.path.getsize(path)
    # Undecodable bytes are ignored so any file can be line-counted; the file
    # is streamed rather than loaded whole just to count its lines.
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = sum(1 for _ in f)
    return {
        'path': path,
        'size': size,
        'lines': lines,
        'score': size * 0.7 + lines * 0.3,  # Heuristic score
    }


def _rank(paths):
    """Return info dicts for the readable *paths*, best candidate first.

    Unreadable paths are reported and left out, so they are neither kept as
    the winner nor deleted.
    """
    ranked = []
    for p in paths:
        try:
            ranked.append(_describe(p))
        except OSError as e:
            print(f"Error reading {p}: {e}")
    # Highest score wins.  Ties go to the shallower path, then to the
    # lexicographically smaller one, so the outcome does not depend on the
    # order in which os.walk happened to list the directories.
    ranked.sort(
        key=lambda info: (-info['score'], info['path'].count(os.sep), info['path'])
    )
    return ranked


def remove_duplicates(top, dry_run=False):
    """Delete all but the best-scoring copy of each duplicated ``.md`` basename.

    Args:
        top: Directory tree to scan.
        dry_run: When true, report what would be deleted without removing
            anything.

    Returns:
        A ``(duplicate_groups, deleted_files)`` tuple.  Nothing is counted as
        deleted during a dry run.
    """
    duplicates_found = 0
    deleted_count = 0

    for filename, paths in _collect_markdown_files(top).items():
        if len(paths) < 2:
            continue
        duplicates_found += 1

        ranked = _rank(paths)
        print(f"\nDuplicate: {filename}")
        if not ranked:
            # Every copy failed to read: there is no winner to keep, and
            # deleting blindly would be unsafe.
            print(" SKIP: no readable copies")
            continue

        winner, losers = ranked[0], ranked[1:]
        print(f" KEEP: {winner['path']} (Size: {winner['size']}, Lines: {winner['lines']})")

        for loser in losers:
            if dry_run:
                print(f" WOULD DELETE: {loser['path']} (Size: {loser['size']}, Lines: {loser['lines']})")
                continue
            try:
                os.remove(loser['path'])
            except OSError as e:
                print(f" FAILED to delete {loser['path']}: {e}")
            else:
                print(f" DELETE: {loser['path']} (Size: {loser['size']}, Lines: {loser['lines']})")
                deleted_count += 1

    return duplicates_found, deleted_count


def main():
    """Deduplicate ``root_dir`` and print the summary totals."""
    duplicates_found, deleted_count = remove_duplicates(root_dir)
    print(f"\nTotal duplicate groups: {duplicates_found}")
    print(f"Total files deleted: {deleted_count}")


# Guarded so that importing this module never deletes files.
if __name__ == "__main__":
    main()