# 2nd/scratch/deduplicate_topics.py

import os
from collections import defaultdict

root_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# Weights of the "richness" heuristic: byte size dominates, line count refines.
SIZE_WEIGHT = 0.7
LINE_WEIGHT = 0.3


def collect_markdown_files(root):
    """Map every ``.md`` basename found under *root* to all paths bearing it.

    Args:
        root: Directory that is walked recursively with ``os.walk``.

    Returns:
        A ``defaultdict(list)`` keyed by file name; each value lists the full
        paths of the files with that name, in the order they were encountered.
    """
    file_map = defaultdict(list)
    for current, _dirs, files in os.walk(root):
        for name in files:
            if name.endswith(".md"):
                file_map[name].append(os.path.join(current, name))
    return file_map


def describe_file(path):
    """Return size, line count and heuristic score for the file at *path*.

    Args:
        path: Path of the file to measure.

    Returns:
        A dict with the keys ``path``, ``size`` (bytes), ``lines`` and
        ``score`` (``size * SIZE_WEIGHT + lines * LINE_WEIGHT``).

    Raises:
        OSError: If the file cannot be stat'ed or opened.
    """
    size = os.path.getsize(path)
    # Undecodable bytes are ignored so that odd encodings do not abort the
    # count; lines are streamed rather than loaded into a list.
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = sum(1 for _ in f)
    return {
        'path': path,
        'size': size,
        'lines': lines,
        'score': size * SIZE_WEIGHT + lines * LINE_WEIGHT,  # Heuristic score
    }


def rank_candidates(paths):
    """Describe every readable path and order the results best-first.

    Paths that cannot be read are reported on stdout and left out of the
    result, so the returned list may be shorter than *paths* or even empty.

    Ordering is by score (descending). Equal scores are broken by path depth
    (fewer separators first) and then by the path string, so the outcome does
    not depend on the order in which the directory walk produced the paths.

    Args:
        paths: Iterable of file paths that share a basename.

    Returns:
        A list of dicts as produced by ``describe_file``, best candidate first.
    """
    described = []
    for p in paths:
        try:
            described.append(describe_file(p))
        except OSError as e:
            print(f"Error reading {p}: {e}")

    described.sort(
        key=lambda info: (
            -info['score'],
            info['path'].count(os.sep),
            info['path'],
        )
    )
    return described


def deduplicate(root, *, dry_run=False):
    """Keep the richest copy of each duplicated ``.md`` basename under *root*.

    For every basename that occurs more than once, the best-ranked copy (see
    ``rank_candidates``) is kept and the remaining readable copies are
    deleted. Copies that could not be read are never deleted. A group in
    which no copy is readable is reported and skipped instead of raising.

    Args:
        root: Directory to scan recursively.
        dry_run: When true, report what would be deleted without removing
            anything.

    Returns:
        A ``(duplicates_found, deleted_count)`` tuple: the number of basenames
        with more than one path, and the number of files actually removed.
    """
    duplicates_found = 0
    deleted_count = 0

    for filename, paths in collect_markdown_files(root).items():
        if len(paths) <= 1:
            continue
        duplicates_found += 1

        ranked = rank_candidates(paths)
        print(f"\nDuplicate: {filename}")
        if not ranked:
            # Without a readable winner there is nothing safe to decide.
            print("  SKIPPED: no readable copy")
            continue

        winner, losers = ranked[0], ranked[1:]
        print(f"  KEEP: {winner['path']} (Size: {winner['size']}, Lines: {winner['lines']})")

        for loser in losers:
            if dry_run:
                print(f"  WOULD DELETE: {loser['path']} (Size: {loser['size']}, Lines: {loser['lines']})")
                continue
            try:
                os.remove(loser['path'])
            except OSError as e:
                print(f"  FAILED to delete {loser['path']}: {e}")
            else:
                print(f"  DELETE: {loser['path']} (Size: {loser['size']}, Lines: {loser['lines']})")
                deleted_count += 1

    return duplicates_found, deleted_count


def main():
    """Deduplicate ``root_dir`` and print the summary totals."""
    if not os.path.isdir(root_dir):
        # os.walk yields nothing for a missing directory; say so explicitly
        # instead of reporting zero duplicates without explanation.
        print(f"Warning: {root_dir} is not a directory; nothing to scan")

    duplicates_found, deleted_count = deduplicate(root_dir)

    print(f"\nTotal duplicate groups: {duplicates_found}")
    print(f"Total files deleted: {deleted_count}")


if __name__ == "__main__":
    main()