[P-Reinforce] Global knowledge consolidation, massive deduplication (5,249 files), and high-density wikification (45 nodes)
This commit is contained in:
@@ -0,0 +1,57 @@
|
||||
import os
|
||||
from collections import defaultdict
|
||||
|
||||
root_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"
|
||||
file_map = defaultdict(list)
|
||||
|
||||
# Walk and map basenames to full paths
|
||||
for root, dirs, files in os.walk(root_dir):
|
||||
for file in files:
|
||||
if file.endswith(".md"):
|
||||
path = os.path.join(root, file)
|
||||
file_map[file].append(path)
|
||||
|
||||
duplicates_found = 0
|
||||
deleted_count = 0
|
||||
|
||||
for filename, paths in file_map.items():
|
||||
if len(paths) > 1:
|
||||
duplicates_found += 1
|
||||
# Sort paths by size descending
|
||||
# We also consider the path depth (shorter paths usually preferred if sizes are close)
|
||||
# But primarily size for "richness"
|
||||
paths_with_info = []
|
||||
for p in paths:
|
||||
try:
|
||||
size = os.path.getsize(p)
|
||||
# Count lines as well for better richness metric
|
||||
with open(p, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
lines = len(f.readlines())
|
||||
paths_with_info.append({
|
||||
'path': p,
|
||||
'size': size,
|
||||
'lines': lines,
|
||||
'score': size * 0.7 + lines * 0.3 # Heuristic score
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"Error reading {p}: {e}")
|
||||
|
||||
# Sort by score descending
|
||||
paths_with_info.sort(key=lambda x: x['score'], reverse=True)
|
||||
|
||||
winner = paths_with_info[0]
|
||||
losers = paths_with_info[1:]
|
||||
|
||||
print(f"\nDuplicate: {filename}")
|
||||
print(f" KEEP: {winner['path']} (Size: {winner['size']}, Lines: {winner['lines']})")
|
||||
|
||||
for loser in losers:
|
||||
try:
|
||||
os.remove(loser['path'])
|
||||
print(f" DELETE: {loser['path']} (Size: {loser['size']}, Lines: {loser['lines']})")
|
||||
deleted_count += 1
|
||||
except Exception as e:
|
||||
print(f" FAILED to delete {loser['path']}: {e}")
|
||||
|
||||
print(f"\nTotal duplicate groups: {duplicates_found}")
|
||||
print(f"Total files deleted: {deleted_count}")
|
||||
Reference in New Issue
Block a user