Wikify: Auto-consolidate redundant/similar knowledge base files

This commit is contained in:
Antigravity Agent
2026-05-02 23:59:27 +09:00
parent 9981d83a4d
commit 303b01b228
1369 changed files with 33533 additions and 33429 deletions
+194
View File
@@ -0,0 +1,194 @@
import os
import re
from collections import defaultdict
# Directory that holds the wiki topic notes to scan.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# Names of the Markdown files directly inside topics_dir (non-recursive).
files = list(filter(lambda entry: entry.endswith(".md"), os.listdir(topics_dir)))

# Maps an alphanumeric core name to the filenames that share it.
groups = defaultdict(list)
def extract_core_name(filename):
    """Return the lowercase ASCII-alphanumeric core of a ``.md`` filename.

    The last three characters (the ``.md`` suffix) are dropped, Hangul
    syllables are removed, and every remaining character outside
    ``[a-zA-Z0-9]`` is discarded before lowercasing.  A name without any
    ASCII letters or digits yields an empty string.
    """
    stem = filename[:-3]
    for pattern in (r'[가-힣]', r'[^a-zA-Z0-9]'):
        stem = re.sub(pattern, '', stem)
    return stem.lower()
# Bucket every file under its alphanumeric core name.
for f in files:
    core = extract_core_name(f)
    if not core:
        # Nothing ASCII-alphanumeric left in the name, so no key to group on.
        continue
    groups[core].append(f)
def get_words(filename):
    """Return the set of lowercase ASCII words found in a ``.md`` filename.

    The ``.md`` suffix (last three characters) is dropped and Hangul
    syllables are removed first, so ASCII runs separated only by Hangul are
    fused into one word.  Word order and repetition are ignored because the
    result is a ``frozenset``.
    """
    stem = filename[:-3]
    ascii_part = re.sub(r'[가-힣]', '', stem).lower()
    return frozenset(
        match.group(0) for match in re.finditer(r'[a-zA-Z0-9]+', ascii_part)
    )
# Bucket files by the (order-insensitive) set of ASCII words in their name.
word_groups = defaultdict(list)
for f in files:
    words = get_words(f)
    if words:
        word_groups[words].append(f)

# final_groups: disjoint sets of filenames that should be consolidated.
# used_files: union of every filename already placed in some group.
final_groups = []
used_files = set()

# Seed with the exact-core matches; these are disjoint by construction
# because each file has exactly one core name.
for core, flist in groups.items():
    if len(flist) > 1:
        group = set(flist)
        final_groups.append(group)
        used_files.update(group)

# Fold in the word-set matches.  A word-set group may bridge SEVERAL existing
# groups (e.g. "A B.md"/"AB.md" and "B A.md"/"BA.md" share the words {a, b}),
# so every overlapping group is merged into one.  Merging into only the first
# overlap would leave a file in two groups and get it consolidated twice.
for words, flist in word_groups.items():
    if len(flist) < 2:
        continue
    group = set(flist)
    overlapping = [existing for existing in final_groups if existing & group]
    if overlapping:
        target = overlapping[0]
        absorbed = overlapping[1:]
        target.update(group)
        for extra in absorbed:
            target.update(extra)
        if absorbed:
            # Drop the absorbed sets by identity; the target keeps its slot.
            final_groups = [
                existing for existing in final_groups
                if not any(existing is extra for extra in absorbed)
            ]
    else:
        final_groups.append(group)
    used_files.update(group)
def parse_markdown(content):
    """Split a Markdown note into the buckets used for consolidation.

    Args:
        content: Full text of the note.  It may start with a YAML
            front-matter block delimited by ``---`` lines.

    Returns:
        A dict with these keys:
            'yaml': the raw front-matter block including both ``---``
                delimiters, or '' when absent.
            'title': text of the first ``# `` heading, or '' when absent.
            'summary', 'core', 'tradeoffs', 'connections': lists of section
                bodies whose ``## `` heading contains one of the matching
                keywords (checked in that order, case-insensitively).
            'other': remaining material -- any text found before the first
                ``## `` heading (minus the title line), followed by each
                unclassified section re-prefixed with ``## ``.

    Sections whose body is empty are skipped.
    """
    sections = {
        'yaml': '',
        'title': '',
        'summary': [],
        'core': [],
        'tradeoffs': [],
        'connections': [],
        'other': []
    }

    yaml_match = re.search(r'^---\n(.*?)\n---\n', content, re.DOTALL)
    if yaml_match:
        sections['yaml'] = yaml_match.group(0)
        content = content[yaml_match.end():]

    # Use [ \t]+ instead of \s+: \s would cross the newline after an empty
    # "# " heading and capture the following line as the title.
    title_match = re.search(r'^#[ \t]+(.*)', content, re.MULTILINE)
    if title_match:
        sections['title'] = title_match.group(1)

    # The leading '\n' lets a "## " heading on the very first line match.
    parts = re.split(r'\n##\s+', '\n' + content)

    # Keep whatever precedes the first "## " heading.  Dropping it would lose
    # that text for good once the source file is deleted by the caller.
    preamble = parts[0]
    if title_match and title_match.end() + 1 <= len(preamble):
        # Offsets are shifted by one because of the '\n' prepended above.
        preamble = (
            preamble[:title_match.start() + 1]
            + preamble[title_match.end() + 1:]
        )
    preamble = preamble.strip()
    if preamble:
        sections['other'].append(preamble)

    for part in parts[1:]:
        header_end = part.find('\n')
        if header_end == -1:
            header_end = len(part)
        header = part[:header_end].lower()
        body = part[header_end:].strip()
        if not body:
            continue
        if 'summary' in header or '통찰' in header:
            sections['summary'].append(body)
        elif 'core' in header or '구조화된' in header or 'content' in header:
            sections['core'].append(body)
        elif ('trade-off' in header or 'tradeoff' in header
              or '모순' in header or 'caveat' in header):
            sections['tradeoffs'].append(body)
        elif 'connection' in header or '연결' in header:
            sections['connections'].append(body)
        else:
            sections['other'].append(f"## {part.strip()}")
    return sections
def score_filename(f):
    """Rank a candidate filename; the lowest score becomes the canonical name.

    A name containing Hangul is penalised by 100 so that an ASCII-only name
    wins when one exists; among equals the shorter name is preferred.
    """
    score = len(f)
    if re.search(r'[가-힣]', f):
        score += 100
    return score


def _yaml_quote(text):
    """Return *text* as a double-quoted YAML scalar.

    Titles may contain ':' or '#', which would corrupt the front matter if
    written as a plain scalar.
    """
    return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'


def _join_unique(bodies, placeholder):
    """Join section bodies with horizontal rules, dropping exact duplicates.

    Redundant notes frequently carry identical sections; repeating them in
    the consolidated file adds nothing.  First-seen order is kept.
    Returns *placeholder* when there are no bodies.
    """
    unique = list(dict.fromkeys(bodies))
    return "\n\n---\n\n".join(unique) if unique else placeholder


# Number of groups that were actually written out.
count = 0
for group in final_groups:
    group = sorted(group)
    canonical = min(group, key=score_filename)
    # The consolidated note is written under a space-free variant of the name.
    safe_canonical = canonical.replace(' ', '_')

    merged = {
        'title': '',
        'summary': [],
        'core': [],
        'tradeoffs': [],
        'connections': [],
        'other': []
    }
    # Files that were read successfully; only these are eligible for removal.
    valid_group = []
    for f in group:
        try:
            with open(os.path.join(topics_dir, f), 'r', encoding='utf-8') as file:
                content = file.read()
        except (OSError, UnicodeDecodeError) as exc:
            # Missing or unreadable file: leave it untouched and move on
            # instead of aborting a half-finished run.
            print(f"Skipping {f}: {exc}")
            continue
        parsed = parse_markdown(content)
        if not merged['title'] and parsed['title']:
            merged['title'] = parsed['title']
        for key in ('summary', 'core', 'tradeoffs', 'connections', 'other'):
            merged[key].extend(parsed[key])
        valid_group.append(f)

    # With fewer than two readable files there is nothing to consolidate;
    # rewriting a lone survivor would only discard its original layout.
    if len(valid_group) < 2:
        continue

    if not merged['title']:
        # Every filename here ends in ".md"; strip only that suffix.
        merged['title'] = safe_canonical[:-3].replace('_', ' ')

    summary_text = _join_unique(merged['summary'], 'No summary available.')
    core_text = _join_unique(merged['core'], 'No core content available.')
    tradeoffs_text = _join_unique(merged['tradeoffs'], 'No trade-offs available.')
    connections_text = _join_unique(merged['connections'], 'No connections available.')

    final_content = f"""---
category: Unified
tags: [auto-consolidated, technical-documentation]
title: {_yaml_quote(merged['title'])}
last_updated: 2026-05-02
---
# {merged['title']}
## 📌 Brief Summary
{summary_text}
## 📖 Core Content
{core_text}
## ⚖️ Trade-offs & Caveats
{tradeoffs_text}
## 🔗 Knowledge Connections
{connections_text}
"""
    other_blocks = list(dict.fromkeys(merged['other']))
    if other_blocks:
        final_content += "\n\n" + "\n\n".join(other_blocks)

    # Write the consolidated note BEFORE deleting anything, so a failed write
    # leaves every source file in place.
    with open(os.path.join(topics_dir, safe_canonical), 'w', encoding='utf-8') as file:
        file.write(final_content)

    for f in valid_group:
        if f == safe_canonical:
            continue
        try:
            os.remove(os.path.join(topics_dir, f))
        except OSError as exc:
            # Best effort: keep going, but make the leftover visible.
            print(f"Could not remove {f}: {exc}")

    count += 1
    print(f"Consolidated {len(valid_group)} files into {safe_canonical}")

print(f"Successfully consolidated {count} groups.")
File diff suppressed because it is too large Load Diff
+68
View File
@@ -0,0 +1,68 @@
import os
import re
from collections import defaultdict
# Directory that holds the wiki topic notes to scan.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# Names of the Markdown files directly inside topics_dir (non-recursive).
files = list(filter(lambda entry: entry.endswith(".md"), os.listdir(topics_dir)))

# Maps an alphanumeric core name to the filenames that share it.
groups = defaultdict(list)
def extract_core_name(filename):
    """Return the lowercase ASCII-alphanumeric core of a ``.md`` filename.

    Steps: drop the ``.md`` suffix (last three characters), remove Hangul
    syllables, discard everything that is not an ASCII letter or digit, and
    lowercase the rest.

    Acronyms are not expanded (``ddd`` does not match
    ``domaindrivendesign``); that would need a lookup table, so grouping is
    by exact alphanumeric match only.
    """
    stem = filename[:-3]
    for pattern in (r'[가-힣]', r'[^a-zA-Z0-9]'):
        stem = re.sub(pattern, '', stem)
    return stem.lower()
# Bucket every file under its alphanumeric core name.
for f in files:
    core = extract_core_name(f)
    if not core:
        # A purely Korean filename leaves an empty core; skip it.
        continue
    groups[core].append(f)
# Second grouping key: files whose names contain exactly the same words.
def get_words(filename):
    """Return the set of lowercase ASCII words found in a ``.md`` filename.

    The ``.md`` suffix (last three characters) is dropped and Hangul
    syllables are removed first, so ASCII runs separated only by Hangul are
    fused into one word.  Word order and repetition are ignored because the
    result is a ``frozenset``.
    """
    stem = filename[:-3]
    ascii_part = re.sub(r'[가-힣]', '', stem).lower()
    return frozenset(
        match.group(0) for match in re.finditer(r'[a-zA-Z0-9]+', ascii_part)
    )
# Maps a frozenset of words to the filenames whose names contain exactly
# those words.
word_groups = defaultdict(list)
for f in files:
    words = get_words(f)
    if not words:
        # No ASCII words in the name, so there is no key to group on.
        continue
    word_groups[words].append(f)
# Write the duplicate-candidates report as Markdown.
report_path = "/Volumes/Data/project/Antigravity/Wiki/scratch/duplicate_candidates.md"
os.makedirs(os.path.dirname(report_path), exist_ok=True)

with open(report_path, "w", encoding="utf-8") as out:
    out.write("# Duplicate Candidates Report\n\n")

    # Section 1: files whose names reduce to the same alphanumeric core.
    out.write("## Exact Alphanumeric Matches\n")
    for core, flist in sorted(groups.items()):
        if len(flist) > 1:
            out.write(f"- **{core}**\n")
            for f in flist:
                out.write(f" - {f}\n")
            out.write("\n")

    # Section 2: files sharing the same word set, largest groups first.
    out.write("## Exact Word Set Matches\n")
    for words, flist in sorted(word_groups.items(), key=lambda x: len(x[1]), reverse=True):
        if len(flist) > 1:
            # Skip groups already reported above, i.e. those whose files all
            # share a single alphanumeric core.
            cores = {extract_core_name(f) for f in flist}
            if len(cores) > 1:
                # Sort the words: frozenset iteration order for strings
                # changes between runs, which would make the report unstable.
                out.write(f"- **{', '.join(sorted(words))}**\n")
                for f in flist:
                    out.write(f" - {f}\n")
                out.write("\n")

print(f"Report generated at {report_path}")