Wikify: Auto-consolidate redundant/similar knowledge base files
This commit is contained in:
@@ -0,0 +1,194 @@
|
||||
import os
import re
from collections import defaultdict
from datetime import date
|
||||
|
||||
# Directory that holds the topic notes to be consolidated.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# Only Markdown notes take part in the grouping. os.listdir() returns entries
# in arbitrary order, so sort them to keep grouping and log output stable
# from one run to the next.
files = sorted(name for name in os.listdir(topics_dir) if name.endswith(".md"))

# Maps an alphanumeric core name to every filename that reduces to it.
groups = defaultdict(list)
|
||||
|
||||
def extract_core_name(filename):
    """Reduce a note filename to its lowercase ASCII-alphanumeric core.

    A trailing ``.md`` suffix is removed when present; names without that
    suffix are used whole instead of losing their last three characters.
    Every character outside ``[a-zA-Z0-9]`` (Hangul, spaces, punctuation) is
    then dropped and the remainder is lowercased.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        The core name, or an empty string when the name contains no ASCII
        letters or digits (for example a purely Korean title).
    """
    stem = filename[:-3] if filename.endswith(".md") else filename
    # A single substitution is enough: Hangul syllables are already outside
    # the allowed character class, so no separate pass is needed for them.
    return re.sub(r'[^a-zA-Z0-9]', '', stem).lower()
|
||||
|
||||
# Bucket every note under its alphanumeric core name. Names that reduce to
# an empty string are left out of the grouping altogether.
for f in files:
    core = extract_core_name(f)
    if not core:
        continue
    groups[core].append(f)
|
||||
|
||||
def get_words(filename):
    """Return the set of lowercase ASCII words that make up a note filename.

    A trailing ``.md`` suffix is removed when present; names without that
    suffix are used whole instead of losing their last three characters.
    Hangul syllables are deleted before tokenising, so ASCII fragments on
    either side of a Korean run are joined into one word. Words are maximal
    runs of ``[a-zA-Z0-9]``.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        A frozenset of words (hashable, order-insensitive); empty when the
        name contains no ASCII letters or digits.
    """
    stem = filename[:-3] if filename.endswith(".md") else filename
    stem = re.sub(r'[가-힣]', '', stem)
    return frozenset(re.findall(r'[a-zA-Z0-9]+', stem.lower()))
|
||||
|
||||
# Maps a word set to every filename built from exactly those words,
# regardless of word order or separators.
word_groups = defaultdict(list)
for f in files:
    words = get_words(f)
    if not words:
        continue
    word_groups[words].append(f)
|
||||
|
||||
def _merge_overlapping(candidate_groups):
    """Union candidate groups that share at least one file.

    Args:
        candidate_groups: Iterable of iterables of filenames.

    Returns:
        A list of pairwise-disjoint sets. A candidate that overlaps several
        already-collected groups fuses all of them into one set, which keeps
        the list position of the first group it touched.
    """
    merged_groups = []
    for candidate in candidate_groups:
        combined = set(candidate)
        kept = []
        slot = None
        for existing in merged_groups:
            if existing.isdisjoint(combined):
                kept.append(existing)
                continue
            # The collected groups are pairwise disjoint, so absorbing one
            # cannot create an overlap with a group already classified.
            combined |= existing
            if slot is None:
                slot = len(kept)
        if slot is None:
            kept.append(combined)
        else:
            kept.insert(slot, combined)
        merged_groups = kept
    return merged_groups


# Duplicate candidates: core-name matches first, then word-set matches.
# A word-set group may bridge two different core-name groups; all of them
# must end up in a single final group, otherwise the same file would be
# consolidated twice under different canonical names.
duplicate_candidates = [flist for flist in groups.values() if len(flist) > 1]
duplicate_candidates += [flist for flist in word_groups.values() if len(flist) > 1]

final_groups = _merge_overlapping(duplicate_candidates)

# Every file that belongs to some final group.
used_files = set().union(*final_groups)
|
||||
|
||||
def parse_markdown(content):
    """Split a Markdown note into front matter, title and classified sections.

    The document is cut at every ``## `` heading. Each section with a
    non-empty body is filed by keywords found in its lowercased heading:

    * ``summary``      - "summary", "통찰"
    * ``core``         - "core", "구조화된", "content"
    * ``tradeoffs``    - "trade-off", "tradeoff", "모순", "caveat"
    * ``connections``  - "connection", "연결"
    * ``other``        - anything else, stored with its ``## `` heading

    The first matching category in the order above wins. Sections whose body
    is empty are skipped.

    Text that precedes the first ``## `` heading (apart from the title line)
    is preserved in ``other`` under an ``## Introduction`` heading instead of
    being discarded, because callers may delete the source file after
    merging.

    Args:
        content: Full text of the note.

    Returns:
        dict with keys ``yaml`` (the front-matter block including its
        ``---`` delimiters, or ``''``), ``title`` (text of the first
        ``# `` heading, or ``''``) and the five list-valued categories
        described above.
    """
    sections = {
        'yaml': '',
        'title': '',
        'summary': [],
        'core': [],
        'tradeoffs': [],
        'connections': [],
        'other': [],
    }

    # Without re.MULTILINE, '^' anchors at the start of the string only, so
    # front matter is recognised only at the very top of the document.
    yaml_match = re.search(r'^---\n(.*?)\n---\n', content, re.DOTALL)
    if yaml_match:
        sections['yaml'] = yaml_match.group(0)
        content = content[yaml_match.end():]

    # Use [ \t]+ rather than \s+ so an empty "#" line cannot swallow the
    # newline and report the following line as the title.
    title_match = re.search(r'^#[ \t]+(.*)', content, re.MULTILINE)
    if title_match:
        sections['title'] = title_match.group(1).strip()

    # The leading '\n' sentinel lets a '## ' heading on the first line match.
    parts = re.split(r'\n##\s+', '\n' + content)

    # parts[0] is everything before the first '## ' heading; keep it (minus
    # the sentinel and the title line) so introductory text is not lost.
    preamble = parts[0][1:]
    if title_match and title_match.end() <= len(preamble):
        preamble = preamble[:title_match.start()] + preamble[title_match.end():]
    preamble = preamble.strip()
    if preamble:
        sections['other'].append(f"## Introduction\n\n{preamble}")

    keyword_table = (
        ('summary', ('summary', '통찰')),
        ('core', ('core', '구조화된', 'content')),
        ('tradeoffs', ('trade-off', 'tradeoff', '모순', 'caveat')),
        ('connections', ('connection', '연결')),
    )

    for part in parts[1:]:
        header_line, _, rest = part.partition('\n')
        header = header_line.lower()
        body = rest.strip()

        if not body:
            continue

        for key, keywords in keyword_table:
            if any(keyword in header for keyword in keywords):
                sections[key].append(body)
                break
        else:
            # Unrecognised heading: keep the whole section, heading included.
            sections['other'].append(f"## {part.strip()}")

    return sections
|
||||
|
||||
def score_filename(f):
    """Rank a filename for canonical selection; the lowest score wins.

    Shorter names score lower, and any name containing a Hangul syllable
    gets a 100-point penalty.
    """
    score = len(f)
    if re.search(r'[가-힣]', f):
        score += 100
    return score


def _collect_sections(group):
    """Parse every file of a group and pool the parsed sections.

    Files that no longer exist are skipped. Any other read failure
    (OSError, UnicodeDecodeError) propagates to the caller.

    Returns:
        (merged, valid_group): the pooled sections, and the list of files
        that were actually read.
    """
    merged = {
        'title': '',
        'summary': [],
        'core': [],
        'tradeoffs': [],
        'connections': [],
        'other': [],
    }
    valid_group = []

    for f in group:
        try:
            with open(os.path.join(topics_dir, f), 'r', encoding='utf-8') as file:
                content = file.read()
        except FileNotFoundError:
            continue

        parsed = parse_markdown(content)

        # The first non-empty title (in sorted filename order) wins.
        if not merged['title'] and parsed['title']:
            merged['title'] = parsed['title']

        for key in ('summary', 'core', 'tradeoffs', 'connections', 'other'):
            merged[key].extend(parsed[key])

        valid_group.append(f)

    return merged, valid_group


def _render_document(merged):
    """Render pooled sections into the unified note template."""
    separator = "\n\n---\n\n"
    summary_text = separator.join(merged['summary']) if merged['summary'] else 'No summary available.'
    core_text = separator.join(merged['core']) if merged['core'] else 'No core content available.'
    tradeoffs_text = separator.join(merged['tradeoffs']) if merged['tradeoffs'] else 'No trade-offs available.'
    connections_text = separator.join(merged['connections']) if merged['connections'] else 'No connections available.'

    # Stamp the actual consolidation date rather than a fixed literal that
    # goes stale the next time the script is run.
    final_content = f"""---
category: Unified
tags: [auto-consolidated, technical-documentation]
title: {merged['title']}
last_updated: {date.today().isoformat()}
---

# {merged['title']}

## 📌 Brief Summary
{summary_text}

## 📖 Core Content
{core_text}

## ⚖️ Trade-offs & Caveats
{tradeoffs_text}

## 🔗 Knowledge Connections
{connections_text}
"""
    if merged['other']:
        final_content += "\n\n" + "\n\n".join(merged['other'])
    return final_content


# Consolidate each group of duplicate notes into one canonical file and
# delete the files that were merged into it.
count = 0
for group in final_groups:
    group = sorted(group)

    canonical = min(group, key=score_filename)
    safe_canonical = canonical.replace(' ', '_')

    try:
        merged, valid_group = _collect_sections(group)
    except (OSError, UnicodeDecodeError) as exc:
        # Leave the whole group untouched: writing the canonical file could
        # overwrite a member whose content was never read.
        print(f"Skipping group {group}: {exc}")
        continue

    if not valid_group:
        continue

    if not merged['title']:
        # Fall back to the file name; strip only the trailing extension.
        merged['title'] = os.path.splitext(safe_canonical)[0].replace('_', ' ')

    with open(os.path.join(topics_dir, safe_canonical), 'w', encoding='utf-8') as file:
        file.write(_render_document(merged))

    for f in valid_group:
        if f == safe_canonical:
            continue
        try:
            os.remove(os.path.join(topics_dir, f))
        except OSError as exc:
            # Best effort: report the leftover instead of hiding it.
            print(f"Warning: could not remove {f}: {exc}")

    count += 1
    print(f"Consolidated {len(valid_group)} files into {safe_canonical}")

print(f"Successfully consolidated {count} groups.")
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,68 @@
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
# Directory that holds the topic notes to be scanned for duplicates.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# Only Markdown notes are considered. os.listdir() returns entries in
# arbitrary order, so sort them to keep the generated report identical
# between runs over the same directory.
files = sorted(name for name in os.listdir(topics_dir) if name.endswith(".md"))

# Maps an alphanumeric core name to every filename that reduces to it.
groups = defaultdict(list)
|
||||
|
||||
def extract_core_name(filename):
    """Reduce a note filename to its lowercase ASCII-alphanumeric core.

    A trailing ``.md`` suffix is removed when present; names without that
    suffix are used whole instead of losing their last three characters.
    Every character outside ``[a-zA-Z0-9]`` (Hangul, spaces, punctuation) is
    then dropped and the remainder is lowercased.

    Acronyms are not expanded (e.g. "domaindrivendesign" is not matched to
    "ddd"); that would need a lookup table, so grouping is by exact
    alphanumeric match only.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        The core name, or an empty string when the name contains no ASCII
        letters or digits (for example a purely Korean title).
    """
    stem = filename[:-3] if filename.endswith(".md") else filename
    # A single substitution is enough: Hangul syllables are already outside
    # the allowed character class, so no separate pass is needed for them.
    return re.sub(r'[^a-zA-Z0-9]', '', stem).lower()
|
||||
|
||||
# Bucket every note under its alphanumeric core name. A purely Korean file
# name reduces to an empty string and is left out of the grouping.
for f in files:
    core = extract_core_name(f)
    if not core:
        continue
    groups[core].append(f)
|
||||
|
||||
# Second heuristic: group names that consist of exactly the same words.
def get_words(filename):
    """Return the set of lowercase ASCII words that make up a note filename.

    A trailing ``.md`` suffix is removed when present; names without that
    suffix are used whole instead of losing their last three characters.
    Hangul syllables are deleted before tokenising, so ASCII fragments on
    either side of a Korean run are joined into one word. Words are maximal
    runs of ``[a-zA-Z0-9]``.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        A frozenset of words (hashable, order-insensitive); empty when the
        name contains no ASCII letters or digits.
    """
    stem = filename[:-3] if filename.endswith(".md") else filename
    stem = re.sub(r'[가-힣]', '', stem)
    return frozenset(re.findall(r'[a-zA-Z0-9]+', stem.lower()))
|
||||
|
||||
# Maps a word set to every filename built from exactly those words,
# regardless of word order or separators.
word_groups = defaultdict(list)
for f in files:
    words = get_words(f)
    if not words:
        continue
    word_groups[words].append(f)
|
||||
|
||||
# Write the duplicate-candidate report as a Markdown file.
report_path = "/Volumes/Data/project/Antigravity/Wiki/scratch/duplicate_candidates.md"
os.makedirs(os.path.dirname(report_path), exist_ok=True)

with open(report_path, "w", encoding="utf-8") as out:
    out.write("# Duplicate Candidates Report\n\n")

    # Section 1: files whose names reduce to the same alphanumeric core.
    out.write("## Exact Alphanumeric Matches\n")
    for core, flist in sorted(groups.items()):
        if len(flist) <= 1:
            continue
        out.write(f"- **{core}**\n")
        for f in flist:
            # Two-space indent so the file name nests under its group bullet.
            out.write(f"  - {f}\n")
        out.write("\n")

    # Section 2: files built from the same words, largest groups first.
    # The sorted word list is a tie-breaker so equal-sized groups always
    # appear in the same order.
    out.write("## Exact Word Set Matches\n")
    ranked_word_groups = sorted(
        word_groups.items(),
        key=lambda item: (-len(item[1]), sorted(item[0])),
    )
    for words, flist in ranked_word_groups:
        if len(flist) <= 1:
            continue
        # Skip groups already reported above: when every file shares one
        # alphanumeric core, the first section has listed them.
        cores = {extract_core_name(f) for f in flist}
        if len(cores) <= 1:
            continue
        # A frozenset has no defined iteration order; sort the words so the
        # heading is reproducible.
        out.write(f"- **{', '.join(sorted(words))}**\n")
        for f in flist:
            out.write(f"  - {f}\n")
        out.write("\n")

print(f"Report generated at {report_path}")
|
||||
Reference in New Issue
Block a user