195 lines
5.6 KiB
Python
195 lines
5.6 KiB
Python
import os
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
# Directory containing the Markdown topic notes scanned for near-duplicate
# filenames.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# Candidate notes: regular files whose name ends in ".md".
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make grouping reproducible between runs.  Directories that merely end in
# ".md" are excluded because they cannot be opened as notes later on.
files = sorted(
    entry
    for entry in os.listdir(topics_dir)
    if entry.endswith(".md") and os.path.isfile(os.path.join(topics_dir, entry))
)

# Maps a normalised core name to every filename that reduces to it.
groups = defaultdict(list)
|
|
|
|
def extract_core_name(filename):
    """Reduce a note filename to a lowercase ASCII-alphanumeric grouping key.

    A trailing ``.md`` extension is dropped, every character other than
    ``a-z``, ``A-Z`` and ``0-9`` (Hangul, spaces, punctuation, ...) is
    removed, and the remainder is lower-cased.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        The normalised key, or an empty string when the name contains no
        ASCII letters or digits.
    """
    # Strip the extension only when it is really present; an unconditional
    # ``[:-3]`` slice would eat three characters of any other name.
    stem = filename[:-3] if filename.endswith('.md') else filename
    # A single pass is enough: Hangul syllables already fall outside
    # [a-zA-Z0-9], so no separate Hangul-removal step is needed.
    return re.sub(r'[^a-zA-Z0-9]', '', stem).lower()
|
|
|
|
# Bucket every note under its normalised core name.  Names that reduce to an
# empty key are left out of the index.
for md_name in files:
    core_key = extract_core_name(md_name)
    if not core_key:
        continue
    groups[core_key].append(md_name)
|
|
|
|
def get_words(filename):
    """Return the set of lowercase ASCII-alphanumeric words in a note filename.

    A trailing ``.md`` extension is dropped and Hangul syllables are removed
    before tokenising, so Latin fragments on either side of a Hangul run fuse
    into a single word.  Word order and repetition do not affect the result.

    Args:
        filename: File name, normally ending in ``.md``.

    Returns:
        A ``frozenset`` of words (usable as a dict key); empty when the name
        contains no ASCII letters or digits.
    """
    # Strip the extension only when it is really present; an unconditional
    # ``[:-3]`` slice would eat three characters of any other name.
    stem = filename[:-3] if filename.endswith('.md') else filename
    latin_only = re.sub(r'[가-힣]', '', stem)
    return frozenset(re.findall(r'[a-zA-Z0-9]+', latin_only.lower()))
|
|
|
|
# Second index: notes keyed by the set of words in their filename.  Names
# that yield an empty word set are not indexed.
word_groups = defaultdict(list)
for md_name in files:
    word_key = get_words(md_name)
    if not word_key:
        continue
    word_groups[word_key].append(md_name)
|
|
|
|
# Build the final duplicate groups.
# Invariant: the sets in final_groups are pairwise disjoint, so no file is
# ever assigned to more than one group.
final_groups = []
used_files = set()

# Pass 1: files whose names collapse to the same core string.  Every file has
# exactly one core name, so these groups cannot overlap each other.
for flist in groups.values():
    if len(flist) > 1:
        group = set(flist)
        final_groups.append(group)
        used_files.update(group)

# Pass 2: files whose names contain the same set of words.
for flist in word_groups.values():
    if len(flist) < 2:
        continue
    group = set(flist)

    # A word group may touch several existing groups at once.  Merging into
    # only the first one would leave the same file in two groups, so every
    # overlapping group is folded into the first.
    overlapping = [g for g in final_groups if not g.isdisjoint(group)]
    if overlapping:
        target = overlapping[0]
        target.update(group)
        absorbed = overlapping[1:]
        for extra in absorbed:
            target.update(extra)
        if absorbed:
            # Compare by identity: the absorbed sets are now subsets of
            # `target`, and only those exact objects must be dropped.
            final_groups = [
                g for g in final_groups
                if all(g is not gone for gone in absorbed)
            ]
    else:
        # Disjoint from everything collected so far: a brand-new group.
        final_groups.append(group)
    used_files.update(group)
|
|
|
|
def parse_markdown(content):
    """Split a Markdown note into front matter, title and categorised H2 sections.

    Args:
        content: Full text of the note.

    Returns:
        A dict with these keys:

        * ``'yaml'`` - the leading ``---`` ... ``---`` block including both
          delimiter lines, or ``''`` when the text does not start with one.
        * ``'title'`` - text of the first ``# `` heading found after the front
          matter, stripped of surrounding whitespace, or ``''``.
        * ``'summary'``, ``'core'``, ``'tradeoffs'``, ``'connections'`` - lists
          of section bodies whose ``## `` heading contains one of the keywords
          for that category (case-insensitive substring match; categories are
          tried in this order and the first match wins).
        * ``'other'`` - unrecognised sections, each re-prefixed with ``## ``.

        Sections whose body is empty are skipped, and any text located before
        the first ``## `` heading is not returned in any list.
    """
    sections = {
        'yaml': '',
        'title': '',
        'summary': [],
        'core': [],
        'tradeoffs': [],
        'connections': [],
        'other': [],
    }

    # Ordered (bucket, keywords) table; order matters because the first
    # category with a matching keyword claims the section.
    categories = (
        ('summary', ('summary', '통찰')),
        ('core', ('core', '구조화된', 'content')),
        ('tradeoffs', ('trade-off', 'tradeoff', '모순', 'caveat')),
        ('connections', ('connection', '연결')),
    )

    # Front matter is only recognised at the very start of the document.
    front_matter = re.match(r'---\n(.*?)\n---\n', content, re.DOTALL)
    if front_matter:
        sections['yaml'] = front_matter.group(0)
        content = content[front_matter.end():]

    # Only spaces/tabs may separate '#' from the title text.  Allowing any
    # whitespace would let the match run across a line break, so a bare '#'
    # line would adopt the following line as its title.
    title_match = re.search(r'^#[ \t]+(.*)', content, re.MULTILINE)
    if title_match:
        sections['title'] = title_match.group(1).strip()

    # Prepending '\n' lets a '## ' heading on the very first line match too.
    # Element 0 is whatever precedes the first H2 heading and is dropped.
    for part in re.split(r'\n##\s+', '\n' + content)[1:]:
        heading, _, rest = part.partition('\n')
        header = heading.lower()
        body = rest.strip()
        if not body:
            continue

        for bucket, keywords in categories:
            if any(keyword in header for keyword in keywords):
                sections[bucket].append(body)
                break
        else:
            sections['other'].append(f"## {part.strip()}")

    return sections
|
|
|
|
def score_filename(f):
    """Rank a filename for canonical selection; the lowest score wins.

    A name containing any Hangul syllable is penalised by 100, and every
    character of the name adds 1.
    """
    penalty = 100 if re.search(r'[가-힣]', f) else 0
    return penalty + len(f)


def _join_sections(chunks, placeholder):
    """Join section bodies with a horizontal rule, or return the placeholder when empty."""
    return "\n\n---\n\n".join(chunks) if chunks else placeholder


# Consolidate each duplicate group into one note, then delete the source
# files that were merged into it.
count = 0
for group in final_groups:
    members = sorted(group)

    # The output file is named after the best-scoring member, with spaces
    # replaced by underscores.
    canonical = min(members, key=score_filename)
    safe_canonical = canonical.replace(' ', '_')

    merged = {
        'title': '',
        'summary': [],
        'core': [],
        'tradeoffs': [],
        'connections': [],
        'other': [],
    }

    # Files that were actually read; only these are candidates for deletion.
    valid_group = []

    for f in members:
        try:
            with open(os.path.join(topics_dir, f), 'r', encoding='utf-8') as file:
                content = file.read()
        except FileNotFoundError:
            # The file is gone (e.g. already consolidated elsewhere); skip it.
            continue
        except (OSError, UnicodeDecodeError) as err:
            # An unreadable file is left on disk untouched instead of
            # aborting the run after earlier groups were already rewritten.
            print(f"Skipping unreadable file {f}: {err}")
            continue

        parsed = parse_markdown(content)

        # The first non-empty title, in sorted filename order, wins.
        if not merged['title'] and parsed['title']:
            merged['title'] = parsed['title']

        for key in ('summary', 'core', 'tradeoffs', 'connections', 'other'):
            merged[key].extend(parsed[key])

        valid_group.append(f)

    if not valid_group:
        continue

    if not merged['title']:
        # Fall back to the file stem.  splitext removes only the extension,
        # whereas replacing '.md' would also hit occurrences inside the name.
        merged['title'] = os.path.splitext(safe_canonical)[0].replace('_', ' ')

    summary_text = _join_sections(merged['summary'], 'No summary available.')
    core_text = _join_sections(merged['core'], 'No core content available.')
    tradeoffs_text = _join_sections(merged['tradeoffs'], 'No trade-offs available.')
    connections_text = _join_sections(merged['connections'], 'No connections available.')

    # NOTE: last_updated is a fixed literal, not the current date.
    final_content = f"""---
category: Unified
tags: [auto-consolidated, technical-documentation]
title: {merged['title']}
last_updated: 2026-05-02
---

# {merged['title']}

## 📌 Brief Summary
{summary_text}

## 📖 Core Content
{core_text}

## ⚖️ Trade-offs & Caveats
{tradeoffs_text}

## 🔗 Knowledge Connections
{connections_text}
"""
    if merged['other']:
        final_content += "\n\n" + "\n\n".join(merged['other'])

    # Write the merged note before deleting anything, so a failed write
    # raises while all source files are still on disk.
    with open(os.path.join(topics_dir, safe_canonical), 'w', encoding='utf-8') as file:
        file.write(final_content)

    for f in valid_group:
        if f == safe_canonical:
            continue
        try:
            os.remove(os.path.join(topics_dir, f))
        except OSError as err:
            # Deletion stays best-effort, but a leftover file is reported
            # rather than silently ignored.
            print(f"Could not remove {f}: {err}")

    count += 1
    print(f"Consolidated {len(valid_group)} files into {safe_canonical}")

print(f"Successfully consolidated {count} groups.")
|