# 2nd/scratch/consolidate.py

import os
import re
from collections import defaultdict

# Directory containing the Markdown topic notes this script works on.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# Bare filenames (no directory part) of every entry whose name ends in ".md",
# in whatever order os.listdir reports them.
files = [
    entry
    for entry in os.listdir(topics_dir)
    if entry.endswith(".md")
]

# Maps a normalized core name to the list of filenames that share it.
groups = defaultdict(list)

def extract_core_name(filename):
    """Return the lowercase ASCII-alphanumeric core of a topic filename.

    The last three characters of *filename* are dropped (the ``.md`` suffix
    for the files this script lists), Hangul syllables are removed, every
    remaining character outside ``a-z`` / ``A-Z`` / ``0-9`` is removed, and
    the result is lowercased.  Separators and letter case therefore do not
    affect the result, which may be the empty string.
    """
    stem = filename[:-3]
    without_hangul = re.sub(r'[가-힣]', '', stem)
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', without_hangul)
    return alnum_only.lower()

# Bucket every topic file under its core name.  Files whose core name is
# empty (no ASCII letters or digits in the name) are left out of the buckets.
for f in files:
    core = extract_core_name(f)
    if not core:
        continue
    groups[core].append(f)

def get_words(filename):
    """Return the set of lowercase ASCII-alphanumeric words in a filename.

    The last three characters of *filename* are dropped (the ``.md`` suffix
    for the files this script lists) and Hangul syllables are removed before
    the name is lowercased and split into maximal runs of letters/digits.
    Because the result is a ``frozenset``, word order and repetition are
    ignored and the value can be used as a dictionary key.
    """
    stem = filename[:-3]
    ascii_part = re.sub(r'[가-힣]', '', stem).lower()
    return frozenset(re.findall(r'[a-zA-Z0-9]+', ascii_part))

# Second grouping key: the unordered set of words in the filename, so that
# names using the same words in a different order land in the same bucket.
word_groups = defaultdict(list)
for f in files:
    words = get_words(f)
    if words:
        word_groups[words].append(f)

# final_groups holds pairwise-disjoint sets of filenames to consolidate;
# used_files is the union of every file placed in some group.
final_groups = []
used_files = set()

# Only buckets with more than one file are duplicates worth consolidating.
# Core-name buckets come first so they keep their position in final_groups.
candidate_groups = [set(flist) for flist in groups.values() if len(flist) > 1]
candidate_groups.extend(
    set(flist) for flist in word_groups.values() if len(flist) > 1
)

for group in candidate_groups:
    overlapping = [
        existing_group
        for existing_group in final_groups
        if existing_group & group
    ]
    if overlapping:
        # A candidate may bridge several existing groups.  Fold all of them
        # into the first one; merging into just one would leave overlapping
        # groups behind, and the shared files would then be consolidated
        # twice (and their relatives split across two output files).
        target = overlapping[0]
        target.update(group)
        absorbed = overlapping[1:]
        for extra in absorbed:
            target.update(extra)
        if absorbed:
            # Drop the absorbed sets by identity, keeping the others in order.
            final_groups = [
                existing_group
                for existing_group in final_groups
                if not any(existing_group is extra for extra in absorbed)
            ]
    else:
        final_groups.append(group)
    used_files.update(group)

def parse_markdown(content):
    """Split a topic note into its front matter, title and ``##`` sections.

    Args:
        content: Full text of a Markdown note (``\\n`` line endings).

    Returns:
        A dict with these keys:

        * ``'yaml'`` - the leading ``---`` ... ``---`` front-matter block
          including both fences, or ``''`` when the text does not start
          with one.
        * ``'title'`` - text of the first ``# `` heading after the front
          matter, stripped of surrounding whitespace, or ``''``.
        * ``'summary'``, ``'core'``, ``'tradeoffs'``, ``'connections'`` -
          lists of section bodies whose ``##`` heading contains one of the
          English/Korean keywords tested below (case-insensitive; the first
          matching category wins).
        * ``'other'`` - every remaining non-empty section, kept verbatim
          with its ``## `` heading.

        Sections with an empty body are dropped.  Text located before the
        first ``##`` heading is not captured, apart from the title.
    """
    sections = {
        'yaml': '',
        'title': '',
        'summary': [],
        'core': [],
        'tradeoffs': [],
        'connections': [],
        'other': []
    }

    # No MULTILINE flag here, so '^' anchors at the very start of the text:
    # front matter is only recognized as the first thing in the file.
    yaml_match = re.search(r'^---\n(.*?)\n---\n', content, re.DOTALL)
    if yaml_match:
        sections['yaml'] = yaml_match.group(0)
        content = content[yaml_match.end():]

    # Use [ \t]+ rather than \s+ after the marker: \s also matches newlines,
    # which let a bare "#" line capture the following line (possibly a
    # "## ..." heading) as the document title.
    title_match = re.search(r'^#[ \t]+(.*)', content, re.MULTILINE)
    if title_match:
        sections['title'] = title_match.group(1).strip()

    # A newline is prepended so a "## " heading on the first line is split
    # like any other.  [ \t]+ keeps the heading text on the marker's own
    # line; with \s+ an empty "## " heading consumed the line break, its
    # first body line was taken as the heading, and the section was dropped.
    parts = re.split(r'\n##[ \t]+', '\n' + content)

    # parts[0] is whatever precedes the first "##" heading; skip it.
    for part in parts[1:]:
        header_end = part.find('\n')
        if header_end == -1:
            header_end = len(part)
        header = part[:header_end].lower()
        body = part[header_end:].strip()

        if not body:
            continue

        if 'summary' in header or '통찰' in header:
            sections['summary'].append(body)
        elif 'core' in header or '구조화된' in header or 'content' in header:
            sections['core'].append(body)
        elif 'trade-off' in header or 'tradeoff' in header or '모순' in header or 'caveat' in header:
            sections['tradeoffs'].append(body)
        elif 'connection' in header or '연결' in header:
            sections['connections'].append(body)
        else:
            # Unrecognized section: keep heading and body together.
            sections['other'].append(f"## {part.strip()}")

    return sections

def score_filename(f):
    """Rank a filename as a canonical-name candidate; lower scores win.

    The score is the filename length, plus a 100-point penalty when the name
    contains any Hangul syllable, so short names without Hangul are preferred.
    """
    score = 0
    if re.search(r'[가-힣]', f):
        score += 100
    score += len(f)
    return score


def _yaml_quote(value):
    """Return *value* as a double-quoted YAML scalar (escaping ``\\`` and ``"``)."""
    return '"' + value.replace('\\', '\\\\').replace('"', '\\"') + '"'


# Merge each group of duplicate topic files into one canonical file, then
# delete the files that were merged into it.
count = 0
for group in final_groups:
    # Sorted so both the canonical choice and the merge order are deterministic.
    group = sorted(group)

    canonical = min(group, key=score_filename)
    # The consolidated file is written under a space-free name.
    safe_canonical = canonical.replace(' ', '_')

    merged = {
        'title': '',
        'summary': [],
        'core': [],
        'tradeoffs': [],
        'connections': [],
        'other': []
    }

    # Files of this group that could actually be read.
    valid_group = []

    for f in group:
        try:
            with open(os.path.join(topics_dir, f), 'r', encoding='utf-8') as file:
                content = file.read()
        except FileNotFoundError:
            # The file disappeared since the directory was listed; skip it.
            continue

        parsed = parse_markdown(content)

        # The first non-empty title encountered (in sorted order) wins.
        if not merged['title'] and parsed['title']:
            merged['title'] = parsed['title']

        for key in ('summary', 'core', 'tradeoffs', 'connections', 'other'):
            merged[key].extend(parsed[key])

        valid_group.append(f)

    if not valid_group:
        continue

    if not merged['title']:
        # Derive a title from the filename.  splitext removes only the
        # trailing extension; str.replace('.md', '') would also delete any
        # ".md" occurring in the middle of the name.
        merged['title'] = os.path.splitext(safe_canonical)[0].replace('_', ' ')

    summary_text = "\n\n---\n\n".join(merged['summary']) if merged['summary'] else 'No summary available.'
    core_text = "\n\n---\n\n".join(merged['core']) if merged['core'] else 'No core content available.'
    tradeoffs_text = "\n\n---\n\n".join(merged['tradeoffs']) if merged['tradeoffs'] else 'No trade-offs available.'
    connections_text = "\n\n---\n\n".join(merged['connections']) if merged['connections'] else 'No connections available.'

    # The front-matter title is quoted so that titles containing ':' or '#'
    # still yield valid YAML.  NOTE: last_updated is a fixed literal, not the
    # date of the run.
    final_content = f"""---
category: Unified
tags: [auto-consolidated, technical-documentation]
title: {_yaml_quote(merged['title'])}
last_updated: 2026-05-02
---

# {merged['title']}

## 📌 Brief Summary
{summary_text}

## 📖 Core Content
{core_text}

## ⚖️ Trade-offs & Caveats
{tradeoffs_text}

## 🔗 Knowledge Connections
{connections_text}
"""
    if merged['other']:
        final_content += "\n\n" + "\n\n".join(merged['other'])

    # Write the merged file before deleting anything, so a failed write
    # cannot lose the source files.
    with open(os.path.join(topics_dir, safe_canonical), 'w', encoding='utf-8') as file:
        file.write(final_content)

    for f in valid_group:
        if f == safe_canonical:
            continue
        try:
            os.remove(os.path.join(topics_dir, f))
        except OSError as exc:
            # Best effort: keep going, but report the leftover file instead
            # of hiding the failure.
            print(f"Warning: could not remove {f}: {exc}")

    count += 1
    print(f"Consolidated {len(valid_group)} files into {safe_canonical}")

print(f"Successfully consolidated {count} groups.")