# 2nd/scratch/categorize.py (127 lines, 4.9 KiB, Python)

import os
import re
import shutil
from collections import defaultdict
# Root folder that holds the flat collection of topic notes to organize.
topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"

# Names of the Markdown notes sitting directly in the root folder. Entries
# that are not regular files (e.g. a directory whose name ends in ".md")
# are left out.
files = []
for entry in os.listdir(topics_dir):
    if not entry.endswith(".md"):
        continue
    if os.path.isfile(os.path.join(topics_dir, entry)):
        files.append(entry)
# Category name -> matching rules, checked in insertion order (first hit wins).
# "Project_Logs" holds regular expressions applied to the lower-cased filename;
# every other category holds lower-case keywords; "Other" is the fallback.
categories = {
    "Project_Logs": [r'^2024-', r'^2025-', r'^2026-'],
    "AI_and_ML": ['ai', 'machine-learning', 'deep-learning', 'llm', 'gpt', 'dall-e', 'midjourney', 'generative', 'reinforcement', 'neural', 'diffusion', 'nlp'],
    "Architecture": ['architecture', 'ddd', 'pattern', 'solid', 'microservices', 'c4', 'system-design', 'infrastructure', 'clean-architecture', 'domain-driven', 'cqrs', 'mvc'],
    "Frontend": ['frontend', 'react', 'css', 'dom', 'webgl', 'threejs', 'browser', 'javascript', 'typescript', 'ui-components', 'rendering'],
    "Backend": ['backend', 'database', 'sql', 'api', 'graphql', 'rest', 'nodejs', 'server', 'redis', 'cache'],
    "DevOps_and_Security": ['devops', 'ci-cd', 'security', 'sast', 'dast', 'git', 'docker', 'kubernetes', 'cloud', 'testing', 'deployment'],
    "Game_Design": ['game', 'game-design', 'combat', 'monetization', 'level-design', 'player', 'boss', 'skybound', 'mechanics', 'gacha'],
    "Design_and_UX": ['ux', 'ui', 'design-system', 'accessibility', 'figma', 'typography', 'user-experience'],
    "Business_and_Management": ['management', 'agile', 'scrum', 'business', 'economics', 'consulting', 'kpi', 'strategy', 'monopoly'],
    "Computer_Science_and_Theory": ['algorithm', 'data-structure', 'theory', 'math', 'physics', 'computational', 'graph', 'tree', 'complexity'],
    "Other": []
}

# Category name -> list of (filename, content) pairs.
file_categories = defaultdict(list)

# Tags that carry no information about the subject of a note.
_GENERIC_TAGS = frozenset(['auto-wikified', 'technical-documentation', 'auto-consolidated'])

# Front matter: a leading "---" line, the YAML body, and a closing "---" line.
# LF and CRLF line endings are both accepted, and the closing fence may be the
# last thing in the file.
_FRONT_MATTER_RE = re.compile(
    r'\A---[ \t]*\r?\n(.*?\r?\n)?---[ \t]*(?:\r?\n|\Z)', re.DOTALL
)


def _has_keyword(keyword, text):
    """Return True if *keyword* occurs in lower-cased *text* as a whole token.

    A plain substring test lets short keywords such as 'ai', 'ui' or 'git'
    fire inside unrelated words ("maintain", "building", "digital"). The
    keyword must therefore not touch an ASCII letter or digit on either side;
    an optional plural suffix ("apis", "bosses") is tolerated. Only ASCII
    characters count as word characters, so a keyword directly followed by
    non-ASCII text still matches.
    """
    pattern = r'(?<![a-z0-9])' + re.escape(keyword) + r'(?:e?s)?(?![a-z0-9])'
    return re.search(pattern, text) is not None


def _extract_tags(content):
    """Return the lower-cased, non-generic tags from an inline ``tags: [...]``
    entry in the front matter of *content*, or an empty list if there is none.
    """
    front = _FRONT_MATTER_RE.match(content)
    if not front:
        return []
    # group(1) is None when the front matter is empty ("---" directly
    # followed by "---").
    tags_match = re.search(r'tags:\s*\[(.*?)\]', front.group(1) or '')
    if not tags_match:
        return []
    cleaned = (t.strip().strip('"\'').lower() for t in tags_match.group(1).split(','))
    return [t for t in cleaned if t and t not in _GENERIC_TAGS]


def determine_category(filename, content):
    """Pick the category a note belongs to.

    Rules are tried in this order; the first match wins:

    1. The lower-cased filename matches one of the "Project_Logs" patterns.
    2. A front-matter tag contains a category keyword as a whole token.
    3. The filename or the first 1000 characters of the content contain a
       category keyword as a whole token.

    Within steps 2 and 3 categories are tried in the order they appear in
    ``categories``.

    Args:
        filename: Name of the note file (used for matching only).
        content: Full text of the note.

    Returns:
        A key of ``categories``; "Other" when nothing matches.
    """
    filename_lower = filename.lower()

    # 1. Project logs are recognised by filename alone.
    if any(re.search(p, filename_lower) for p in categories["Project_Logs"]):
        return "Project_Logs"

    # Categories matched by keyword, in priority order.
    topical = [
        (cat, keywords)
        for cat, keywords in categories.items()
        if cat not in ("Project_Logs", "Other")
    ]

    # 2. Explicit tags are the strongest signal, so they are checked first.
    tags = _extract_tags(content)
    for cat, keywords in topical:
        if any(_has_keyword(k, tag) for tag in tags for k in keywords):
            return cat

    # 3. Fall back to the filename plus a short prefix of the content.
    text_to_search = filename_lower + " " + content[:1000].lower()
    for cat, keywords in topical:
        if any(_has_keyword(k, text_to_search) for k in keywords):
            return cat

    return "Other"
# Read each note once and file it, together with its text, under the category
# chosen by determine_category().
for note_name in files:
    with open(os.path.join(topics_dir, note_name), encoding='utf-8') as handle:
        note_text = handle.read()
    chosen = determine_category(note_name, note_text)
    file_categories[chosen].append((note_name, note_text))
# For every non-empty category: create its folder, move the notes into it and
# write an index note ("<category>.md") that links to each moved note.
for cat, flist in file_categories.items():
    if not flist:
        continue

    cat_dir = os.path.join(topics_dir, cat)
    # exist_ok replaces the separate exists() check and keeps re-runs quiet.
    os.makedirs(cat_dir, exist_ok=True)

    index_filename = f"{cat}.md"
    index_links = []
    for filename, content in flist:
        src = os.path.join(topics_dir, filename)
        dst = os.path.join(cat_dir, filename)

        # Never destroy data: a note named like the index would be moved into
        # place and then overwritten when the index is written below, and a
        # move onto an existing file may silently replace it. The name check
        # ignores case so the guard also holds if the file system treats
        # names case-insensitively. Such notes are left where they are.
        if filename.lower() == index_filename.lower() or os.path.exists(dst):
            print(f"Skipped '{filename}': moving it would overwrite '{dst}'.")
            continue

        shutil.move(src, dst)

        # Use the first level-1 heading as the title. [ \t]+ keeps the match
        # on the heading line, and strip() drops a trailing "\r" or spaces.
        # Fall back to the filename without ".md" when no title is found.
        title_match = re.search(r'^#[ \t]+(.*)', content, re.MULTILINE)
        title = title_match.group(1).strip() if title_match else ""
        if not title:
            title = filename[:-3]
        index_links.append(f"- [[{filename[:-3]}]] : {title}")

    if not index_links:
        # Nothing was moved, so do not write an index with an empty list.
        continue

    # Category summary note, e.g. Architecture/Architecture.md. The Korean
    # body text reads: "This document lists all knowledge documents that
    # belong to the `<cat>` category." / "Document list".
    index_links.sort()
    index_content = f"""---
category: Unified
tags: [category-index, {cat.lower()}]
title: {cat.replace('_', ' ')} Directory
last_updated: 2026-05-02
---
# {cat.replace('_', ' ')} Directory
이 문서는 `{cat}` 카테고리에 속한 모든 지식 문서들의 목록을 제공합니다.
## 📄 문서 목록
""" + "\n".join(index_links)
    with open(os.path.join(cat_dir, index_filename), 'w', encoding='utf-8') as index_file:
        index_file.write(index_content)
    print(f"Created category '{cat}' with {len(index_links)} files.")

print("Categorization complete.")