[P-Reinforce] Global knowledge consolidation, massive deduplication (5,249 files), and high-density wikification (45 nodes)

2026-05-05 15:28:22 +09:00
parent a7d1e60ccf
commit dd01e01bea
3430 changed files with 42739 additions and 52263 deletions
@@ -0,0 +1,37 @@
+"""Delete Markdown pages that still contain unfinished-generation placeholders.
+
+Walks ``root_dir`` recursively, reads every ``.md`` file as UTF-8 and removes
+the ones whose text matches any entry of ``patterns``. Deletion is permanent,
+so make sure the tree is backed up or under version control before running.
+"""
+import os
+import re
+
+root_dir = "/Volumes/Data/project/Antigravity/Wiki"
+# Placeholder sentences; a page containing any of them is deleted.
+patterns = [
+    re.compile(r"지식 요약 정보 추출 중"),
+    re.compile(r"본문 구조화 작업 중"),
+]
+
+deleted_count = 0
+
+for root, dirs, files in os.walk(root_dir):
+    for file in files:
+        if not file.endswith(".md"):
+            continue
+        file_path = os.path.join(root, file)
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            # The handle is closed before deleting: removing a file that is
+            # still open fails on some platforms (e.g. Windows).
+            if any(p.search(content) for p in patterns):
+                os.remove(file_path)
+                print(f"Deleted: {file_path}")
+                deleted_count += 1
+        except (OSError, UnicodeDecodeError) as e:
+            # Unreadable or undeletable files are reported and skipped.
+            print(f"Error processing {file_path}: {e}")
+
+print(f"\nTotal deleted: {deleted_count}")
@@ -0,0 +1,73 @@
+"""Remove duplicate Markdown pages that share a file name under ``root_dir``.
+
+Files are grouped by base name only (contents are never compared). Within each
+group the copy with the highest "richness" score is kept and every other
+readable copy is deleted permanently.
+"""
+import os
+from collections import defaultdict
+
+root_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"
+file_map = defaultdict(list)
+
+# Walk and map basenames to full paths
+for root, dirs, files in os.walk(root_dir):
+    for file in files:
+        if file.endswith(".md"):
+            file_map[file].append(os.path.join(root, file))
+
+duplicates_found = 0
+deleted_count = 0
+
+for filename, paths in file_map.items():
+    if len(paths) < 2:
+        continue
+    duplicates_found += 1
+
+    # Collect size and line count for every copy that can be read.
+    paths_with_info = []
+    for p in paths:
+        try:
+            size = os.path.getsize(p)
+            # errors='ignore' keeps the count working on files that are not
+            # valid UTF-8.
+            with open(p, 'r', encoding='utf-8', errors='ignore') as f:
+                lines = sum(1 for _ in f)
+        except OSError as e:
+            print(f"Error reading {p}: {e}")
+            continue
+        paths_with_info.append({
+            'path': p,
+            'size': size,
+            'lines': lines,
+            'score': size * 0.7 + lines * 0.3,  # Heuristic score
+        })
+
+    # Without this guard a group whose copies were all unreadable would crash
+    # the script with an IndexError when picking the winner below.
+    if not paths_with_info:
+        print(f"\nDuplicate: {filename}")
+        print("  SKIPPED: no readable copy")
+        continue
+
+    # Highest score first; ties go to the shallower path, then to the
+    # lexicographically smaller path so the outcome is deterministic.
+    paths_with_info.sort(
+        key=lambda x: (-x['score'], x['path'].count(os.sep), x['path'])
+    )
+    winner, *losers = paths_with_info
+
+    print(f"\nDuplicate: {filename}")
+    print(f"  KEEP: {winner['path']} (Size: {winner['size']}, Lines: {winner['lines']})")
+
+    for loser in losers:
+        try:
+            os.remove(loser['path'])
+        except OSError as e:
+            print(f"  FAILED to delete {loser['path']}: {e}")
+        else:
+            print(f"  DELETE: {loser['path']} (Size: {loser['size']}, Lines: {loser['lines']})")
+            deleted_count += 1
+
+print(f"\nTotal duplicate groups: {duplicates_found}")
+print(f"Total files deleted: {deleted_count}")
@@ -1,30 +0,0 @@
-import os
-import re
-
-base_path = r'E:\Wiki\2nd\10_Wiki\Topics'
-placeholder_patterns = [
-    r'> 지식 요약 작업 중',
-    r'> 지식 요약 정보 추출 중',
-    r'본문 구조화 작업 중'
-]
-
-found_files = []
-
-for root, dirs, files in os.walk(base_path):
-    for f in files:
-        if f.endswith('.md'):
-            file_path = os.path.join(root, f)
-            if len(file_path) > 240: continue
-            
-            try:
-                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f_obj:
-                    content = f_obj.read()
-                    matches = [re.search(p, content) for p in placeholder_patterns]
-                    if any(matches):
-                        found_files.append(file_path)
-            except:
-                pass
-
-print(f"Found {len(found_files)} specific placeholder files.")
-for f in found_files:
-    print(f"{f}")
@@ -1,28 +0,0 @@
-import os
-
-base_path = r'E:\Wiki\2nd\10_Wiki\Topics'
-junk_phrases = [
-    "지식 요약 정보 추출 중...",
-    "본문 구조화 작업 중...",
-    "신규 문서로, 기존 정보와의 충돌 분석 예정."
-]
-
-found_files = []
-
-for root, dirs, files in os.walk(base_path):
-    for f in files:
-        if f.endswith('.md'):
-            file_path = os.path.join(root, f)
-            if len(file_path) > 240: continue
-            
-            try:
-                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f_obj:
-                    content = f_obj.read()
-                    if any(phrase in content for phrase in junk_phrases):
-                        found_files.append(file_path)
-            except:
-                pass
-
-print(f"Found {len(found_files)} in-progress placeholder files.")
-for f in found_files:
-    print(f"{f}")
@@ -0,0 +1,64 @@
+import os
+
+topics_dir = "/Volumes/Data/project/Antigravity/Wiki/10_Wiki/Topics"
+
+# Maps each output file name to the full Markdown text written to it.
+# (This scratch script holds only part of the synthesized text; the rest will be filled in later.)
+wikis = {
+    "AGI (Artificial General Intelligence).md": """---
+category: Unified
+tags: [auto-consolidated, technical-documentation]
+title: AGI (Artificial General Intelligence)
+last_updated: 2026-05-05
+---
+
+# AGI (Artificial General Intelligence)
+
+## 📌 Brief Summary
+범용 인공지능(AGI)은 인간이 수행할 수 있는 모든 지적 작업을 수행할 수 있는 인공지능을 의미하며, 인공지능 연구의 궁극적인 목표이다. 특정 분야에 국한되지 않고 새로운 환경에서 학습하고 문제를 해결하며, 상식을 바탕으로 추론하고 자율적으로 행동하는 능력을 포함한다.
+
+## 📖 Core Content
+* **뉴로-심볼릭 통합 (Neuro-Symbolic Integration):** 신경망의 학습 능력과 기호 논리의 추론 능력을 결합하여 AGI를 구현하려는 시도이다.
+* **자기 개선 및 지속적 학습:** 스스로 알고리즘을 최적화하고 새로운 지식을 지속적으로 갱신하는 능력이 필수적이다.
+* **설명 가능성 및 안전성:** 고도의 지능이 인류의 가치와 정렬(Alignment)되도록 보장하는 거버넌스 체계가 수반되어야 한다.
+
+## ⚖️ Trade-offs & Caveats
+* **지능 vs 통제:** 지능이 높아질수록 인간의 통제를 벗어날 위험(Alignment Problem)이 증가한다.
+* **연산 자원 및 효율성:** AGI 수준의 지능을 구현하기 위한 막대한 하드웨어 비용과 전력 소모가 환경적/경제적 제약으로 작용한다.
+
+## 🔗 Knowledge Connections
+* [[Neuro-Symbolic AI]]
+* [[LLM Alignment]]
+
+---
+*Last updated: 2026-05-05*""",
+    "Global Workspace Theory (GWT).md": """---
+category: Cognitive Modeling
+tags: [neuroscience, consciousness]
+title: Global Workspace Theory (GWT)
+last_updated: 2026-05-05
+---
+
+# Global Workspace Theory (GWT)
+
+## 📌 Brief Summary
+전역 작업 공간 이론(GWT)은 인간의 의식을 '극장의 무대'에 비유하여, 수많은 무의식적 프로세스들이 특정 정보를 전역적으로 공유할 때 의식이 발생한다고 설명하는 인지 아키텍처이다.
+
+## 📖 Core Content
+* **전역 방송 (Global Broadcasting):** 특정 정보가 전역 작업 공간에 진입하면, 뇌의 다른 다양한 모듈들이 해당 정보에 접근하여 병렬적으로 처리할 수 있게 된다.
+* **의식적 주의 (Conscious Attention):** 무의식적인 자극이라도 의식적인 주의가 선행되어야 장기적인 학습 및 암묵적 규칙 추론이 가능하다.
+* **GNW 모델:** 신경생리학적으로 전두엽과 두정엽의 장거리 뉴런들이 이 전역 공간을 형성한다는 이론으로 확장되었다.
+
+## ⚖️ Trade-offs & Caveats
+* **병목 현상:** 전역 공간은 한 번에 한정된 양의 정보만 처리할 수 있어, 복잡한 다중 작업 시 인지적 과부하가 발생한다.
+
+---
+*Last updated: 2026-05-05*""",
+    # ... more to be added in chunks or follow-up
+}
+
+for filename, body in wikis.items():  # one page file per dictionary entry
+    target = os.path.join(topics_dir, filename)
+    with open(target, mode='w', encoding='utf-8') as out:  # 'w' replaces any existing file
+        out.write(body)
+    print(f"Created: {target}")