chore: version up to 2.80.34 and package

2026-05-12 22:54:21 +09:00
parent 148bfb070b
commit 065e598cca
26 changed files with 2023 additions and 139 deletions
@@ -0,0 +1,220 @@
+/**
+ * ============================================================
+ * Brain Index — persistent, mtime-keyed tokenized cache of the Second Brain
+ *
+ * RAG 검색은 매 질의마다 브레인의 모든 .md 파일을 읽고 토크나이즈해서 TF-IDF 점수를
+ * 계산했습니다 — 파일 수가 많아지면 그게 병목입니다.
+ *
+ * 이 모듈은 `<brainPath>/.astra/brain-index.json` 에 파일별 토큰 배열을 (mtime+size 키로)
+ * 저장해 두고, 다음 질의에서는 *변경된 파일만* 다시 읽어 토크나이즈합니다. 나머지는 디스크/메모리
+ * 캐시에서 그대로 가져옵니다. 디스크 쓰기는 디바운스되고 실패해도 in-memory 로만 동작합니다.
+ * ============================================================
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import { tokenize, countConflictIndicators } from './scoring';
+import { logInfo } from '../utils';
+
+/** Schema version of the persisted index; a file on disk with a different version is ignored and rebuilt. */
+const INDEX_VERSION = 2;
+/** Directory, relative to the brain path, that holds the cache file. */
+const INDEX_DIR = '.astra';
+/** File name of the persisted index inside INDEX_DIR. */
+const INDEX_FILE = 'brain-index.json';
+/** Once the index holds more than this many entries, entries not seen in the current scan are pruned (keeps deleted files from piling up). */
+const MAX_INDEX_ENTRIES = 12000;
+/** Debounce delay, in milliseconds, before a dirty index is written to disk. */
+const WRITE_DEBOUNCE_MS = 1500;
+
+interface IndexEntry {
+    mtimeMs: number;        // file mtime at indexing time; compared with the current stat to detect changes
+    size: number;           // file size at indexing time; compared together with mtimeMs
+    title: string;          // basename without .md
+    relativePath: string;   // relative to brainPath
+    tokens: string[];       // tokenize(`${title} ${content}`)
+    titleTokens: string[];  // tokenize(title)
+    conflictCount: number;  // countConflictIndicators(`${title} ${content}`)
+}
+
+interface PersistedIndex {
+    version: number;                     // must equal INDEX_VERSION for a loaded file to be accepted
+    entries: Record<string, IndexEntry>; // keyed by absolute file path
+}
+
+export interface IndexedBrainDoc {
+    filePath: string;       // the path exactly as passed in by the caller
+    relativePath: string;   // path relative to brainPath
+    title: string;          // basename without .md
+    tokens: string[];       // tokens of `${title} ${content}`
+    titleTokens: string[];  // tokens of the title only
+    conflictCount: number;  // cached conflict-indicator count (0 when missing from the cache)
+    mtimeMs: number;        // mtime recorded when the file was indexed
+}
+
+interface BrainState {
+    index: PersistedIndex;     // in-memory copy of the index for one brain
+    dirty: boolean;            // true while the in-memory index has changes not yet written to disk
+    diskPath: string | null;   // null if we can't determine a writable path
+    writeTimer?: ReturnType<typeof setTimeout>; // handle of the pending debounced write, if one is scheduled
+}
+
+const _states = new Map<string, BrainState>(); // per-process index state, keyed by the brainPath string callers pass in
+
+/** Builds the location of the persisted index file for the given brain directory. */
+function indexFileFor(brainPath: string): string {
+    const segments = [brainPath, INDEX_DIR, INDEX_FILE];
+    return path.join(...segments);
+}
+
+/**
+ * Returns the index state for `brainPath`, loading it from disk on first use and
+ * memoizing it in `_states` afterwards.
+ *
+ * A persisted file is adopted only when its `version` equals INDEX_VERSION and its
+ * `entries` is a plain (non-array) object. Arrays are rejected on purpose: string-keyed
+ * properties added to an array are omitted by JSON.stringify, so an adopted `[]` would
+ * make every later write persist an empty index. Any read/parse error falls back to an
+ * empty in-memory index; this function does not throw for a bad cache file.
+ *
+ * @param brainPath Root directory of the brain; also the memoization key.
+ * @returns The (possibly freshly created) state for this brain.
+ */
+function loadState(brainPath: string): BrainState {
+    const existing = _states.get(brainPath);
+    if (existing) return existing;
+
+    let index: PersistedIndex = { version: INDEX_VERSION, entries: {} };
+    let diskPath: string | null = null;
+    try {
+        diskPath = indexFileFor(brainPath);
+        if (fs.existsSync(diskPath)) {
+            const raw = JSON.parse(fs.readFileSync(diskPath, 'utf8'));
+            const entriesOk = !!raw
+                && !!raw.entries
+                && typeof raw.entries === 'object'
+                && !Array.isArray(raw.entries);
+            if (entriesOk && raw.version === INDEX_VERSION) {
+                index = raw as PersistedIndex;
+            } else {
+                logInfo('Brain index is stale/unrecognized — rebuilding.', { brainPath });
+            }
+        }
+    } catch (e: unknown) {
+        const message = (e instanceof Error && e.message) ? e.message : String(e);
+        logInfo('Brain index load failed — starting fresh.', { brainPath, error: message });
+        index = { version: INDEX_VERSION, entries: {} };
+    }
+    const st: BrainState = { index, dirty: false, diskPath };
+    _states.set(brainPath, st);
+    return st;
+}
+
+/**
+ * Schedules a debounced write of the in-memory index to disk.
+ *
+ * Does nothing when the state is clean, has no disk path, or already has a write pending.
+ * The write goes to `<diskPath>.tmp` and is then renamed over the real file. On any failure
+ * the temp file is removed (best effort), the state stays dirty so a later call can retry,
+ * and the error is only logged.
+ *
+ * @param st State whose index should be persisted.
+ * @param brainPath Brain root, used only for log context.
+ */
+function scheduleWrite(st: BrainState, brainPath: string): void {
+    if (!st.dirty || !st.diskPath || st.writeTimer) return;
+    const timer = setTimeout(() => {
+        st.writeTimer = undefined;
+        const target = st.diskPath;
+        if (!st.dirty || !target) return;
+        const tmp = `${target}.tmp`;
+        try {
+            const dir = path.dirname(target);
+            fs.mkdirSync(dir, { recursive: true });
+            // One-time .gitignore so the cache dir never gets committed into a Second Brain git repo.
+            const gi = path.join(dir, '.gitignore');
+            if (!fs.existsSync(gi)) {
+                try { fs.writeFileSync(gi, '*\n', 'utf8'); } catch { /* non-fatal */ }
+            }
+            fs.writeFileSync(tmp, JSON.stringify(st.index), 'utf8');
+            fs.renameSync(tmp, target);
+            st.dirty = false;
+        } catch (e: unknown) {
+            // Do not leave a partial/orphaned temp file next to the index.
+            try { fs.unlinkSync(tmp); } catch { /* nothing to clean up, or not removable — non-fatal */ }
+            const message = (e instanceof Error && e.message) ? e.message : String(e);
+            logInfo('Brain index write failed (continuing in-memory only).', { brainPath, error: message });
+        }
+    }, WRITE_DEBOUNCE_MS);
+    // Where the timer handle offers unref(), call it so a pending write does not keep the process alive.
+    const handle = timer as unknown as { unref?: () => void };
+    if (typeof handle.unref === 'function') handle.unref();
+    st.writeTimer = timer;
+}
+
+/**
+ * Returns tokenized representations for `files` (absolute brain-file paths, already
+ * scoped/filtered by the caller). Files whose mtime and size match the cached entry are
+ * served from the index; changed/new files are read & tokenized and the index is updated
+ * (debounced disk write).
+ *
+ * Files that can no longer be stat'ed or read are left out of the result, and any cached
+ * entry for them is dropped so it cannot linger in the index or skew the pruning heuristic.
+ *
+ * Safe to call with an empty/invalid `brainPath` or empty list — returns [].
+ *
+ * @param brainPath Root directory of the brain; relative paths are computed against it.
+ * @param files Paths of the files to return docs for.
+ * @returns One doc per file that could be served or indexed, in input order.
+ */
+export function getBrainTokenIndex(brainPath: string, files: string[]): IndexedBrainDoc[] {
+    if (!brainPath || !Array.isArray(files) || files.length === 0) return [];
+    const st = loadState(brainPath);
+    const entries = st.index.entries;
+    const out: IndexedBrainDoc[] = [];
+    // Only files that actually produced a doc count as "seen" for pruning purposes.
+    const seen = new Set<string>();
+    let reindexed = 0;
+
+    // Drops the cached entry (if any) for a file that is gone or unreadable.
+    const forget = (file: string): void => {
+        if (entries[file] !== undefined) {
+            delete entries[file];
+            st.dirty = true;
+        }
+    };
+
+    for (const file of files) {
+        let stat: fs.Stats;
+        try {
+            stat = fs.statSync(file);
+        } catch {
+            forget(file); // listed but gone now — skip, and do not keep its stale entry
+            continue;
+        }
+        const cached = entries[file];
+        // Entries may come from disk, so check the shape of every field that is handed to callers.
+        if (cached
+            && cached.mtimeMs === stat.mtimeMs
+            && cached.size === stat.size
+            && typeof cached.title === 'string'
+            && typeof cached.relativePath === 'string'
+            && Array.isArray(cached.tokens)
+            && Array.isArray(cached.titleTokens)) {
+            seen.add(file);
+            out.push({
+                filePath: file,
+                relativePath: cached.relativePath,
+                title: cached.title,
+                tokens: cached.tokens,
+                titleTokens: cached.titleTokens,
+                conflictCount: cached.conflictCount || 0,
+                mtimeMs: cached.mtimeMs,
+            });
+            continue;
+        }
+        // (Re)index this file.
+        let content = '';
+        try {
+            content = fs.readFileSync(file, 'utf8');
+        } catch {
+            forget(file); // the cached entry no longer matches the file and cannot be refreshed
+            continue;
+        }
+        const relativePath = path.relative(brainPath, file);
+        const title = path.basename(file, '.md');
+        const combined = `${title} ${content}`;
+        const entry: IndexEntry = {
+            mtimeMs: stat.mtimeMs,
+            size: stat.size,
+            title,
+            relativePath,
+            tokens: tokenize(combined),
+            titleTokens: tokenize(title),
+            conflictCount: countConflictIndicators(combined),
+        };
+        entries[file] = entry;
+        st.dirty = true;
+        reindexed++;
+        seen.add(file);
+        out.push({
+            filePath: file,
+            relativePath,
+            title,
+            tokens: entry.tokens,
+            titleTokens: entry.titleTokens,
+            conflictCount: entry.conflictCount,
+            mtimeMs: entry.mtimeMs,
+        });
+    }
+
+    // Prune stale entries. We only prune when this looked like a (near-)full scan — i.e. we saw
+    // most of the index — so an agent-scoped query doesn't evict cache for out-of-scope files.
+    // (Falls back to a hard prune if the index has grown beyond MAX_INDEX_ENTRIES.)
+    const entryKeys = Object.keys(entries);
+    const looksFullScan = seen.size >= entryKeys.length * 0.8;
+    if (looksFullScan || entryKeys.length > MAX_INDEX_ENTRIES) {
+        for (const key of entryKeys) {
+            if (!seen.has(key)) {
+                delete entries[key];
+                st.dirty = true;
+            }
+        }
+    }
+
+    if (reindexed > 0) {
+        logInfo('Brain index updated.', { brainPath, files: files.length, reindexed, totalEntries: Object.keys(entries).length });
+    }
+    if (st.dirty) scheduleWrite(st, brainPath);
+    return out;
+}
+
+/**
+ * Discards the in-memory index state, cancelling any pending debounced write.
+ * With a `brainPath`, only that brain's state is discarded; without one, every brain's state is.
+ * The index file on disk is not touched.
+ */
+export function clearBrainTokenIndex(brainPath?: string): void {
+    if (brainPath !== undefined) {
+        const target = _states.get(brainPath);
+        if (target && target.writeTimer) clearTimeout(target.writeTimer);
+        _states.delete(brainPath);
+        return;
+    }
+    _states.forEach((state) => {
+        if (!state.writeTimer) return;
+        clearTimeout(state.writeTimer);
+        state.writeTimer = undefined;
+    });
+    _states.clear();
+}
@@ -19,11 +19,13 @@ import { findBrainFiles, summarizeText } from '../utils';
 import { isInside } from '../lib/paths';
 import { MemoryManager } from '../memory';
 import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
-import { tokenize, expandQuery, scoreTfIdf, extractBestExcerpt } from './scoring';
+import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
 import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
+import { getBrainTokenIndex } from './brainIndex';

-export { tokenize, expandQuery, scoreTfIdf, extractBestExcerpt } from './scoring';
+export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
 export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
+export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
 export * from './types';

 interface RetrievalOptions {
@@ -133,52 +135,48 @@ export class RetrievalOrchestrator {

            if (allFiles.length === 0) return [];

-            // Read all files for TF-IDF
-            const documents = allFiles.map((file) => {
+            // Tokenized docs from the persistent mtime-keyed index — unchanged files are not re-read
+            // or re-tokenized, so per-query work over a large brain drops from O(total content) to O(files) stats.
+            const indexed = getBrainTokenIndex(brain.localBrainPath, allFiles);
+            if (indexed.length === 0) return [];
+
+            const scored = scoreTfIdfPreTokenized(
+                expandedTokens,
+                indexed.map((d) => ({
+                    tokens: d.tokens,
+                    titleTokens: d.titleTokens,
+                    lastModified: d.mtimeMs,
+                    conflictCount: d.conflictCount,
+                }))
+            );
+
+            const topResults: RetrievalChunk[] = [];
+            for (const s of scored.filter((x) => x.score > 0).sort((a, b) => b.score - a.score).slice(0, limit)) {
+                const doc = indexed[s.index];
+                // Only the top `limit` files are actually read off disk (for excerpt extraction).
                let content = '';
-                let lastModified = 0;
-                try {
-                    content = fs.readFileSync(file, 'utf8');
-                    lastModified = fs.statSync(file).mtimeMs;
-                } catch { /* skip */ }
-                return {
-                    title: path.basename(file, '.md'),
-                    content,
-                    lastModified,
-                    filePath: file,
-                    relativePath: path.relative(brain.localBrainPath, file)
-                };
-            });
-
-            // TF-IDF scoring
-            const scored = scoreTfIdf(expandedTokens, documents);
-
-            return scored
-                .filter((s) => s.score > 0)
-                .sort((a, b) => b.score - a.score)
-                .slice(0, limit)
-                .map((s) => {
-                    const doc = documents[s.index];
-                    const excerpt = extractBestExcerpt(doc.content, expandedTokens, 400);
-                    return {
-                        id: `brain-${s.index}`,
-                        source: 'brain-memory' as const,
-                        title: doc.relativePath,
-                        content: summarizeText(excerpt, 400),
-                        score: s.score,
-                        tokenEstimate: estimateTokens(excerpt),
-                        metadata: {
-                            filePath: doc.filePath,
-                            category: this.inferCategory(doc.relativePath),
-                            isProjectEvidence: this.isProjectEvidence(doc.relativePath, doc.content),
-                            lastUpdated: doc.lastModified,
-                            // Phase 5: Scoring Intelligence Integration
-                            conflictDetected: s.conflictDetected,
-                            conflictSeverity: s.conflictSeverity,
-                            informationDensity: s.informationDensity
-                        }
-                    };
+                try { content = fs.readFileSync(doc.filePath, 'utf8'); } catch { /* deleted just now — skip */ continue; }
+                const excerpt = extractBestExcerpt(content, expandedTokens, 400);
+                topResults.push({
+                    id: `brain-${s.index}`,
+                    source: 'brain-memory' as const,
+                    title: doc.relativePath,
+                    content: summarizeText(excerpt, 400),
+                    score: s.score,
+                    tokenEstimate: estimateTokens(excerpt),
+                    metadata: {
+                        filePath: doc.filePath,
+                        category: this.inferCategory(doc.relativePath),
+                        isProjectEvidence: this.isProjectEvidence(doc.relativePath, content),
+                        lastUpdated: doc.mtimeMs,
+                        // Phase 5: Scoring Intelligence Integration
+                        conflictDetected: s.conflictDetected,
+                        conflictSeverity: s.conflictSeverity,
+                        informationDensity: s.informationDensity,
+                    },
                });
+            }
+            return topResults;
        } catch {
            return [];
        }
@@ -160,6 +160,30 @@ function inverseDocumentFrequency(

 export type ConflictSeverity = 'NONE' | 'LOW' | 'MEDIUM' | 'HIGH';

+/**
+ * Returns how many entries of SCORING_CONFIG.CONFLICT_INDICATORS occur in `rawText`.
+ * Matching is a case-insensitive substring test; an empty or missing text yields 0.
+ * Exported so the brain index can store the count per file rather than re-scanning content on each query.
+ */
+export function countConflictIndicators(rawText: string): number {
+    const haystack = (rawText || '').toLowerCase();
+    return [...SCORING_CONFIG.CONFLICT_INDICATORS]
+        .filter((indicator) => haystack.includes(indicator.toLowerCase()))
+        .length;
+}
+
+/** A document whose tokens were already computed (e.g. from the persistent brain index), so scoring can skip tokenization. */
+export interface PreTokenizedDoc {
+    /** tokenize(`${title} ${content}`) */
+    tokens: string[];
+    /** tokenize(title) */
+    titleTokens: string[];
+    /** Optional modification timestamp; the brain-index caller passes the file's mtimeMs. */
+    lastModified?: number;
+    /** result of countConflictIndicators(`${title} ${content}`); 0 if unknown */
+    conflictCount: number;
+}
+
 export interface ScoredDocument {
    index: number;
    score: number;
@@ -173,6 +197,8 @@ export interface ScoredDocument {

 /**
 * TF-IDF 기반으로 문서 집합을 스코어링합니다.
+ * 문서 내용을 받아 즉석에서 토크나이즈합니다 — 이미 토큰화된 집합이 있다면
+ * `scoreTfIdfPreTokenized` 를 직접 호출하면 토크나이즈를 건너뛸 수 있습니다.
 */
 export function scoreTfIdf(
    queryTokens: string[],
@@ -183,11 +209,28 @@ export function scoreTfIdf(
    }>
 ): ScoredDocument[] {
    if (documents.length === 0 || queryTokens.length === 0) return [];
+    return scoreTfIdfPreTokenized(queryTokens, documents.map((doc) => {
+        const combined = `${doc.title} ${doc.content}`;
+        return {
+            tokens: tokenize(combined),
+            titleTokens: tokenize(doc.title),
+            lastModified: doc.lastModified,
+            conflictCount: countConflictIndicators(combined),
+        };
+    }));
+}

-    // Pre-tokenize all documents
-    const docTokenArrays = documents.map((doc) =>
-        tokenize(`${doc.title} ${doc.content}`)
-    );
+/**
+ * TF-IDF 스코어링 — 이미 토큰화된 문서 집합 버전 (브레인 인덱스 등 캐시된 토큰을 그대로 사용).
+ * `scoreTfIdf` 와 동일한 알고리즘이며 출력 형태도 같습니다.
+ */
+export function scoreTfIdfPreTokenized(
+    queryTokens: string[],
+    documents: PreTokenizedDoc[]
+): ScoredDocument[] {
+    if (documents.length === 0 || queryTokens.length === 0) return [];
+
+    const docTokenArrays = documents.map((doc) => doc.tokens);
    const docTokenSets = docTokenArrays.map((tokens) => new Set(tokens));

    // Expand query with synonyms
@@ -205,22 +248,18 @@ export function scoreTfIdf(

    return documents.map((doc, index) => {
        const docTokens = docTokenArrays[index];
-        const titleTokens = new Set(tokenize(doc.title));
+        const titleTokens = new Set(doc.titleTokens);
        let score = 0;
        const matchedTerms: string[] = [];

-        // Conflict Detection & Severity Analysis (Substring based for better recall with particles)
-        const rawText = `${doc.title} ${doc.content}`.toLowerCase();
-        const conflictMatches = [...SCORING_CONFIG.CONFLICT_INDICATORS].filter(indicator => 
-            rawText.includes(indicator.toLowerCase())
-        );
-        
-        const conflictDetected = conflictMatches.length > 0;
+        // Conflict Detection & Severity Analysis (pre-counted by caller / index)
+        const conflictCount = doc.conflictCount || 0;
+        const conflictDetected = conflictCount > 0;
        let conflictSeverity: ConflictSeverity = 'NONE';

-        if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
-        else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
-        else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
+        if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
+        else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
+        else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';

        for (const term of expandedQuery) {
            const tf = termFrequency(term, docTokens);