chore: version up to 2.80.34 and package

This commit is contained in:
g1nation
2026-05-12 22:54:21 +09:00
parent 148bfb070b
commit 065e598cca
26 changed files with 2023 additions and 139 deletions
+220
View File
@@ -0,0 +1,220 @@
/**
* ============================================================
* Brain Index — persistent, mtime-keyed tokenized cache of the Second Brain
*
* RAG 검색은 매 질의마다 브레인의 모든 .md 파일을 읽고 토크나이즈해서 TF-IDF 점수를
* 계산했습니다 — 파일 수가 많아지면 그게 병목입니다.
*
* 이 모듈은 `<brainPath>/.astra/brain-index.json` 에 파일별 토큰 배열을 (mtime+size 키로)
* 저장해 두고, 다음 질의에서는 *변경된 파일만* 다시 읽어 토크나이즈합니다. 나머지는 디스크/메모리
* 캐시에서 그대로 가져옵니다. 디스크 쓰기는 디바운스되고 실패해도 in-memory 로만 동작합니다.
* ============================================================
*/
import * as fs from 'fs';
import * as path from 'path';
import { tokenize, countConflictIndicators } from './scoring';
import { logInfo } from '../utils';
/** Schema version of the persisted index; a loaded file whose version differs is discarded. */
const INDEX_VERSION = 2;
/** Directory (relative to the brain root) that holds the index file. */
const INDEX_DIR = '.astra';
/** File name of the persisted index inside INDEX_DIR. */
const INDEX_FILE = 'brain-index.json';
/** Once the index holds more entries than this, entries not seen in the current scan are pruned (keeps entries for deleted files from piling up). */
const MAX_INDEX_ENTRIES = 12000;
/** Debounce delay, in milliseconds, for writing the index to disk. */
const WRITE_DEBOUNCE_MS = 1500;
/**
 * Cached tokenization of a single brain file.
 * An entry is served from the cache only while the file's current mtime and size
 * still equal the values recorded here; otherwise the file is re-read and re-tokenized.
 */
interface IndexEntry {
mtimeMs: number; // stat.mtimeMs of the file when this entry was built
size: number; // stat.size of the file when this entry was built
title: string; // basename without .md
relativePath: string; // relative to brainPath
tokens: string[]; // tokenize(`${title} ${content}`)
titleTokens: string[]; // tokenize(title)
conflictCount: number; // countConflictIndicators(`${title} ${content}`)
}
/** Shape of the index as serialized to the JSON index file and as held in memory. */
interface PersistedIndex {
version: number; // compared against INDEX_VERSION when the file is loaded
entries: Record<string, IndexEntry>; // keyed by absolute file path
}
/** One tokenized brain document, as returned by getBrainTokenIndex. */
export interface IndexedBrainDoc {
filePath: string; // the path exactly as it appeared in the `files` argument
relativePath: string; // path relative to brainPath
title: string; // basename without .md
tokens: string[]; // tokenize(`${title} ${content}`)
titleTokens: string[]; // tokenize(title)
conflictCount: number; // cached countConflictIndicators result (0 when the cached value is missing)
mtimeMs: number; // file mtime recorded when the document was (re)indexed
}
/** In-memory state kept per brain: the index itself plus bookkeeping for the debounced disk write. */
interface BrainState {
index: PersistedIndex;
dirty: boolean; // true while `index` holds changes that have not been written to disk
diskPath: string | null; // null if we can't determine a writable path
writeTimer?: ReturnType<typeof setTimeout>; // handle of the pending debounced write, if one is scheduled
}
/** Loaded brain states for this process, keyed by the brainPath string exactly as callers pass it. */
const _states = new Map<string, BrainState>();
/** Builds the path of the persisted index file that belongs to the brain rooted at `brainPath`. */
function indexFileFor(brainPath: string): string {
  const cacheDir = path.join(brainPath, INDEX_DIR);
  return path.join(cacheDir, INDEX_FILE);
}
/**
 * Type guard for one persisted index entry.
 * Checks exactly the fields that the cache-hit path hands back to callers; `conflictCount`
 * is deliberately not required because readers already default a missing value to 0.
 */
function isIndexEntry(value: unknown): value is IndexEntry {
  if (value === null || typeof value !== 'object') return false;
  const e = value as Partial<Record<keyof IndexEntry, unknown>>;
  return (
    typeof e.mtimeMs === 'number' &&
    typeof e.size === 'number' &&
    typeof e.title === 'string' &&
    typeof e.relativePath === 'string' &&
    Array.isArray(e.tokens) &&
    Array.isArray(e.titleTokens)
  );
}

/**
 * Returns the in-memory state for `brainPath`, loading the persisted index from disk the
 * first time a given brain is requested.
 *
 * The index file is treated as untrusted input:
 *  - a missing file, unparsable JSON, a different `version`, or an `entries` value that is not
 *    a plain object all result in an empty index (the cache is simply rebuilt);
 *  - individual entries that do not have the expected shape are dropped so they get re-indexed
 *    instead of being served to callers with missing fields.
 *
 * Never throws for I/O or parse problems; those are logged and an empty index is used.
 *
 * @param brainPath Root directory of the brain; also the key under which the state is cached.
 * @returns The cached or freshly created state (also stored in `_states`).
 */
function loadState(brainPath: string): BrainState {
  const existing = _states.get(brainPath);
  if (existing) return existing;

  let index: PersistedIndex = { version: INDEX_VERSION, entries: {} };
  let diskPath: string | null = null;
  try {
    diskPath = indexFileFor(brainPath);
    if (fs.existsSync(diskPath)) {
      const raw: unknown = JSON.parse(fs.readFileSync(diskPath, 'utf8'));
      const root =
        raw !== null && typeof raw === 'object'
          ? (raw as { version?: unknown; entries?: unknown })
          : undefined;
      const rawEntries = root?.entries;
      // Arrays are rejected explicitly: `typeof [] === 'object'`, but string-keyed properties
      // added to an array are not serialized by JSON.stringify, so the index would never persist.
      if (
        root?.version === INDEX_VERSION &&
        rawEntries !== null &&
        typeof rawEntries === 'object' &&
        !Array.isArray(rawEntries)
      ) {
        const entries = rawEntries as Record<string, unknown>;
        let dropped = 0;
        // Validate in place (no copy of a potentially large map); malformed entries are removed.
        for (const key of Object.keys(entries)) {
          if (!isIndexEntry(entries[key])) {
            delete entries[key];
            dropped++;
          }
        }
        index = { version: INDEX_VERSION, entries: entries as Record<string, IndexEntry> };
        if (dropped > 0) {
          logInfo('Brain index contained malformed entries — they will be re-indexed.', { brainPath, dropped });
        }
      } else {
        logInfo('Brain index is stale/unrecognized — rebuilding.', { brainPath });
      }
    }
  } catch (e: unknown) {
    const message = (e instanceof Error && e.message) || String(e);
    logInfo('Brain index load failed — starting fresh.', { brainPath, error: message });
    index = { version: INDEX_VERSION, entries: {} };
  }

  const st: BrainState = { index, dirty: false, diskPath };
  _states.set(brainPath, st);
  return st;
}
/**
 * Arranges for the index of `st` to be written to disk after WRITE_DEBOUNCE_MS.
 *
 * Does nothing when there is nothing to write (`dirty` is false), when no disk path is known,
 * or when a write is already pending. The write goes to a `.tmp` sibling first and is then
 * renamed over the real file. A failed write is logged and leaves `dirty` set.
 *
 * @param st        State whose index should be persisted.
 * @param brainPath Brain root; used only for log context.
 */
function scheduleWrite(st: BrainState, brainPath: string): void {
  const nothingToWrite = !st.dirty || !st.diskPath;
  if (nothingToWrite || st.writeTimer) return;

  const flush = (): void => {
    // The timer has fired, so allow a later call to schedule a new one.
    st.writeTimer = undefined;
    const target = st.diskPath;
    if (!st.dirty || !target) return;
    try {
      const cacheDir = path.dirname(target);
      fs.mkdirSync(cacheDir, { recursive: true });

      // One-time .gitignore so the cache dir never gets committed into a Second Brain git repo.
      const ignoreFile = path.join(cacheDir, '.gitignore');
      if (!fs.existsSync(ignoreFile)) {
        try {
          fs.writeFileSync(ignoreFile, '*\n', 'utf8');
        } catch {
          /* non-fatal */
        }
      }

      const scratch = `${target}.tmp`;
      fs.writeFileSync(scratch, JSON.stringify(st.index), 'utf8');
      fs.renameSync(scratch, target);
      st.dirty = false;
    } catch (e: any) {
      logInfo('Brain index write failed (continuing in-memory only).', { brainPath, error: e?.message || String(e) });
    }
  };

  const handle = setTimeout(flush, WRITE_DEBOUNCE_MS);
  // Where the timer handle supports it, unref() it so the pending write does not keep the process alive.
  if (typeof (handle as any).unref === 'function') (handle as any).unref();
  st.writeTimer = handle;
}
/**
 * Produces the tokenized form of every file in `files` (absolute paths inside the brain,
 * already scoped/filtered by the caller).
 *
 * A file whose cached entry still matches its current mtime and size is answered from the
 * index; any other file is read, tokenized, and stored in the index, which is then flushed
 * to disk via a debounced write. Files that cannot be stat'ed or read are left out of the result.
 *
 * Passing a falsy `brainPath`, a non-array, or an empty list is safe and yields [].
 *
 * @param brainPath Root directory of the brain.
 * @param files     Absolute paths of the brain files to return.
 * @returns One document per readable file, in the order of `files`.
 */
export function getBrainTokenIndex(brainPath: string, files: string[]): IndexedBrainDoc[] {
  const hasWork = Boolean(brainPath) && Array.isArray(files) && files.length > 0;
  if (!hasWork) return [];

  const state = loadState(brainPath);
  const entries = state.index.entries;
  const docs: IndexedBrainDoc[] = [];
  const visited = new Set<string>();
  let reindexed = 0;

  for (const file of files) {
    visited.add(file);

    let stat: fs.Stats;
    try {
      stat = fs.statSync(file);
    } catch {
      continue; // listed but gone now — skip silently
    }

    const cached = entries[file];
    let entry: IndexEntry;
    let conflictCount: number;

    if (
      cached &&
      cached.mtimeMs === stat.mtimeMs &&
      cached.size === stat.size &&
      Array.isArray(cached.tokens) &&
      Array.isArray(cached.titleTokens)
    ) {
      // Cache hit: reuse the stored tokens untouched.
      entry = cached;
      conflictCount = cached.conflictCount || 0;
    } else {
      // Cache miss or stale entry: read and tokenize the file again.
      let content: string;
      try {
        content = fs.readFileSync(file, 'utf8');
      } catch {
        continue;
      }
      const relativePath = path.relative(brainPath, file);
      const title = path.basename(file, '.md');
      const combined = `${title} ${content}`;
      entry = {
        mtimeMs: stat.mtimeMs,
        size: stat.size,
        title,
        relativePath,
        tokens: tokenize(combined),
        titleTokens: tokenize(title),
        conflictCount: countConflictIndicators(combined),
      };
      entries[file] = entry;
      state.dirty = true;
      reindexed += 1;
      conflictCount = entry.conflictCount;
    }

    docs.push({
      filePath: file,
      relativePath: entry.relativePath,
      title: entry.title,
      tokens: entry.tokens,
      titleTokens: entry.titleTokens,
      conflictCount,
      mtimeMs: entry.mtimeMs,
    });
  }

  // Pruning of entries that were not part of this call. It only happens when the call covered
  // at least 80% as many paths as the index holds (treated as a near-full scan, so a narrowly
  // scoped query does not evict out-of-scope files), or unconditionally once the index has
  // grown past MAX_INDEX_ENTRIES.
  const knownPaths = Object.keys(entries);
  const nearFullScan = visited.size >= knownPaths.length * 0.8;
  const overCapacity = knownPaths.length > MAX_INDEX_ENTRIES;
  if (nearFullScan || overCapacity) {
    knownPaths
      .filter((known) => !visited.has(known))
      .forEach((stale) => {
        delete entries[stale];
        state.dirty = true;
      });
  }

  if (reindexed > 0) {
    logInfo('Brain index updated.', { brainPath, files: files.length, reindexed, totalEntries: Object.keys(state.index.entries).length });
  }
  if (state.dirty) scheduleWrite(state, brainPath);
  return docs;
}
/**
 * Discards the in-memory index, cancelling any pending debounced write.
 * With a `brainPath` only that brain is discarded; without one, every loaded brain is.
 * The index file on disk is not touched.
 */
export function clearBrainTokenIndex(brainPath?: string): void {
  if (brainPath !== undefined) {
    const state = _states.get(brainPath);
    if (state?.writeTimer) clearTimeout(state.writeTimer);
    _states.delete(brainPath);
    return;
  }

  _states.forEach((state) => {
    if (!state.writeTimer) return;
    clearTimeout(state.writeTimer);
    state.writeTimer = undefined;
  });
  _states.clear();
}
+44 -46
View File
@@ -19,11 +19,13 @@ import { findBrainFiles, summarizeText } from '../utils';
import { isInside } from '../lib/paths';
import { MemoryManager } from '../memory';
import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
import { tokenize, expandQuery, scoreTfIdf, extractBestExcerpt } from './scoring';
import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
import { getBrainTokenIndex } from './brainIndex';
export { tokenize, expandQuery, scoreTfIdf, extractBestExcerpt } from './scoring';
export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
export * from './types';
interface RetrievalOptions {
@@ -133,52 +135,48 @@ export class RetrievalOrchestrator {
if (allFiles.length === 0) return [];
// Read all files for TF-IDF
const documents = allFiles.map((file) => {
// Tokenized docs from the persistent mtime-keyed index — unchanged files are not re-read
// or re-tokenized, so per-query work over a large brain drops from O(total content) to O(files) stats.
const indexed = getBrainTokenIndex(brain.localBrainPath, allFiles);
if (indexed.length === 0) return [];
const scored = scoreTfIdfPreTokenized(
expandedTokens,
indexed.map((d) => ({
tokens: d.tokens,
titleTokens: d.titleTokens,
lastModified: d.mtimeMs,
conflictCount: d.conflictCount,
}))
);
const topResults: RetrievalChunk[] = [];
for (const s of scored.filter((x) => x.score > 0).sort((a, b) => b.score - a.score).slice(0, limit)) {
const doc = indexed[s.index];
// Only the top `limit` files are actually read off disk (for excerpt extraction).
let content = '';
let lastModified = 0;
try {
content = fs.readFileSync(file, 'utf8');
lastModified = fs.statSync(file).mtimeMs;
} catch { /* skip */ }
return {
title: path.basename(file, '.md'),
content,
lastModified,
filePath: file,
relativePath: path.relative(brain.localBrainPath, file)
};
});
// TF-IDF scoring
const scored = scoreTfIdf(expandedTokens, documents);
return scored
.filter((s) => s.score > 0)
.sort((a, b) => b.score - a.score)
.slice(0, limit)
.map((s) => {
const doc = documents[s.index];
const excerpt = extractBestExcerpt(doc.content, expandedTokens, 400);
return {
id: `brain-${s.index}`,
source: 'brain-memory' as const,
title: doc.relativePath,
content: summarizeText(excerpt, 400),
score: s.score,
tokenEstimate: estimateTokens(excerpt),
metadata: {
filePath: doc.filePath,
category: this.inferCategory(doc.relativePath),
isProjectEvidence: this.isProjectEvidence(doc.relativePath, doc.content),
lastUpdated: doc.lastModified,
// Phase 5: Scoring Intelligence Integration
conflictDetected: s.conflictDetected,
conflictSeverity: s.conflictSeverity,
informationDensity: s.informationDensity
}
};
try { content = fs.readFileSync(doc.filePath, 'utf8'); } catch { /* deleted just now — skip */ continue; }
const excerpt = extractBestExcerpt(content, expandedTokens, 400);
topResults.push({
id: `brain-${s.index}`,
source: 'brain-memory' as const,
title: doc.relativePath,
content: summarizeText(excerpt, 400),
score: s.score,
tokenEstimate: estimateTokens(excerpt),
metadata: {
filePath: doc.filePath,
category: this.inferCategory(doc.relativePath),
isProjectEvidence: this.isProjectEvidence(doc.relativePath, content),
lastUpdated: doc.mtimeMs,
// Phase 5: Scoring Intelligence Integration
conflictDetected: s.conflictDetected,
conflictSeverity: s.conflictSeverity,
informationDensity: s.informationDensity,
},
});
}
return topResults;
} catch {
return [];
}
+54 -15
View File
@@ -160,6 +160,30 @@ function inverseDocumentFrequency(
export type ConflictSeverity = 'NONE' | 'LOW' | 'MEDIUM' | 'HIGH';
/**
 * Returns how many entries of SCORING_CONFIG.CONFLICT_INDICATORS occur in `rawText`.
 * Each indicator is matched as a case-insensitive substring and contributes at most 1,
 * however often it appears. A falsy `rawText` is treated as the empty string.
 * Exported so the brain index can compute this once per file and cache it instead of
 * re-scanning content on every query.
 */
export function countConflictIndicators(rawText: string): number {
  const haystack = (rawText || '').toLowerCase();
  const present = [...SCORING_CONFIG.CONFLICT_INDICATORS].filter((indicator) =>
    haystack.includes(indicator.toLowerCase())
  );
  return present.length;
}
/**
 * A document whose tokens were already computed (e.g. from the persistent brain index).
 * This is the input element type of scoreTfIdfPreTokenized.
 */
export interface PreTokenizedDoc {
/** tokenize(`${title} ${content}`) */
tokens: string[];
/** tokenize(title) */
titleTokens: string[];
/** Optional last-modified timestamp — presumably epoch milliseconds (file mtime); confirm against callers. */
lastModified?: number;
/** result of countConflictIndicators(`${title} ${content}`); 0 if unknown */
conflictCount: number;
}
export interface ScoredDocument {
index: number;
score: number;
@@ -173,6 +197,8 @@ export interface ScoredDocument {
/**
* TF-IDF 기반으로 문서 집합을 스코어링합니다.
* 문서 내용을 받아 즉석에서 토크나이즈합니다 — 이미 토큰화된 집합이 있다면
* `scoreTfIdfPreTokenized` 를 직접 호출하면 토크나이즈를 건너뛸 수 있습니다.
*/
export function scoreTfIdf(
queryTokens: string[],
@@ -183,11 +209,28 @@ export function scoreTfIdf(
}>
): ScoredDocument[] {
if (documents.length === 0 || queryTokens.length === 0) return [];
return scoreTfIdfPreTokenized(queryTokens, documents.map((doc) => {
const combined = `${doc.title} ${doc.content}`;
return {
tokens: tokenize(combined),
titleTokens: tokenize(doc.title),
lastModified: doc.lastModified,
conflictCount: countConflictIndicators(combined),
};
}));
}
// Pre-tokenize all documents
const docTokenArrays = documents.map((doc) =>
tokenize(`${doc.title} ${doc.content}`)
);
/**
* TF-IDF 스코어링 — 이미 토큰화된 문서 집합 버전 (브레인 인덱스 등 캐시된 토큰을 그대로 사용).
* `scoreTfIdf` 와 동일한 알고리즘이며 출력 형태도 같습니다.
*/
export function scoreTfIdfPreTokenized(
queryTokens: string[],
documents: PreTokenizedDoc[]
): ScoredDocument[] {
if (documents.length === 0 || queryTokens.length === 0) return [];
const docTokenArrays = documents.map((doc) => doc.tokens);
const docTokenSets = docTokenArrays.map((tokens) => new Set(tokens));
// Expand query with synonyms
@@ -205,22 +248,18 @@ export function scoreTfIdf(
return documents.map((doc, index) => {
const docTokens = docTokenArrays[index];
const titleTokens = new Set(tokenize(doc.title));
const titleTokens = new Set(doc.titleTokens);
let score = 0;
const matchedTerms: string[] = [];
// Conflict Detection & Severity Analysis (Substring based for better recall with particles)
const rawText = `${doc.title} ${doc.content}`.toLowerCase();
const conflictMatches = [...SCORING_CONFIG.CONFLICT_INDICATORS].filter(indicator =>
rawText.includes(indicator.toLowerCase())
);
const conflictDetected = conflictMatches.length > 0;
// Conflict Detection & Severity Analysis (pre-counted by caller / index)
const conflictCount = doc.conflictCount || 0;
const conflictDetected = conflictCount > 0;
let conflictSeverity: ConflictSeverity = 'NONE';
if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
for (const term of expandedQuery) {
const tf = termFrequency(term, docTokens);