chore: version up to 2.80.34 and package
This commit is contained in:
@@ -0,0 +1,220 @@
|
||||
/**
|
||||
* ============================================================
|
||||
* Brain Index — persistent, mtime-keyed tokenized cache of the Second Brain
|
||||
*
|
||||
* RAG 검색은 매 질의마다 브레인의 모든 .md 파일을 읽고 토크나이즈해서 TF-IDF 점수를
|
||||
* 계산했습니다 — 파일 수가 많아지면 그게 병목입니다.
|
||||
*
|
||||
* 이 모듈은 `<brainPath>/.astra/brain-index.json` 에 파일별 토큰 배열을 (mtime+size 키로)
|
||||
* 저장해 두고, 다음 질의에서는 *변경된 파일만* 다시 읽어 토크나이즈합니다. 나머지는 디스크/메모리
|
||||
* 캐시에서 그대로 가져옵니다. 디스크 쓰기는 디바운스되고 실패해도 in-memory 로만 동작합니다.
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { tokenize, countConflictIndicators } from './scoring';
|
||||
import { logInfo } from '../utils';
|
||||
|
||||
/** Schema version of the persisted index; a file carrying any other version is ignored on load. */
const INDEX_VERSION = 2;

/** Directory, relative to the brain root, that holds the cache file. */
const INDEX_DIR = '.astra';

/** Name of the persisted index file inside `INDEX_DIR`. */
const INDEX_FILE = 'brain-index.json';

/**
 * Once the index holds more entries than this, entries not seen in the current scan are
 * pruned (keeps entries of deleted files from accumulating).
 */
const MAX_INDEX_ENTRIES = 12_000;

/** Debounce delay, in milliseconds, for writing the index to disk. */
const WRITE_DEBOUNCE_MS = 1_500;
|
||||
|
||||
/** Cached per-file record kept in the index. */
interface IndexEntry {
  /** `mtimeMs` of the file's stat at indexing time; compared on lookup to detect changes. */
  mtimeMs: number;

  /** `size` of the file's stat at indexing time; compared on lookup together with `mtimeMs`. */
  size: number;

  /** File basename without the `.md` extension. */
  title: string;

  /** File path relative to the brain root. */
  relativePath: string;

  /** Result of tokenizing the title followed by a space and the file content. */
  tokens: string[];

  /** Result of tokenizing the title alone. */
  titleTokens: string[];

  /** Result of `countConflictIndicators` over the same title-plus-content text used for `tokens`. */
  conflictCount: number;
}
|
||||
|
||||
/** Shape of the whole index as held in memory and serialized to the index file. */
interface PersistedIndex {
  /** Schema version; checked against `INDEX_VERSION` when the file is loaded. */
  version: number;

  /** Cached entries, keyed by absolute file path. */
  entries: { [absolutePath: string]: IndexEntry };
}
|
||||
|
||||
/** Tokenized view of one brain file, as returned by `getBrainTokenIndex`. */
export interface IndexedBrainDoc {
  /** Path of the file exactly as it was passed in by the caller. */
  filePath: string;

  /** Path of the file relative to the brain root. */
  relativePath: string;

  /** File basename without the `.md` extension. */
  title: string;

  /** Tokens of the title plus the file content. */
  tokens: string[];

  /** Tokens of the title alone. */
  titleTokens: string[];

  /** Number of conflict indicators counted in the title plus content. */
  conflictCount: number;

  /** Modification time (ms) recorded for the indexed version of the file. */
  mtimeMs: number;
}
|
||||
|
||||
/** In-memory bookkeeping for one brain's index. */
interface BrainState {
  /** The index content itself. */
  index: PersistedIndex;

  /** True when `index` holds changes that have not been written to disk yet. */
  dirty: boolean;

  /** Location of the index file; null if a writable path could not be determined. */
  diskPath: string | null;

  /** Handle of the pending debounced write, if one is scheduled. */
  writeTimer?: ReturnType<typeof setTimeout>;
}
|
||||
|
||||
/** Per-brain state, keyed by the `brainPath` string supplied by callers. */
const _states: Map<string, BrainState> = new Map();
|
||||
|
||||
/**
 * Builds the path of the index file that belongs to a brain.
 *
 * @param brainPath Root directory of the brain.
 * @returns `brainPath` joined with the cache directory and the index file name.
 */
function indexFileFor(brainPath: string): string {
  const segments = [brainPath, INDEX_DIR, INDEX_FILE];
  return path.join(...segments);
}
|
||||
|
||||
/**
 * Returns the in-memory state for `brainPath`, creating and registering it on first use.
 *
 * On first use the persisted index file is read if it exists. It is adopted only when it
 * carries the current `INDEX_VERSION` and `entries` is a plain (non-array) object; otherwise
 * an empty index is used. Any read or parse failure is logged and also results in an empty
 * index — this function never throws for a bad index file.
 *
 * @param brainPath Root directory of the brain; also the key under which the state is cached.
 * @returns The cached or newly created state.
 */
function loadState(brainPath: string): BrainState {
  const existing = _states.get(brainPath);
  if (existing) return existing;

  let index: PersistedIndex = { version: INDEX_VERSION, entries: {} };
  let diskPath: string | null = null;
  try {
    diskPath = indexFileFor(brainPath);
    if (fs.existsSync(diskPath)) {
      const raw: unknown = JSON.parse(fs.readFileSync(diskPath, 'utf8'));
      const candidate = raw as Partial<PersistedIndex> | null;
      const entries = candidate?.entries;
      // Arrays are rejected explicitly: `typeof [] === 'object'`, but string-keyed entries
      // stored on an array are dropped by JSON.stringify, so the cache would never persist.
      const usable =
        !!candidate
        && candidate.version === INDEX_VERSION
        && !!entries
        && typeof entries === 'object'
        && !Array.isArray(entries);
      if (usable) {
        index = candidate as PersistedIndex;
      } else {
        logInfo('Brain index is stale/unrecognized — rebuilding.', { brainPath });
      }
    }
  } catch (e: unknown) {
    const message = e instanceof Error && e.message ? e.message : String(e);
    logInfo('Brain index load failed — starting fresh.', { brainPath, error: message });
    index = { version: INDEX_VERSION, entries: {} };
  }

  const st: BrainState = { index, dirty: false, diskPath };
  _states.set(brainPath, st);
  return st;
}
|
||||
|
||||
/**
 * Schedules a debounced write of the index to disk.
 *
 * Does nothing when the state is clean, has no disk path, or already has a write pending.
 * When the timer fires, the cache directory is created if needed, a `.gitignore` containing
 * `*` is added if absent (so the cache dir is not committed into a brain git repo), and the
 * index is written to a `.tmp` sibling and then renamed over the real file. A failure is
 * logged and leaves the state dirty, so the index keeps working in memory.
 *
 * @param st State whose index should be persisted.
 * @param brainPath Brain root, used only as context in the failure log entry.
 */
function scheduleWrite(st: BrainState, brainPath: string): void {
  const nothingToWrite = !(st.dirty && st.diskPath);
  if (nothingToWrite || st.writeTimer) return;

  const flush = (): void => {
    st.writeTimer = undefined;
    const target = st.diskPath;
    // Re-check: the state may have been flushed or lost its path since scheduling.
    if (!st.dirty || !target) return;
    try {
      const cacheDir = path.dirname(target);
      fs.mkdirSync(cacheDir, { recursive: true });

      const ignoreFile = path.join(cacheDir, '.gitignore');
      if (!fs.existsSync(ignoreFile)) {
        try {
          fs.writeFileSync(ignoreFile, '*\n', 'utf8');
        } catch {
          /* non-fatal */
        }
      }

      // Write to a staging file first, then rename it over the real index file.
      const staging = `${target}.tmp`;
      const payload = JSON.stringify(st.index);
      fs.writeFileSync(staging, payload, 'utf8');
      fs.renameSync(staging, target);
      st.dirty = false;
    } catch (e: any) {
      logInfo('Brain index write failed (continuing in-memory only).', { brainPath, error: e?.message || String(e) });
    }
  };

  const handle = setTimeout(flush, WRITE_DEBOUNCE_MS);
  // Call unref() when the timer handle provides it.
  if (typeof (handle as any).unref === 'function') {
    (handle as any).unref();
  }
  st.writeTimer = handle;
}
|
||||
|
||||
/**
 * Returns tokenized representations for `files` (absolute brain-file paths, already
 * scoped/filtered by the caller). Files whose cached `mtimeMs` and `size` still match the
 * on-disk stat are served from the index; new or changed files are read, tokenized and
 * stored, and a debounced disk write is scheduled whenever the index changed.
 *
 * Files that cannot be stat'ed or read are left out of the result, and any cache entry they
 * still had is evicted so a vanished file cannot linger in the index.
 *
 * @param brainPath Root directory of the brain; selects the per-brain state and is the base for `relativePath`.
 * @param files Paths to index; each path is also used as its index key.
 * @returns One document per readable file, in input order. Empty when `brainPath` is falsy
 *          or `files` is empty or not an array.
 */
export function getBrainTokenIndex(brainPath: string, files: string[]): IndexedBrainDoc[] {
  if (!brainPath || !Array.isArray(files) || files.length === 0) return [];

  const st = loadState(brainPath);
  const entries = st.index.entries;
  const out: IndexedBrainDoc[] = [];
  const seen = new Set<string>();
  let reindexed = 0;

  // A listed file is always in `seen`, so the prune pass below would never remove its entry.
  // Evict explicitly when the file turns out to be gone or unreadable.
  const evict = (file: string): void => {
    if (entries[file]) {
      delete entries[file];
      st.dirty = true;
    }
  };

  for (const file of files) {
    seen.add(file);

    let stat: fs.Stats;
    try {
      stat = fs.statSync(file);
    } catch {
      evict(file); // listed but gone now — skip, and drop its stale cache entry
      continue;
    }

    const cached = entries[file];
    if (cached
      && cached.mtimeMs === stat.mtimeMs
      && cached.size === stat.size
      && Array.isArray(cached.tokens)
      && Array.isArray(cached.titleTokens)) {
      out.push({
        filePath: file,
        relativePath: cached.relativePath,
        title: cached.title,
        tokens: cached.tokens,
        titleTokens: cached.titleTokens,
        conflictCount: cached.conflictCount || 0,
        mtimeMs: cached.mtimeMs,
      });
      continue;
    }

    // (Re)index this file.
    let content: string;
    try {
      content = fs.readFileSync(file, 'utf8');
    } catch {
      evict(file); // whatever is cached no longer matches the file on disk
      continue;
    }

    const relativePath = path.relative(brainPath, file);
    const title = path.basename(file, '.md');
    const combined = `${title} ${content}`;
    const entry: IndexEntry = {
      mtimeMs: stat.mtimeMs,
      size: stat.size,
      title,
      relativePath,
      tokens: tokenize(combined),
      titleTokens: tokenize(title),
      conflictCount: countConflictIndicators(combined),
    };
    entries[file] = entry;
    st.dirty = true;
    reindexed++;

    out.push({
      filePath: file,
      relativePath,
      title,
      tokens: entry.tokens,
      titleTokens: entry.titleTokens,
      conflictCount: entry.conflictCount,
      mtimeMs: entry.mtimeMs,
    });
  }

  // Prune stale entries. We only prune when this looked like a (near-)full scan — i.e. we saw
  // most of the index — so an agent-scoped query doesn't evict cache for out-of-scope files.
  // (Falls back to a hard prune if the index has grown beyond MAX_INDEX_ENTRIES.)
  const entryKeys = Object.keys(entries);
  const looksFullScan = seen.size >= entryKeys.length * 0.8;
  if (looksFullScan || entryKeys.length > MAX_INDEX_ENTRIES) {
    for (const key of entryKeys) {
      if (!seen.has(key)) {
        delete entries[key];
        st.dirty = true;
      }
    }
  }

  if (reindexed > 0) {
    logInfo('Brain index updated.', { brainPath, files: files.length, reindexed, totalEntries: Object.keys(entries).length });
  }
  if (st.dirty) scheduleWrite(st, brainPath);
  return out;
}
|
||||
|
||||
/**
 * Drops the in-memory index for one brain, or for every brain when called without an
 * argument. Any pending debounced write is cancelled. The index file on disk is not touched.
 *
 * @param brainPath Brain whose state should be dropped; omit to drop all states.
 */
export function clearBrainTokenIndex(brainPath?: string): void {
  if (brainPath !== undefined) {
    const target = _states.get(brainPath);
    if (target && target.writeTimer) clearTimeout(target.writeTimer);
    _states.delete(brainPath);
    return;
  }

  _states.forEach((state) => {
    if (!state.writeTimer) return;
    clearTimeout(state.writeTimer);
    state.writeTimer = undefined;
  });
  _states.clear();
}
|
||||
+44
-46
@@ -19,11 +19,13 @@ import { findBrainFiles, summarizeText } from '../utils';
|
||||
import { isInside } from '../lib/paths';
|
||||
import { MemoryManager } from '../memory';
|
||||
import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
|
||||
import { tokenize, expandQuery, scoreTfIdf, extractBestExcerpt } from './scoring';
|
||||
import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
|
||||
import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
|
||||
import { getBrainTokenIndex } from './brainIndex';
|
||||
|
||||
export { tokenize, expandQuery, scoreTfIdf, extractBestExcerpt } from './scoring';
|
||||
export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
|
||||
export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
|
||||
export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
|
||||
export * from './types';
|
||||
|
||||
interface RetrievalOptions {
|
||||
@@ -133,52 +135,48 @@ export class RetrievalOrchestrator {
|
||||
|
||||
if (allFiles.length === 0) return [];
|
||||
|
||||
// Read all files for TF-IDF
|
||||
const documents = allFiles.map((file) => {
|
||||
// Tokenized docs from the persistent mtime-keyed index — unchanged files are not re-read
|
||||
// or re-tokenized, so per-query work over a large brain drops from O(total content) to O(files) stats.
|
||||
const indexed = getBrainTokenIndex(brain.localBrainPath, allFiles);
|
||||
if (indexed.length === 0) return [];
|
||||
|
||||
const scored = scoreTfIdfPreTokenized(
|
||||
expandedTokens,
|
||||
indexed.map((d) => ({
|
||||
tokens: d.tokens,
|
||||
titleTokens: d.titleTokens,
|
||||
lastModified: d.mtimeMs,
|
||||
conflictCount: d.conflictCount,
|
||||
}))
|
||||
);
|
||||
|
||||
const topResults: RetrievalChunk[] = [];
|
||||
for (const s of scored.filter((x) => x.score > 0).sort((a, b) => b.score - a.score).slice(0, limit)) {
|
||||
const doc = indexed[s.index];
|
||||
// Only the top `limit` files are actually read off disk (for excerpt extraction).
|
||||
let content = '';
|
||||
let lastModified = 0;
|
||||
try {
|
||||
content = fs.readFileSync(file, 'utf8');
|
||||
lastModified = fs.statSync(file).mtimeMs;
|
||||
} catch { /* skip */ }
|
||||
return {
|
||||
title: path.basename(file, '.md'),
|
||||
content,
|
||||
lastModified,
|
||||
filePath: file,
|
||||
relativePath: path.relative(brain.localBrainPath, file)
|
||||
};
|
||||
});
|
||||
|
||||
// TF-IDF scoring
|
||||
const scored = scoreTfIdf(expandedTokens, documents);
|
||||
|
||||
return scored
|
||||
.filter((s) => s.score > 0)
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, limit)
|
||||
.map((s) => {
|
||||
const doc = documents[s.index];
|
||||
const excerpt = extractBestExcerpt(doc.content, expandedTokens, 400);
|
||||
return {
|
||||
id: `brain-${s.index}`,
|
||||
source: 'brain-memory' as const,
|
||||
title: doc.relativePath,
|
||||
content: summarizeText(excerpt, 400),
|
||||
score: s.score,
|
||||
tokenEstimate: estimateTokens(excerpt),
|
||||
metadata: {
|
||||
filePath: doc.filePath,
|
||||
category: this.inferCategory(doc.relativePath),
|
||||
isProjectEvidence: this.isProjectEvidence(doc.relativePath, doc.content),
|
||||
lastUpdated: doc.lastModified,
|
||||
// Phase 5: Scoring Intelligence Integration
|
||||
conflictDetected: s.conflictDetected,
|
||||
conflictSeverity: s.conflictSeverity,
|
||||
informationDensity: s.informationDensity
|
||||
}
|
||||
};
|
||||
try { content = fs.readFileSync(doc.filePath, 'utf8'); } catch { /* deleted just now — skip */ continue; }
|
||||
const excerpt = extractBestExcerpt(content, expandedTokens, 400);
|
||||
topResults.push({
|
||||
id: `brain-${s.index}`,
|
||||
source: 'brain-memory' as const,
|
||||
title: doc.relativePath,
|
||||
content: summarizeText(excerpt, 400),
|
||||
score: s.score,
|
||||
tokenEstimate: estimateTokens(excerpt),
|
||||
metadata: {
|
||||
filePath: doc.filePath,
|
||||
category: this.inferCategory(doc.relativePath),
|
||||
isProjectEvidence: this.isProjectEvidence(doc.relativePath, content),
|
||||
lastUpdated: doc.mtimeMs,
|
||||
// Phase 5: Scoring Intelligence Integration
|
||||
conflictDetected: s.conflictDetected,
|
||||
conflictSeverity: s.conflictSeverity,
|
||||
informationDensity: s.informationDensity,
|
||||
},
|
||||
});
|
||||
}
|
||||
return topResults;
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
|
||||
+54
-15
@@ -160,6 +160,30 @@ function inverseDocumentFrequency(
|
||||
|
||||
/** Conflict severity bucket reported for a scored document; `'NONE'` is the value used before any threshold is met. */
export type ConflictSeverity = 'NONE' | 'LOW' | 'MEDIUM' | 'HIGH';
|
||||
|
||||
/**
 * Counts the configured conflict indicators that occur in `rawText`.
 *
 * Matching is a case-insensitive substring test, and each indicator in
 * `SCORING_CONFIG.CONFLICT_INDICATORS` contributes at most once no matter how often it
 * occurs. Exported so the brain index can cache the count per file instead of re-scanning
 * the content on every query.
 *
 * @param rawText Text to scan; a falsy value is treated as the empty string.
 * @returns Number of indicators found.
 */
export function countConflictIndicators(rawText: string): number {
  const haystack = rawText ? rawText.toLowerCase() : '';
  const present = [...SCORING_CONFIG.CONFLICT_INDICATORS].filter((indicator) =>
    haystack.includes(indicator.toLowerCase())
  );
  return present.length;
}
|
||||
|
||||
/**
 * Scoring input for a document that has already been tokenized (for example, one served
 * from the persistent brain index).
 */
export interface PreTokenizedDoc {
  /** Tokens of the title followed by a space and the content. */
  tokens: string[];

  /** Tokens of the title alone. */
  titleTokens: string[];

  /** Optional last-modified value for the document. */
  lastModified?: number;

  /** Output of `countConflictIndicators` for the title plus content; 0 when unknown. */
  conflictCount: number;
}
|
||||
|
||||
export interface ScoredDocument {
|
||||
index: number;
|
||||
score: number;
|
||||
@@ -173,6 +197,8 @@ export interface ScoredDocument {
|
||||
|
||||
/**
|
||||
* TF-IDF 기반으로 문서 집합을 스코어링합니다.
|
||||
* 문서 내용을 받아 즉석에서 토크나이즈합니다 — 이미 토큰화된 집합이 있다면
|
||||
* `scoreTfIdfPreTokenized` 를 직접 호출하면 토크나이즈를 건너뛸 수 있습니다.
|
||||
*/
|
||||
export function scoreTfIdf(
|
||||
queryTokens: string[],
|
||||
@@ -183,11 +209,28 @@ export function scoreTfIdf(
|
||||
}>
|
||||
): ScoredDocument[] {
|
||||
if (documents.length === 0 || queryTokens.length === 0) return [];
|
||||
return scoreTfIdfPreTokenized(queryTokens, documents.map((doc) => {
|
||||
const combined = `${doc.title} ${doc.content}`;
|
||||
return {
|
||||
tokens: tokenize(combined),
|
||||
titleTokens: tokenize(doc.title),
|
||||
lastModified: doc.lastModified,
|
||||
conflictCount: countConflictIndicators(combined),
|
||||
};
|
||||
}));
|
||||
}
|
||||
|
||||
// Pre-tokenize all documents
|
||||
const docTokenArrays = documents.map((doc) =>
|
||||
tokenize(`${doc.title} ${doc.content}`)
|
||||
);
|
||||
/**
|
||||
* TF-IDF 스코어링 — 이미 토큰화된 문서 집합 버전 (브레인 인덱스 등 캐시된 토큰을 그대로 사용).
|
||||
* `scoreTfIdf` 와 동일한 알고리즘이며 출력 형태도 같습니다.
|
||||
*/
|
||||
export function scoreTfIdfPreTokenized(
|
||||
queryTokens: string[],
|
||||
documents: PreTokenizedDoc[]
|
||||
): ScoredDocument[] {
|
||||
if (documents.length === 0 || queryTokens.length === 0) return [];
|
||||
|
||||
const docTokenArrays = documents.map((doc) => doc.tokens);
|
||||
const docTokenSets = docTokenArrays.map((tokens) => new Set(tokens));
|
||||
|
||||
// Expand query with synonyms
|
||||
@@ -205,22 +248,18 @@ export function scoreTfIdf(
|
||||
|
||||
return documents.map((doc, index) => {
|
||||
const docTokens = docTokenArrays[index];
|
||||
const titleTokens = new Set(tokenize(doc.title));
|
||||
const titleTokens = new Set(doc.titleTokens);
|
||||
let score = 0;
|
||||
const matchedTerms: string[] = [];
|
||||
|
||||
// Conflict Detection & Severity Analysis (Substring based for better recall with particles)
|
||||
const rawText = `${doc.title} ${doc.content}`.toLowerCase();
|
||||
const conflictMatches = [...SCORING_CONFIG.CONFLICT_INDICATORS].filter(indicator =>
|
||||
rawText.includes(indicator.toLowerCase())
|
||||
);
|
||||
|
||||
const conflictDetected = conflictMatches.length > 0;
|
||||
// Conflict Detection & Severity Analysis (pre-counted by caller / index)
|
||||
const conflictCount = doc.conflictCount || 0;
|
||||
const conflictDetected = conflictCount > 0;
|
||||
let conflictSeverity: ConflictSeverity = 'NONE';
|
||||
|
||||
if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
|
||||
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
|
||||
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
|
||||
if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
|
||||
else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
|
||||
else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
|
||||
|
||||
for (const term of expandedQuery) {
|
||||
const tf = termFrequency(term, docTokens);
|
||||
|
||||
Reference in New Issue
Block a user