Files
connectai/src/retrieval/index.ts
T
2026-05-12 22:54:21 +09:00

321 lines
13 KiB
TypeScript

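/**
 * ============================================================
 * RetrievalOrchestrator — Unified RAG Pipeline
 *
 * Orchestrator that manages all of Astra's retrieval sources in one place.
 *
 * Retrieval flow:
 *  ① Query Planning  — intent classification + choice of search strategy
 *  ② Parallel Search — Brain + Memory + Project + Episode searched together
 *  ③ Result Fusion   — unified scoring + duplicate removal
 *  ④ Context Budget  — final selection within the token budget
 *
 * NOTE(review): as implemented in this file, step ① is tokenization plus
 * query expansion, step ② runs the brain and memory searches one after the
 * other inside a single synchronous call, and step ③ only normalizes and
 * weights scores — no duplicate-removal step is visible here (confirm
 * whether `selectWithinBudget` handles it).
 * ============================================================
 */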
import * as fs from 'fs';
import * as path from 'path';
import { BrainProfile } from '../config';
import { findBrainFiles, summarizeText } from '../utils';
import { isInside } from '../lib/paths';
import { MemoryManager } from '../memory';
import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
import { getBrainTokenIndex } from './brainIndex';
export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
export * from './types';
/**
 * Inputs accepted by `RetrievalOrchestrator.retrieve`.
 */
interface RetrievalOptions {
    /** Brain profile; its `localBrainPath` is the root that brain files are listed from. */
    brain: BrainProfile;

    /** Source of the long-term, project, procedural and episodic memory layers. */
    memoryManager: MemoryManager;

    /** Workspace root. Project memory is only consulted when this is set. */
    workspacePath?: string;

    /** Prior chat turns, forwarded to the memory-layer search. */
    chatHistory?: { role: string; content: string }[];

    /** Partial override of the context-budget configuration used for final selection. */
    contextBudget?: Partial<ContextBudgetConfig>;

    /** Maximum number of brain files to return; falls back to 8 when omitted or 0. */
    brainFileLimit?: number;

    /** When true, files under raw-conversation folders are searched too (default: excluded). */
    includeRawConversations?: boolean;

    /**
     * Optional absolute folder paths constraining brain-file search to those
     * subtrees. When provided and non-empty, only brain files inside one of
     * the folders are considered. Empty / undefined preserves whole-brain
     * search (legacy behavior). Folders that escape the brain root are
     * silently dropped by the caller (see `agentKnowledgeMap.resolveScopeForAgent`).
     */
    scopeFolders?: string[];
}
/**
 * Runs a query against the brain files and the memory layers, puts the
 * resulting chunks on a common score scale, and selects the ones that fit
 * the context budget.
 */
export class RetrievalOrchestrator {
    /** Brain-file limit used when `brainFileLimit` is omitted (or 0). */
    private static readonly DEFAULT_BRAIN_FILE_LIMIT = 8;

    /** Budget reported in the result when `contextBudget.totalBudget` is omitted (or 0). */
    private static readonly DEFAULT_TOTAL_BUDGET = 8000;

    /**
     * Size limit handed to both `extractBestExcerpt` and `summarizeText`
     * (presumably a character count — TODO confirm against those helpers).
     */
    private static readonly EXCERPT_LIMIT = 400;

    /** Multiplier applied to sources that have no entry in `SOURCE_BOOST`. */
    private static readonly FALLBACK_SOURCE_BOOST = 0.5;

    /**
     * Source priority boost (some sources are inherently more valuable for RAG).
     * Kept in a Map so that a source name can never resolve to an inherited
     * `Object.prototype` member.
     */
    private static readonly SOURCE_BOOST: ReadonlyMap<string, number> = new Map<string, number>([
        ['brain-trace', 1.0],
        ['brain-memory', 0.9],
        ['project-memory', 0.85],
        ['long-term-memory', 0.8],
        ['procedural-memory', 0.95], // Procedural is highly specific
        ['episodic-memory', 0.7],
        ['project-scan', 0.6],
        ['recent-knowledge', 0.75],
    ]);

    /** Path segments that mark a file as a raw conversation dump. */
    private static readonly RAW_CONVERSATION_PATTERN =
        /(^|[\\/])(00_Raw|raw-data|conversations?|transcripts?)([\\/]|$)/i;

    /**
     * Performs the unified retrieval.
     *
     * Steps, in order: tokenize and expand the query, search brain files,
     * query the memory layers, normalize scores across sources, then select
     * chunks within the context budget.
     *
     * @param query   Raw user query.
     * @param options Sources and limits; see `RetrievalOptions`.
     * @returns The selected and dropped chunks, token usage, and a
     *          human-readable `fusionLog` describing each step.
     */
    public retrieve(query: string, options: RetrievalOptions): RetrievalResult {
        const fusionLog: string[] = [];
        const allChunks: RetrievalChunk[] = [];

        const queryTokens = tokenize(query);
        const expandedTokens = expandQuery(queryTokens);
        fusionLog.push(`Query tokens: [${queryTokens.slice(0, 10).join(', ')}]`);
        fusionLog.push(`Expanded tokens: [${expandedTokens.slice(0, 15).join(', ')}]`);

        // ── ① Brain File Search (TF-IDF enhanced) ──
        const scopeFolders = options.scopeFolders ?? [];
        const brainChunks = this.searchBrainFiles(
            query,
            expandedTokens,
            options.brain,
            // `||` on purpose: a limit of 0 falls back to the default.
            options.brainFileLimit || RetrievalOrchestrator.DEFAULT_BRAIN_FILE_LIMIT,
            options.includeRawConversations ?? false,
            scopeFolders,
            fusionLog
        );
        allChunks.push(...brainChunks);
        fusionLog.push(
            scopeFolders.length > 0
                ? `Brain search (scoped to ${scopeFolders.length} folder(s)): ${brainChunks.length} chunks`
                : `Brain search: ${brainChunks.length} chunks found`
        );

        // ── ② Memory Layers ──
        const memoryChunks = this.searchMemoryLayers(
            query,
            options.memoryManager,
            options.chatHistory ?? [],
            options.workspacePath
        );
        allChunks.push(...memoryChunks);
        fusionLog.push(`Memory search: ${memoryChunks.length} chunks found`);

        // ── ③ Result Fusion — normalize scores across sources ──
        this.normalizeScores(allChunks);
        fusionLog.push(`Total chunks before budget: ${allChunks.length}`);

        // ── ④ Context Budget Selection ──
        const { selected, dropped, tokensUsed } = selectWithinBudget(
            allChunks,
            options.contextBudget
        );
        fusionLog.push(`Selected: ${selected.length}, Dropped: ${dropped.length}, Tokens: ${tokensUsed}`);

        return {
            query,
            totalChunks: allChunks.length,
            selectedChunks: selected,
            droppedChunks: dropped,
            totalTokensUsed: tokensUsed,
            contextBudget: options.contextBudget?.totalBudget || RetrievalOrchestrator.DEFAULT_TOTAL_BUDGET,
            fusionLog
        };
    }

    /**
     * Converts a retrieval result into the final context string by passing
     * its selected chunks to `assembleContext`.
     */
    public buildContextString(result: RetrievalResult): string {
        return assembleContext(result.selectedChunks);
    }

    // ─── Brain File Search ───

    /**
     * Scores the brain files against the expanded query tokens and returns
     * the best `limit` of them as chunks.
     *
     * @param query          Raw query; not read by this method (kept for signature compatibility).
     * @param expandedTokens Expanded query tokens used for scoring and excerpt extraction.
     * @param brain          Profile whose `localBrainPath` is the search root.
     * @param limit          Maximum number of files to return.
     * @param includeRaw     When false, files under raw-conversation folders are skipped.
     * @param scopeFolders   When non-empty, only files inside one of these folders are considered.
     * @param fusionLog      Optional log that receives a line if the search fails.
     * @returns Chunks with a positive score, best first; `[]` on any failure.
     */
    private searchBrainFiles(
        query: string,
        expandedTokens: string[],
        brain: BrainProfile,
        limit: number,
        includeRaw: boolean,
        scopeFolders: string[] = [],
        fusionLog?: string[]
    ): RetrievalChunk[] {
        try {
            const inScope = (file: string): boolean =>
                scopeFolders.length === 0 || scopeFolders.some((folder) => isInside(folder, file));

            const candidates = findBrainFiles(brain.localBrainPath)
                .filter(inScope)
                .filter((file) => includeRaw || !this.isRawConversation(path.relative(brain.localBrainPath, file)));
            if (candidates.length === 0) return [];

            // Per the original author's note: the token index is persistent and keyed by mtime,
            // so unchanged files are not re-read or re-tokenized on every query.
            const indexed = getBrainTokenIndex(brain.localBrainPath, candidates);
            if (indexed.length === 0) return [];

            const scored = scoreTfIdfPreTokenized(
                expandedTokens,
                indexed.map((d) => ({
                    tokens: d.tokens,
                    titleTokens: d.titleTokens,
                    lastModified: d.mtimeMs,
                    conflictCount: d.conflictCount,
                }))
            );

            // `filter` returns a fresh array, so the in-place sort does not touch `scored`.
            const ranked = scored
                .filter((x) => x.score > 0)
                .sort((a, b) => b.score - a.score)
                .slice(0, limit);

            const topResults: RetrievalChunk[] = [];
            for (const hit of ranked) {
                const doc = indexed[hit.index];
                if (!doc) continue; // scorer returned an index outside the document list

                // Only the top `limit` files are actually read off disk (for excerpt extraction).
                let fileText: string;
                try {
                    fileText = fs.readFileSync(doc.filePath, 'utf8');
                } catch {
                    continue; // deleted just now — skip
                }

                const excerpt = extractBestExcerpt(fileText, expandedTokens, RetrievalOrchestrator.EXCERPT_LIMIT);
                const chunkText = summarizeText(excerpt, RetrievalOrchestrator.EXCERPT_LIMIT);
                topResults.push({
                    id: `brain-${hit.index}`,
                    source: 'brain-memory' as const,
                    title: doc.relativePath,
                    content: chunkText,
                    score: hit.score,
                    // Estimate the text that is actually stored in the chunk, as the memory chunks do.
                    tokenEstimate: estimateTokens(chunkText),
                    metadata: {
                        filePath: doc.filePath,
                        category: this.inferCategory(doc.relativePath),
                        isProjectEvidence: this.isProjectEvidence(doc.relativePath, fileText),
                        lastUpdated: doc.mtimeMs,
                        // Phase 5: Scoring Intelligence Integration
                        conflictDetected: hit.conflictDetected,
                        conflictSeverity: hit.conflictSeverity,
                        informationDensity: hit.informationDensity,
                    },
                });
            }
            return topResults;
        } catch (err: unknown) {
            // Best effort: a broken brain must not fail the whole retrieval, but leave a trace.
            const reason = err instanceof Error ? err.message : String(err);
            fusionLog?.push(`Brain search failed: ${reason}`);
            return [];
        }
    }

    // ─── Memory Layer Search ───

    /**
     * Asks each memory layer to build a context for the query and wraps every
     * non-empty answer in a chunk. Each layer contributes at most one chunk.
     *
     * @param query         Raw query passed to each layer's `buildContext`.
     * @param memoryManager Provider of the memory layers.
     * @param chatHistory   Not read by this method (kept for signature compatibility).
     * @param workspacePath When set, project memory for that workspace is included.
     * @returns Chunks in the order long-term, project, procedural, episodic.
     */
    private searchMemoryLayers(
        query: string,
        memoryManager: MemoryManager,
        chatHistory: Array<{ role: string; content: string }>,
        workspacePath?: string
    ): RetrievalChunk[] {
        const chunks: RetrievalChunk[] = [];

        // Long-Term Memory
        const ltmContext = memoryManager.getLongTermMemory().buildContext(query);
        if (ltmContext) {
            chunks.push({
                id: 'ltm-context',
                source: 'long-term-memory',
                title: ltmContext.label,
                content: ltmContext.content,
                score: ltmContext.relevance,
                tokenEstimate: estimateTokens(ltmContext.content),
                metadata: { category: 'long-term' }
            });
        }

        // Project Memory
        if (workspacePath) {
            const pmContext = memoryManager.getProjectMemory(workspacePath).buildContext(query);
            if (pmContext) {
                chunks.push({
                    id: 'pm-context',
                    source: 'project-memory',
                    title: pmContext.label,
                    content: pmContext.content,
                    score: pmContext.relevance,
                    tokenEstimate: estimateTokens(pmContext.content),
                    metadata: { category: 'project', isProjectEvidence: true }
                });
            }
        }

        // Procedural Memory
        const procContext = memoryManager.getProceduralMemory().buildContext(query);
        if (procContext) {
            chunks.push({
                id: 'proc-context',
                source: 'procedural-memory',
                title: procContext.label,
                content: procContext.content,
                score: procContext.relevance,
                tokenEstimate: estimateTokens(procContext.content),
                metadata: { category: 'procedural' }
            });
        }

        // Episodic Memory
        const epContext = memoryManager.getEpisodicMemory().buildContext(query);
        if (epContext) {
            chunks.push({
                id: 'ep-context',
                source: 'episodic-memory',
                title: epContext.label,
                content: epContext.content,
                score: epContext.relevance,
                tokenEstimate: estimateTokens(epContext.content),
                metadata: { category: 'episodic' }
            });
        }

        return chunks;
    }

    // ─── Score Normalization ───

    /**
     * Rescales scores in place so that sources with different score scales
     * become comparable: each source group is divided by its own maximum
     * (giving 0–1), then multiplied by the source's priority boost.
     *
     * Scores that are negative, `NaN` or infinite are treated as 0 so one bad
     * value cannot poison the rest of its group.
     *
     * NOTE(review): every memory layer contributes at most one chunk, so such
     * a chunk normalizes to exactly 1 whenever its score is at least 0.001 —
     * its own relevance no longer influences the final ranking. Confirm that
     * this is intended.
     */
    private normalizeScores(chunks: RetrievalChunk[]): void {
        // Sanitize and group by source.
        const bySource = new Map<string, RetrievalChunk[]>();
        for (const chunk of chunks) {
            if (!Number.isFinite(chunk.score) || chunk.score < 0) chunk.score = 0;
            const bucket = bySource.get(chunk.source);
            if (bucket) {
                bucket.push(chunk);
            } else {
                bySource.set(chunk.source, [chunk]);
            }
        }

        // Normalize each group independently; the 0.001 floor avoids division by zero.
        for (const group of bySource.values()) {
            let maxScore = 0.001;
            for (const chunk of group) {
                if (chunk.score > maxScore) maxScore = chunk.score;
            }
            for (const chunk of group) {
                chunk.score = chunk.score / maxScore;
            }
        }

        // Apply the per-source priority boost.
        for (const chunk of chunks) {
            chunk.score *=
                RetrievalOrchestrator.SOURCE_BOOST.get(chunk.source) ?? RetrievalOrchestrator.FALLBACK_SOURCE_BOOST;
        }
    }

    // ─── Helpers ───

    /**
     * True when any segment of the path is a raw-conversation folder or file
     * name (`00_Raw`, `raw-data`, `conversation(s)`, `transcript(s)`),
     * case-insensitively, with either `/` or `\` as separator.
     */
    private isRawConversation(relativePath: string): boolean {
        return RetrievalOrchestrator.RAW_CONVERSATION_PATTERN.test(relativePath);
    }

    /**
     * Derives a coarse category from keywords in the path. The first matching
     * rule wins, checked in the order decision, project-record, architecture,
     * knowledge; anything else is `general`.
     *
     * `adr` / `adrs` only counts when it is not embedded in a longer word, so
     * paths such as `quadrant` or `madrid` are not classified as decisions.
     */
    private inferCategory(relativePath: string): string {
        const normalized = relativePath.toLowerCase();
        if (/(decisions?|planning|(^|[^a-z])adrs?([^a-z]|$))/.test(normalized)) return 'decision';
        if (/(records|development|bugs)/.test(normalized)) return 'project-record';
        if (/(architecture|design|pattern)/.test(normalized)) return 'architecture';
        if (/(knowledge|wiki|topics)/.test(normalized)) return 'knowledge';
        return 'general';
    }

    /**
     * True when the path suggests project evidence: it contains one of the
     * record-style keywords, an `adr-<number>` name, or a `decision(s)` path
     * segment.
     *
     * @param relativePath Path relative to the brain root.
     * @param content      File content; not read by this method (kept for signature compatibility).
     */
    private isProjectEvidence(relativePath: string, content: string): boolean {
        const normalized = relativePath.toLowerCase();
        if (/(records|planning|development|bugs|retrospectives|projectchronicle)/i.test(normalized)) return true;
        if (/adr-\d+|(^|[\\/])decisions?([\\/]|$)/i.test(normalized)) return true;
        return false;
    }
}