/**
 * ============================================================
 * RetrievalOrchestrator — Unified RAG Pipeline
 *
 * Orchestrator that manages all of Astra's retrieval sources in one place.
 *
 * Retrieval flow (design stages):
 *   ① Query Planning  — intent classification + search strategy
 *   ② Parallel Search — Brain + Memory + Project + Episode
 *   ③ Result Fusion   — unified scoring + de-duplication
 *   ④ Context Budget  — final selection within the token budget
 *
 * NOTE(review): as implemented in this file the sources are queried
 * sequentially and synchronously, and no explicit de-duplication step is
 * performed here — if chunks are de-duplicated it must happen inside
 * `selectWithinBudget`; confirm against that module.
 * ============================================================
 */

import * as fs from 'fs';
import * as path from 'path';
import { BrainProfile } from '../config';
import { findBrainFiles, summarizeText } from '../utils';
import { isInside } from '../lib/paths';
import { MemoryManager } from '../memory';
import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt, extractBestSection } from './scoring';
import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
import { getBrainTokenIndex, getBrainEmbeddings } from './brainIndex';
import { extractLessonEssence } from './lessonHelpers';
import { cosineSimilarity } from './embeddings';

export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
export * from './types';

/** Blend weight used when the caller supplies a query embedding but no explicit alpha. */
const DEFAULT_EMBEDDING_BLEND_ALPHA = 0.5;

/** Max number of lesson cards pulled in on top of the regular top-`limit` brain results. */
const LESSON_EXTRA = 3;

/**
 * Per-source priority multipliers applied after per-source normalisation
 * (some sources are inherently more valuable for RAG). Sources missing from
 * this table fall back to 0.5 in `normalizeScores`.
 */
const SOURCE_BOOST: Record<string, number> = {
  'brain-trace': 1.0,
  'brain-memory': 0.9,
  'project-memory': 0.85,
  'long-term-memory': 0.8,
  'procedural-memory': 0.95, // Procedural is highly specific
  'medium-term-memory': 0.78, // recent sessions: useful when the user references "last time / yesterday"
  'episodic-memory': 0.7,
  'project-scan': 0.6,
  'recent-knowledge': 0.75
};

/** Compact summary of a past chat session for medium-term memory retrieval. */
export interface RecentSessionSummary {
  id: string;
  title: string;
  firstUserMsg: string;
  lastAssistantExcerpt: string;
  /**
   * Optional LLM-compressed recap stored at session end (~200 chars).
   * When present, retrieval uses this instead of the firstUserMsg+tail
   * fragment because it actually captures the decision/outcome.
   */
  summary?: string;
  timestamp: number;
}

interface RetrievalOptions {
  brain: BrainProfile;
  memoryManager: MemoryManager;
  workspacePath?: string;
  chatHistory?: Array<{ role: string; content: string }>;
  contextBudget?: Partial<ContextBudgetConfig>;
  brainFileLimit?: number;
  includeRawConversations?: boolean;
  /**
   * Optional absolute folder paths constraining brain-file search to those
   * subtrees. When provided and non-empty, only brain files inside one of
   * the folders are considered. Empty / undefined preserves whole-brain
   * search (legacy behavior). Folders that escape the brain root are
   * silently dropped by the caller (see `agentKnowledgeMap.resolveScopeForAgent`).
   */
  scopeFolders?: string[];
  /**
   * Compact summaries of recently-touched chat sessions (excluding the
   * active one). Scored against the query and the top `mediumTermLimit`
   * are injected as medium-term memory chunks. Caller pre-computes these
   * to avoid threading vscode/ExtensionContext through this module.
   */
  recentSessions?: RecentSessionSummary[];
  /** Max number of medium-term session chunks to include after scoring. */
  mediumTermLimit?: number;
  /**
   * Optional query embedding for hybrid (sparse+dense) brain search. When
   * provided, each candidate file's cached embedding is cosine-matched and
   * blended with the TF-IDF score by `embeddingBlendAlpha`. Caller computes
   * this once per turn so we don't pay the embedding RTT inside scoring.
   */
  queryEmbedding?: number[];
  /** Embedding model name (used as a cache key on the brain index side). */
  embeddingModel?: string;
  /** Blend weight: 0 = TF-IDF only, 1 = cosine only. Default 0.5. */
  embeddingBlendAlpha?: number;
}

export class RetrievalOrchestrator {
  /**
   * Runs the unified retrieval pipeline for one query.
   *
   * Steps performed here: tokenize + expand the query, search brain files
   * (skipped when `brainFileLimit` is 0), collect the memory-layer contexts,
   * score recent sessions, normalise scores across sources, then let
   * `selectWithinBudget` pick the final set. Lesson chunks are split out of
   * the selection into `lessonChunks`.
   *
   * @param query   Raw user query.
   * @param options Sources, limits and optional hybrid-search inputs.
   * @returns The selected / dropped / lesson chunks plus a human-readable `fusionLog`.
   */
  public retrieve(query: string, options: RetrievalOptions): RetrievalResult {
    const fusionLog: string[] = [];
    const allChunks: RetrievalChunk[] = [];

    const queryTokens = tokenize(query);
    const expandedTokens = expandQuery(queryTokens);
    fusionLog.push(`Query tokens: [${queryTokens.slice(0, 10).join(', ')}]`);
    fusionLog.push(`Expanded tokens: [${expandedTokens.slice(0, 15).join(', ')}]`);

    // ── ① Brain File Search (TF-IDF enhanced, optionally hybrid with embeddings) ──
    // `brainFileLimit === 0` is meaningful (Knowledge Mix "model knowledge only"
    // mode), so use `??` rather than `||`. When the caller explicitly passes 0,
    // we skip retrieval entirely instead of falling back to the default of 8.
    const scopeFolders = options.scopeFolders ?? [];
    const brainFileLimit = options.brainFileLimit ?? 8;
    const brainChunks = brainFileLimit > 0
      ? this.searchBrainFiles(
          query,
          expandedTokens,
          options.brain,
          brainFileLimit,
          options.includeRawConversations ?? false,
          scopeFolders,
          options.queryEmbedding,
          options.embeddingModel,
          options.embeddingBlendAlpha
        )
      : [];
    allChunks.push(...brainChunks);
    fusionLog.push(
      brainFileLimit === 0
        ? 'Brain search: skipped (Knowledge Mix weight = 0)'
        : scopeFolders.length > 0
          ? `Brain search (scoped to ${scopeFolders.length} folder(s)): ${brainChunks.length} chunks`
          : `Brain search: ${brainChunks.length} chunks found`
    );

    // ── ② Memory Layers ──
    const memoryChunks = this.searchMemoryLayers(
      query,
      options.memoryManager,
      options.chatHistory ?? [],
      options.workspacePath
    );
    allChunks.push(...memoryChunks);
    fusionLog.push(`Memory search: ${memoryChunks.length} chunks found`);

    // ── ②-b Medium-Term Memory (recent sessions) ──
    // A missing `mediumTermLimit` means 0, i.e. medium-term memory is opt-in.
    const mediumChunks = this.scoreRecentSessions(
      expandedTokens,
      options.recentSessions ?? [],
      options.mediumTermLimit ?? 0
    );
    allChunks.push(...mediumChunks);
    fusionLog.push(`Medium-term sessions: ${mediumChunks.length} chunks selected`);

    // ── ③ Result Fusion — normalize scores across sources ──
    this.normalizeScores(allChunks);
    fusionLog.push(`Total chunks before budget: ${allChunks.length}`);

    // ── ④ Context Budget Selection ──
    const { selected, dropped, tokensUsed } = selectWithinBudget(
      allChunks,
      options.contextBudget
    );

    // Pull lesson/playbook/qa-finding chunks out so callers can inject them as a prominent
    // "verify before finalizing" block rather than burying them in the brain-knowledge section.
    const lessonChunks = selected.filter((c) => c.metadata.isLesson);
    const selectedChunks = selected.filter((c) => !c.metadata.isLesson);

    fusionLog.push(`Selected: ${selectedChunks.length} (+${lessonChunks.length} lesson), Dropped: ${dropped.length}, Tokens: ${tokensUsed}`);

    return {
      query,
      totalChunks: allChunks.length,
      selectedChunks,
      droppedChunks: dropped,
      lessonChunks,
      totalTokensUsed: tokensUsed,
      // NOTE(review): `||` reports 8000 for both an unset and a 0 budget; presumably this
      // mirrors the default inside `selectWithinBudget` — confirm before switching to `??`.
      contextBudget: options.contextBudget?.totalBudget || 8000,
      fusionLog
    };
  }

  /**
   * Converts a retrieval result into the final context string.
   * Only `selectedChunks` are assembled; lesson chunks are excluded because
   * callers inject them as a separate block.
   */
  public buildContextString(result: RetrievalResult): string {
    return assembleContext(result.selectedChunks);
  }

  // ─── Brain File Search ───

  /**
   * Scores brain files against the expanded query and returns excerpt chunks
   * for the best `limit` files plus up to `LESSON_EXTRA` additional lesson cards.
   *
   * @param query               Raw query (currently unused by this method; scoring uses `expandedTokens`).
   * @param expandedTokens      Expanded query tokens used for TF-IDF scoring and excerpt extraction.
   * @param brain               Brain profile; `localBrainPath` is the search root.
   * @param limit               Number of regular top-ranked files to keep.
   * @param includeRaw          When false, files under raw-conversation folders are excluded.
   * @param scopeFolders        Optional folders restricting the search; empty means whole brain.
   * @param queryEmbedding      Optional query vector enabling the hybrid (TF-IDF + cosine) blend.
   * @param embeddingModel      Model name passed to the embedding cache lookup.
   * @param embeddingBlendAlpha Blend weight in [0, 1]; defaults to 0.5 when omitted.
   * @returns Chunks in rank order; an empty array when nothing matches or on any error.
   */
  private searchBrainFiles(
    query: string,
    expandedTokens: string[],
    brain: BrainProfile,
    limit: number,
    includeRaw: boolean,
    scopeFolders: string[] = [],
    queryEmbedding?: number[],
    embeddingModel?: string,
    embeddingBlendAlpha?: number,
  ): RetrievalChunk[] {
    try {
      const scoped = (file: string) =>
        scopeFolders.length === 0 || scopeFolders.some((folder) => isInside(folder, file));
      const allFiles = findBrainFiles(brain.localBrainPath)
        .filter(scoped)
        .filter((file) => includeRaw || !this.isRawConversation(path.relative(brain.localBrainPath, file)));

      if (allFiles.length === 0) return [];

      // Tokenized docs come from the brain token index. Per the original author's note this is a
      // persistent mtime-keyed index so unchanged files are not re-read or re-tokenized
      // (behaviour lives in `./brainIndex`, not visible here).
      const indexed = getBrainTokenIndex(brain.localBrainPath, allFiles);
      if (indexed.length === 0) return [];

      const scored = scoreTfIdfPreTokenized(
        expandedTokens,
        indexed.map((d) => ({
          tokens: d.tokens,
          titleTokens: d.titleTokens,
          lastModified: d.mtimeMs,
          conflictCount: d.conflictCount,
        }))
      );

      // Hybrid blend: when the caller provided a query embedding and an embedding model, fetch
      // the cached file vectors and mix a cosine-similarity term into each score.
      //
      // All TF-IDF scores are first normalised by the top observed value so that every file —
      // with or without a cached vector — lives on the same [0, 1] scale. Files without a cached
      // embedding keep their pure (normalised) TF-IDF score, so a missing embedding neither
      // helps nor hurts them relative to the un-blended ranking. Leaving them on the raw scale
      // would let any raw score above 1 outrank every blended file.
      const alpha = Math.max(0, Math.min(1, embeddingBlendAlpha ?? DEFAULT_EMBEDDING_BLEND_ALPHA));
      if (queryEmbedding && embeddingModel && alpha > 0) {
        const filePaths = indexed.map((d) => d.filePath);
        const embeddings = getBrainEmbeddings(brain.localBrainPath, filePaths, embeddingModel);
        if (embeddings.size > 0) {
          // `|| 1` guards the all-zero case so the division below never produces NaN.
          const maxTfidf = scored.reduce((m, s) => (s.score > m ? s.score : m), 0) || 1;
          for (const s of scored) {
            const tfidfNorm = s.score / maxTfidf;
            const vec = embeddings.get(indexed[s.index].filePath);
            // Cosine is [-1, 1] in theory; negative similarity is clamped to 0 so it can
            // only ever add to a score, never subtract from the TF-IDF term.
            s.score = vec
              ? (1 - alpha) * tfidfNorm + alpha * Math.max(0, cosineSimilarity(queryEmbedding, vec))
              : tfidfNorm;
          }
        }
      }

      // Always consider lesson cards for the top slots even if they didn't crack the raw-score
      // top-`limit`: they're short, high-signal, and we want them surfaced when relevant. We keep
      // the regular top-`limit` and additively pull in up to a few lesson cards (deduped by index).
      const ranked = scored.filter((x) => x.score > 0).sort((a, b) => b.score - a.score);
      const pickedIdx = new Set<number>();
      for (const s of ranked.slice(0, limit)) pickedIdx.add(s.index);

      let lessonExtra = 0;
      for (const s of ranked) {
        if (lessonExtra >= LESSON_EXTRA) break;
        if (pickedIdx.has(s.index)) continue;
        // A non-empty `kind` on the indexed doc is what marks it as a lesson card.
        if (!indexed[s.index].kind) continue;
        pickedIdx.add(s.index);
        lessonExtra++;
      }

      // Preserve rank order for the chosen set.
      const chosen = ranked.filter((s) => pickedIdx.has(s.index));

      const topResults: RetrievalChunk[] = [];
      for (const s of chosen) {
        const doc = indexed[s.index];
        const isLesson = Boolean(doc.kind);

        // Only the chosen files are actually read off disk (for excerpt extraction).
        let fileText = '';
        try {
          fileText = fs.readFileSync(doc.filePath, 'utf8');
        } catch {
          // File disappeared (or became unreadable) between indexing and now — skip it.
          continue;
        }

        // Lesson cards: extract just the high-signal sections (Mistake / Root Cause / Fix /
        // Prevention Checklist) instead of dumping the whole card. Old lessons without those
        // headings fall back to a query-targeted excerpt.
        //
        // Regular notes: pick the best heading-bounded section for the query (markdown section
        // retrieval) so that long notes don't dump their intro/setup blocks just because they
        // happen to come first.
        const cap = isLesson ? 1200 : 600;
        const excerpt = isLesson
          ? extractLessonEssence(fileText, 1200) || extractBestExcerpt(fileText, expandedTokens, 1200)
          : extractBestSection(fileText, expandedTokens, 600);
        const content = summarizeText(excerpt, cap);

        topResults.push({
          id: `brain-${s.index}`,
          source: 'brain-memory' as const,
          title: doc.relativePath,
          content,
          score: s.score,
          // Estimate tokens on the text that is actually emitted, like every other chunk builder
          // in this class, so the budget accounts for what really gets injected.
          tokenEstimate: estimateTokens(content),
          metadata: {
            filePath: doc.filePath,
            category: this.inferCategory(doc.relativePath),
            isProjectEvidence: this.isProjectEvidence(doc.relativePath, fileText),
            lastUpdated: doc.mtimeMs,
            // Phase 5: Scoring Intelligence Integration
            conflictDetected: s.conflictDetected,
            conflictSeverity: s.conflictSeverity,
            informationDensity: s.informationDensity,
            ...(isLesson ? { isLesson: true, lessonKind: doc.kind } : {}),
          },
        });
      }

      return topResults;
    } catch {
      // Deliberate best-effort: any failure in brain search degrades to "no brain chunks"
      // rather than failing the whole retrieval.
      return [];
    }
  }

  // ─── Memory Layer Search ───

  /**
   * Collects at most one context chunk from each memory layer (long-term,
   * project, procedural, episodic) by calling that layer's `buildContext(query)`.
   * A layer contributes nothing when its `buildContext` returns a falsy value.
   * Project memory is only consulted when `workspacePath` is provided.
   *
   * @param query         Raw user query forwarded to every layer.
   * @param memoryManager Provider of the individual memory layers.
   * @param chatHistory   Currently unused by this method.
   * @param workspacePath Workspace root used to resolve project memory.
   */
  private searchMemoryLayers(
    query: string,
    memoryManager: MemoryManager,
    chatHistory: Array<{ role: string; content: string }>,
    workspacePath?: string
  ): RetrievalChunk[] {
    const chunks: RetrievalChunk[] = [];

    // Long-Term Memory
    const ltm = memoryManager.getLongTermMemory();
    const ltmContext = ltm.buildContext(query);
    if (ltmContext) {
      chunks.push({
        id: 'ltm-context',
        source: 'long-term-memory',
        title: ltmContext.label,
        content: ltmContext.content,
        score: ltmContext.relevance,
        tokenEstimate: estimateTokens(ltmContext.content),
        metadata: { category: 'long-term' }
      });
    }

    // Project Memory
    if (workspacePath) {
      const pm = memoryManager.getProjectMemory(workspacePath);
      const pmContext = pm.buildContext(query);
      if (pmContext) {
        chunks.push({
          id: 'pm-context',
          source: 'project-memory',
          title: pmContext.label,
          content: pmContext.content,
          score: pmContext.relevance,
          tokenEstimate: estimateTokens(pmContext.content),
          metadata: { category: 'project', isProjectEvidence: true }
        });
      }
    }

    // Procedural Memory
    const proc = memoryManager.getProceduralMemory();
    const procContext = proc.buildContext(query);
    if (procContext) {
      chunks.push({
        id: 'proc-context',
        source: 'procedural-memory',
        title: procContext.label,
        content: procContext.content,
        score: procContext.relevance,
        tokenEstimate: estimateTokens(procContext.content),
        metadata: { category: 'procedural' }
      });
    }

    // Episodic Memory
    const ep = memoryManager.getEpisodicMemory();
    const epContext = ep.buildContext(query);
    if (epContext) {
      chunks.push({
        id: 'ep-context',
        source: 'episodic-memory',
        title: epContext.label,
        content: epContext.content,
        score: epContext.relevance,
        tokenEstimate: estimateTokens(epContext.content),
        metadata: { category: 'episodic' }
      });
    }

    return chunks;
  }

  // ─── Medium-Term: Recent Sessions ───

  /**
   * Score the user-provided session summaries against the current query
   * (lightweight token overlap — sessions are small so we skip the TF-IDF
   * machinery) and return up to `limit` as chunks. Each chunk packs the
   * title plus either the stored summary or the first user message + last
   * assistant excerpt — enough for the model to recall the thread without
   * re-injecting the whole transcript.
   *
   * Why include recent sessions at all: short-term covers "this conversation",
   * long-term covers "stable brain notes", but there's a gap for "what we
   * worked on yesterday/last week" that the user expects to be remembered.
   *
   * @param expandedTokens Expanded query tokens; tokens shorter than 2 chars are ignored.
   * @param sessions       Candidate session summaries.
   * @param limit          Max chunks to return; `<= 0` disables the feature.
   */
  private scoreRecentSessions(
    expandedTokens: string[],
    sessions: RecentSessionSummary[],
    limit: number,
  ): RetrievalChunk[] {
    if (!sessions || sessions.length === 0 || limit <= 0) return [];

    const qSet = new Set(expandedTokens.filter((t) => t.length >= 2));
    const scored = sessions.map((s) => {
      // Prefer the LLM-compressed summary when present — it's a real
      // recap of the session, so query matches against it are far more
      // meaningful than against an arbitrary head/tail.
      const text = s.summary
        ? `${s.title}\n${s.summary}`
        : `${s.title}\n${s.firstUserMsg}\n${s.lastAssistantExcerpt}`;
      const docTokens = tokenize(text);
      // Counts every matching occurrence, so a repeated query term scores more than once.
      let overlap = 0;
      for (const t of docTokens) if (qSet.has(t)) overlap++;
      // Tiny recency boost so equal-overlap sessions prefer the more
      // recent one. +0.1 max for sessions <7 days old, decays linearly
      // to 0 at 7 days. A missing/zero timestamp is treated as very old.
      const ageDays = s.timestamp ? Math.max(0, (Date.now() - s.timestamp) / 86400000) : 999;
      const recency = ageDays < 7 ? (7 - ageDays) / 70 : 0;
      return { s, score: overlap + recency };
      // NOTE(review): because `recency` alone is > 0, a session younger than 7 days passes this
      // filter even with zero token overlap. Confirm whether that is intended ("always recall
      // recent work") or whether the filter should test `overlap > 0`.
    }).filter((x) => x.score > 0);

    scored.sort((a, b) => b.score - a.score);
    const picked = scored.slice(0, limit);
    if (picked.length === 0) return [];

    return picked.map(({ s, score }, idx): RetrievalChunk => {
      const dateStr = s.timestamp ? new Date(s.timestamp).toISOString().slice(0, 10) : '';
      const heading = `**${s.title}**${dateStr ? ` (${dateStr})` : ''}`;
      // Prefer the LLM-compressed summary; fall back to the raw fragments
      // when no summary was stored for the session.
      const body = s.summary
        ? [heading, s.summary].join('\n')
        : [
            heading,
            s.firstUserMsg ? `사용자 요청: ${s.firstUserMsg}` : '',
            s.lastAssistantExcerpt ? `이전 답변 마지막 부분: …${s.lastAssistantExcerpt}` : '',
          ].filter(Boolean).join('\n');
      return {
        id: `mtm-${idx}-${s.id}`,
        source: 'medium-term-memory',
        title: s.title || '(untitled session)',
        content: body,
        score,
        tokenEstimate: estimateTokens(body),
        metadata: { category: 'medium-term', lastUpdated: s.timestamp },
      };
    });
  }

  // ─── Score Normalization ───

  /**
   * Normalises scores from sources with different scales, in place.
   *
   * Each source group is divided by its own maximum score (floored at 0.001
   * to avoid division by zero), then multiplied by the source's priority
   * boost (0.5 for sources not in the table), and finally lesson chunks get
   * an extra 1.4× nudge.
   */
  private normalizeScores(chunks: RetrievalChunk[]): void {
    // Group by source
    const groups = new Map<RetrievalChunk['source'], RetrievalChunk[]>();
    for (const chunk of chunks) {
      const bucket = groups.get(chunk.source);
      if (bucket) {
        bucket.push(chunk);
      } else {
        groups.set(chunk.source, [chunk]);
      }
    }

    // Normalize each group independently
    for (const group of groups.values()) {
      const maxScore = Math.max(...group.map((c) => c.score), 0.001);
      for (const chunk of group) {
        chunk.score = chunk.score / maxScore;
      }
    }

    for (const chunk of chunks) {
      const boost = SOURCE_BOOST[chunk.source] || 0.5;
      chunk.score *= boost;
      // Lesson cards are short, high-signal guardrails — nudge relevant ones above ordinary brain
      // notes so they survive the budget. Modest (1.4×) so they don't crowd everything out when
      // many match.
      if (chunk.metadata.isLesson) chunk.score *= 1.4;
    }
  }

  // ─── Helpers ───

  /** True when any path segment is a raw-conversation folder (`00_Raw`, `raw-data`, `conversation(s)`, `transcript(s)`). */
  private isRawConversation(relativePath: string): boolean {
    return /(^|[\\/])(00_Raw|raw-data|conversations?|transcripts?)([\\/]|$)/i.test(relativePath);
  }

  /**
   * Maps a brain-relative path to a coarse category by substring match.
   * Checks run in order, so the first matching family wins; `'general'` is the fallback.
   */
  private inferCategory(relativePath: string): string {
    const normalized = relativePath.toLowerCase();
    if (/(decisions?|adr|planning)/i.test(normalized)) return 'decision';
    if (/(records|development|bugs)/i.test(normalized)) return 'project-record';
    if (/(architecture|design|pattern)/i.test(normalized)) return 'architecture';
    if (/(knowledge|wiki|topics)/i.test(normalized)) return 'knowledge';
    return 'general';
  }

  /**
   * True when the path looks like project evidence (records, planning, bugs,
   * retrospectives, ADRs, decision folders, …). The decision is path-based only;
   * `content` is accepted but currently not inspected.
   */
  private isProjectEvidence(relativePath: string, content: string): boolean {
    const normalized = relativePath.toLowerCase();
    if (/(records|planning|development|bugs|retrospectives|projectchronicle)/i.test(normalized)) return true;
    if (/adr-\d+|(^|[\\/])decisions?([\\/]|$)/i.test(normalized)) return true;
    return false;
  }
}