refactor: optimize core engine and retrieval logic for v2.80.43

2026-05-13 19:23:57 +09:00
parent c4260466b9
commit 089abf22db
17 changed files with 1311 additions and 88 deletions
@@ -19,15 +19,32 @@ import { findBrainFiles, summarizeText } from '../utils';
 import { isInside } from '../lib/paths';
 import { MemoryManager } from '../memory';
 import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
-import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
+import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt, extractBestSection } from './scoring';
 import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
-import { getBrainTokenIndex } from './brainIndex';
+import { getBrainTokenIndex, getBrainEmbeddings } from './brainIndex';
+import { extractLessonEssence } from './lessonHelpers';
+import { cosineSimilarity } from './embeddings';

 export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
 export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
 export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
 export * from './types';

+/** Compact summary of a past chat session for medium-term memory retrieval. */
+export interface RecentSessionSummary {
+    id: string; // embedded in the emitted chunk id (`mtm-<index>-<id>`)
+    title: string; // always tokenized for matching; heads the rendered chunk body
+    firstUserMsg: string; // matched and rendered only when `summary` is absent
+    lastAssistantExcerpt: string; // matched and rendered only when `summary` is absent
+    /**
+     * Optional LLM-compressed recap stored at session end (~200 chars).
+     * When present, retrieval uses this instead of the firstUserMsg+tail
+     * fragment because it actually captures the decision/outcome.
+     */
+    summary?: string;
+    timestamp: number; // epoch milliseconds (compared with Date.now()); a falsy value gets no recency bonus
+}
+
 interface RetrievalOptions {
    brain: BrainProfile;
    memoryManager: MemoryManager;
@@ -44,6 +61,26 @@ interface RetrievalOptions {
     * silently dropped by the caller (see `agentKnowledgeMap.resolveScopeForAgent`).
     */
    scopeFolders?: string[];
+    /**
+     * Compact summaries of recently-touched chat sessions (excluding the
+     * active one). Scored against the query and the top `mediumTermLimit`
+     * are injected as medium-term memory chunks. Caller pre-computes these
+     * to avoid threading vscode/ExtensionContext through this module.
+     */
+    recentSessions?: RecentSessionSummary[];
+    /** Max number of medium-term session chunks to include after scoring. */
+    mediumTermLimit?: number;
+    /**
+     * Optional query embedding for hybrid (sparse+dense) brain search. When
+     * provided, each candidate file's cached embedding is cosine-matched and
+     * blended with the TF-IDF score by `embeddingBlendAlpha`. Caller computes
+     * this once per turn so we don't pay the embedding RTT inside scoring.
+     */
+    queryEmbedding?: number[];
+    /** Embedding model name (used as a cache key on the brain index side). */
+    embeddingModel?: string;
+    /** Blend weight: 0 = TF-IDF only, 1 = cosine only. Omitted or 0 skips blending here; any other default (e.g. 0.5) must be set by the caller. */
+    embeddingBlendAlpha?: number;
 }

 export class RetrievalOrchestrator {
@@ -60,7 +97,7 @@ export class RetrievalOrchestrator {
        fusionLog.push(`Query tokens: [${queryTokens.slice(0, 10).join(', ')}]`);
        fusionLog.push(`Expanded tokens: [${expandedTokens.slice(0, 15).join(', ')}]`);

-        // ── ① Brain File Search (TF-IDF enhanced) ──
+        // ── ① Brain File Search (TF-IDF enhanced, optionally hybrid with embeddings) ──
        const scopeFolders = options.scopeFolders ?? [];
        const brainChunks = this.searchBrainFiles(
            query,
@@ -68,7 +105,10 @@ export class RetrievalOrchestrator {
            options.brain,
            options.brainFileLimit || 8,
            options.includeRawConversations || false,
-            scopeFolders
+            scopeFolders,
+            options.queryEmbedding,
+            options.embeddingModel,
+            options.embeddingBlendAlpha
        );
        allChunks.push(...brainChunks);
        fusionLog.push(
@@ -87,6 +127,15 @@ export class RetrievalOrchestrator {
        allChunks.push(...memoryChunks);
        fusionLog.push(`Memory search: ${memoryChunks.length} chunks found`);

+        // ── ②-b Medium-Term Memory (recent sessions) ──
+        const mediumChunks = this.scoreRecentSessions(
+            expandedTokens,
+            options.recentSessions || [],
+            options.mediumTermLimit ?? 0
+        );
+        allChunks.push(...mediumChunks);
+        fusionLog.push(`Medium-term sessions: ${mediumChunks.length} chunks selected`);
+
        // ── ③ Result Fusion — normalize scores across sources ──
        this.normalizeScores(allChunks);
        fusionLog.push(`Total chunks before budget: ${allChunks.length}`);
@@ -129,7 +178,10 @@ export class RetrievalOrchestrator {
        brain: BrainProfile,
        limit: number,
        includeRaw: boolean,
-        scopeFolders: string[] = []
+        scopeFolders: string[] = [],
+        queryEmbedding?: number[],
+        embeddingModel?: string,
+        embeddingBlendAlpha?: number,
    ): RetrievalChunk[] {
        try {
            const scoped = (file: string) => scopeFolders.length === 0
@@ -155,6 +207,34 @@ export class RetrievalOrchestrator {
                }))
            );

+            // Hybrid blend: when the caller provided a query embedding and an
+            // embedding model, fetch the cached file vectors and blend cosine
+            // similarity into each score. TF-IDF is divided by the top observed
+            // value so both terms sit on a 0..1 scale before blending.
+            // NOTE(review): files without a cached embedding keep their RAW TF-IDF
+            // score, which is not on that 0..1 scale — confirm the mixed ranking is intended.
+            if (queryEmbedding && embeddingModel && (embeddingBlendAlpha ?? 0) > 0) {
+                const alpha = Math.max(0, Math.min(1, embeddingBlendAlpha!));
+                const filePaths = indexed.map((d) => d.filePath);
+                const embeddings = getBrainEmbeddings(brain.localBrainPath, filePaths, embeddingModel);
+                if (embeddings.size > 0) {
+                    const maxTfidf = scored.reduce((m, s) => s.score > m ? s.score : m, 0) || 1;
+                    let hits = 0;
+                    for (const s of scored) {
+                        const fp = indexed[s.index].filePath;
+                        const vec = embeddings.get(fp);
+                        if (!vec) continue;
+                        const cos = cosineSimilarity(queryEmbedding, vec); // [-1, 1] in theory; positive for typical embedding spaces
+                        const tfidfNorm = s.score / maxTfidf;
+                        s.score = (1 - alpha) * tfidfNorm + alpha * Math.max(0, cos);
+                        hits++;
+                    }
+                    if (hits > 0) {
+                        // Re-sort downstream is handled by the .filter().sort() that follows.
+                    }
+                }
+            }
+
            // Always consider lesson cards for the top slots even if they didn't crack the raw-score top-`limit`:
            // they're short, high-signal, and we want them surfaced when relevant. We keep the regular top-`limit`
            // and additively pull in up to a few lesson cards (deduped by index).
@@ -180,12 +260,20 @@ export class RetrievalOrchestrator {
                // Only the chosen files are actually read off disk (for excerpt extraction).
                let content = '';
                try { content = fs.readFileSync(doc.filePath, 'utf8'); } catch { /* deleted just now — skip */ continue; }
-                // Lesson cards: hand back the whole card (they're meant to be short) so the Prevention Checklist
-                // survives; fall back to a generous excerpt for long ones. Regular notes: the usual 400-char excerpt.
+                // Lesson cards: extract just the high-signal sections (Mistake / Root Cause / Fix /
+                // Prevention Checklist) instead of dumping the whole 2500-char card. Old lessons
+                // without those headings fall back to a query-targeted excerpt. Cuts retrieval tokens
+                // by ~70% per lesson without losing the guardrail content.
+                //
+                // Regular notes: pick the best heading-bounded section for the query (markdown
+                // section retrieval) so that long notes don't dump their intro/setup blocks just
+                // because they happen to be in the top 400 chars. Falls back to keyword-window
+                // extraction inside the section, or whole-doc extraction when there are no
+                // headings at all.
                const excerpt = isLesson
-                    ? (content.length <= 2500 ? content.trim() : extractBestExcerpt(content, expandedTokens, 1500))
-                    : extractBestExcerpt(content, expandedTokens, 400);
-                const cap = isLesson ? 2500 : 400;
+                    ? extractLessonEssence(content, 1200) || extractBestExcerpt(content, expandedTokens, 1200)
+                    : extractBestSection(content, expandedTokens, 600);
+                const cap = isLesson ? 1200 : 600;
                topResults.push({
                    id: `brain-${s.index}`,
                    source: 'brain-memory' as const,
@@ -287,6 +375,70 @@ export class RetrievalOrchestrator {
        return chunks;
    }

+    // ─── Medium-Term: Recent Sessions ───
+
+    /**
+     * Rank caller-supplied session summaries against the current query using
+     * plain token overlap and return at most `limit` of them as
+     * `medium-term-memory` chunks.
+     *
+     * Only sessions sharing at least one token with the query are eligible; the
+     * recency bonus (at most 0.1) only orders sessions with equal overlap and
+     * never admits a session on its own.
+     *
+     * @param expandedTokens Query tokens; tokens shorter than 2 characters are ignored.
+     * @param sessions Candidate summaries supplied by the caller.
+     * @param limit Maximum number of chunks returned; values <= 0 yield none.
+     * @returns Chunks ordered by descending score (overlap count plus recency bonus).
+     */
+    private scoreRecentSessions(
+        expandedTokens: string[],
+        sessions: RecentSessionSummary[],
+        limit: number,
+    ): RetrievalChunk[] {
+        if (!sessions || sessions.length === 0 || limit <= 0) return [];
+        const qSet = new Set(expandedTokens.filter((t) => t.length >= 2));
+        const scored = sessions.map((s) => {
+            // Match against the stored recap when there is one; otherwise against
+            // the title plus the first-user / last-assistant fragments.
+            const text = s.summary
+                ? `${s.title}\n${s.summary}`
+                : `${s.title}\n${s.firstUserMsg}\n${s.lastAssistantExcerpt}`;
+            const docTokens = tokenize(text);
+            let overlap = 0;
+            for (const t of docTokens) if (qSet.has(t)) overlap++;
+            // Recency bonus: up to +0.1 for sessions under 7 days old, 0 beyond
+            // that. A missing/zero timestamp counts as old (no bonus).
+            const ageDays = s.timestamp ? Math.max(0, (Date.now() - s.timestamp) / 86400000) : 999;
+            const recency = ageDays < 7 ? (7 - ageDays) / 70 : 0;
+            return { s, overlap, score: overlap + recency };
+        }).filter((x) => x.overlap > 0); // gate on overlap: the bonus alone must not admit a session
+        scored.sort((a, b) => b.score - a.score);
+        const picked = scored.slice(0, limit);
+        if (picked.length === 0) return [];
+        return picked.map(({ s, score }, idx) => {
+            const dateStr = s.timestamp ? new Date(s.timestamp).toISOString().slice(0, 10) : '';
+            // Render the recap when present; otherwise the labelled raw fragments
+            // (empty fragments are dropped).
+            const body = s.summary
+                ? [`**${s.title}**${dateStr ? ` (${dateStr})` : ''}`, s.summary].join('\n')
+                : [
+                    `**${s.title}**${dateStr ? ` (${dateStr})` : ''}`,
+                    s.firstUserMsg ? `사용자 요청: ${s.firstUserMsg}` : '',
+                    s.lastAssistantExcerpt ? `이전 답변 마지막 부분: …${s.lastAssistantExcerpt}` : '',
+                ].filter(Boolean).join('\n');
+            return {
+                id: `mtm-${idx}-${s.id}`,
+                source: 'medium-term-memory',
+                title: s.title || '(untitled session)',
+                content: body,
+                score,
+                tokenEstimate: estimateTokens(body),
+                metadata: { category: 'medium-term', lastUpdated: s.timestamp },
+            };
+        });
+    }
+
    // ─── Score Normalization ───

    /**
@@ -315,6 +467,7 @@ export class RetrievalOrchestrator {
            'project-memory': 0.85,
            'long-term-memory': 0.8,
            'procedural-memory': 0.95,  // Procedural is highly specific
+            'medium-term-memory': 0.78, // recent sessions: useful when the user references "last time / yesterday"
            'episodic-memory': 0.7,
            'project-scan': 0.6,
            'recent-knowledge': 0.75