refactor: optimize core engine and retrieval logic for v2.80.43

2026-05-13 19:23:57 +09:00
parent c4260466b9
commit 089abf22db
17 changed files with 1311 additions and 88 deletions
@@ -17,7 +17,10 @@ import { tokenize, countConflictIndicators } from './scoring';
 import { detectLessonKind } from './lessonHelpers';
 import { logInfo } from '../utils';

-const INDEX_VERSION = 3;
+// v4 adds optional per-file `embedding` for hybrid (sparse+dense) retrieval.
+// Older v3 indexes are auto-rebuilt on first load — no migration needed because
+// the cache is derivable from the brain itself.
+const INDEX_VERSION = 4;
 const INDEX_DIR = '.astra';
 const INDEX_FILE = 'brain-index.json';
 /** 인덱스가 이 개수를 넘으면 이번 스캔에서 못 본 항목을 정리합니다 (삭제된 파일 누적 방지). */
@@ -34,6 +37,14 @@ interface IndexEntry {
    titleTokens: string[];  // tokenize(title)
    conflictCount: number;  // countConflictIndicators(`${title} ${content}`)
    kind: string;           // '' for an ordinary note, else 'lesson' | 'playbook' | 'qa-finding'
+    /**
+     * Dense embedding for hybrid retrieval. Populated lazily by a background
+     * pass after the file is tokenized — TF-IDF queries don't wait on it.
+     * Cleared when mtimeMs/size change because the content moved on.
+     */
+    embedding?: number[];
+    /** Embedding model the vector was produced with — invalidates the vector when the user switches models. */
+    embeddingModel?: string;
 }

 interface PersistedIndex {
@@ -212,6 +223,93 @@ export function getBrainTokenIndex(brainPath: string, files: string[]): IndexedB
    return out;
 }

+/**
+ * Look up cached dense vectors for a set of brain files.
+ *
+ * Only the in-memory index state registered for `brainPath` is consulted. A file
+ * is included when its entry holds a non-empty array embedding whose
+ * `embeddingModel` equals `model`; every other file is left out without error,
+ * so callers can cosine-rank whichever candidates happen to be embedded.
+ *
+ * @param brainPath Key of the in-memory index state to read.
+ * @param filePaths Files to look up.
+ * @param model Embedding model the cached vectors must have been produced with.
+ * @returns Map from file path to cached vector; empty when an argument is blank
+ *          or no index state exists for `brainPath`.
+ */
+export function getBrainEmbeddings(brainPath: string, filePaths: string[], model: string): Map<string, number[]> {
+    const found = new Map<string, number[]>();
+    const usable = Boolean(brainPath) && model.trim() !== '' && Array.isArray(filePaths) && filePaths.length > 0;
+    if (!usable) return found;
+    const state = _states.get(brainPath);
+    if (!state) return found;
+    for (const filePath of filePaths) {
+        const cached = state.index.entries[filePath];
+        const vector = cached?.embedding;
+        // A vector only counts when it is a real, non-empty array produced by the requested model.
+        if (Array.isArray(vector) && vector.length > 0 && cached?.embeddingModel === model) {
+            found.set(filePath, vector);
+        }
+    }
+    return found;
+}
+
+/**
+ * Background fill: embed every file in `filePaths` that has an index entry but
+ * no usable vector for `model`, then schedule a write of the index.
+ *
+ * `embedFn` is called exactly once with all pending texts; callers that want
+ * smaller requests should chunk `filePaths` themselves. Designed to be
+ * fire-and-forget: failures are logged and swallowed.
+ *
+ * Vectors are matched to files by position, so the result of `embedFn` is only
+ * applied when it contains exactly one vector per submitted text. A short or
+ * long result is discarded rather than risk attaching a vector to the wrong file.
+ *
+ * @param brainPath Key of the in-memory index state to update.
+ * @param filePaths Candidate files; paths without an index entry are ignored.
+ * @param model Embedding model name recorded alongside each stored vector.
+ * @param embedFn Produces one vector per input text, in input order.
+ * @returns Number of entries that actually received a new vector (0 when
+ *          everything was cached, the arguments are blank, or the call failed).
+ */
+export async function backfillBrainEmbeddings(
+    brainPath: string,
+    filePaths: string[],
+    model: string,
+    embedFn: (texts: string[]) => Promise<number[][]>,
+): Promise<number> {
+    if (!brainPath || !model.trim() || !Array.isArray(filePaths) || filePaths.length === 0) return 0;
+    const st = _states.get(brainPath);
+    if (!st) return 0;
+    const texts: string[] = [];
+    const keys: string[] = [];
+    const seen = new Set<string>();
+    for (const fp of filePaths) {
+        // Skip duplicate paths so the same file is not embedded twice in one call.
+        if (seen.has(fp)) continue;
+        seen.add(fp);
+        const entry = st.index.entries[fp];
+        if (!entry) continue;
+        // Same "usable vector" test as getBrainEmbeddings: an empty array does not count,
+        // otherwise such an entry would be skipped here and rejected there forever.
+        const hasCurrentVector = Array.isArray(entry.embedding)
+            && entry.embedding.length > 0
+            && entry.embeddingModel === model;
+        if (hasCurrentVector) continue;
+        // Build the embedding input from cached tokens (cheaper than re-reading the
+        // file); read the file only when the entry has no tokens.
+        let text = '';
+        if (Array.isArray(entry.tokens) && entry.tokens.length > 0) {
+            text = `${entry.title}\n${entry.tokens.join(' ')}`;
+        } else {
+            try { text = fs.readFileSync(fp, 'utf8'); } catch { continue; }
+        }
+        if (!text.trim()) continue;
+        texts.push(text);
+        keys.push(fp);
+    }
+    if (texts.length === 0) return 0;
+    try {
+        const vectors = await embedFn(texts);
+        if (!Array.isArray(vectors) || vectors.length !== keys.length) {
+            logInfo('Brain embedding backfill skipped: vector count does not match input count.', {
+                brainPath,
+                model,
+                expected: keys.length,
+                received: Array.isArray(vectors) ? vectors.length : 0,
+            });
+            return 0;
+        }
+        let embedded = 0;
+        for (let i = 0; i < keys.length; i++) {
+            const v = vectors[i];
+            if (!Array.isArray(v) || v.length === 0) continue;
+            // Re-read the entry: it may have been dropped while the request was in flight.
+            const entry = st.index.entries[keys[i]];
+            if (!entry) continue;
+            entry.embedding = v;
+            entry.embeddingModel = model;
+            embedded++;
+        }
+        if (embedded > 0) {
+            st.dirty = true;
+            logInfo('Brain embeddings backfilled.', { brainPath, model, embedded });
+            scheduleWrite(st, brainPath);
+        }
+        return embedded;
+    } catch (e: unknown) {
+        const error = e instanceof Error ? e.message : String(e);
+        logInfo('Brain embedding backfill failed (TF-IDF still works).', { brainPath, model, error });
+        return 0;
+    }
+}
+
 /** Drop the in-memory index (and pending write) for one brain, or all brains. The disk file is left as-is. */
 export function clearBrainTokenIndex(brainPath?: string): void {
    if (brainPath === undefined) {
@@ -101,6 +101,7 @@ export function assembleContext(chunks: RetrievalChunk[]): string {
        'brain-trace': '📚 Second Brain Knowledge',
        'brain-memory': '📚 Brain Knowledge',
        'long-term-memory': '🧠 Long-Term Memory (사용자 규칙/결정)',
+        'medium-term-memory': '🗂️ Medium-Term Memory (최근 세션 요약)',
        'project-memory': '📂 Project Memory (프로젝트 컨텍스트)',
        'procedural-memory': '📋 Procedural Memory (반복 절차)',
        'episodic-memory': '📖 Episodic Memory (과거 대화 흐름)',
@@ -0,0 +1,167 @@
+/**
+ * ============================================================
+ * Embeddings — local hybrid (sparse + dense) retrieval support
+ *
+ * TF-IDF is fast and zero-cost but misses synonyms / paraphrase. A small local
+ * embedding model (BGE-small, multilingual-e5-small, nomic-embed-text, …)
+ * loaded in LM Studio or Ollama bridges that gap without sending anything
+ * off the machine.
+ *
+ * Design choices:
+ *   - Opt-in via g1nation.embeddingModel (empty = disabled). We don't auto-
+ *     pick a model because the user has to load it in LM Studio/Ollama first.
+ *   - Calls are best-effort: a missing model / network blip falls back to
+ *     pure TF-IDF without breaking the query.
+ *   - We never block retrieval on embedding work. Missing-file embeddings are
+ *     populated by a separate fire-and-forget pass after the TF-IDF answer
+ *     ships, so the *next* query benefits.
+ *
+ * Numerical format:
+ *   - Vectors are `number[]` (not Float32Array) so they JSON-serialize for
+ *     the brain-index cache without per-element conversion. The hot loop
+ *     (cosine) is small enough that the extra precision is irrelevant to
+ *     throughput on typical brain sizes.
+ * ============================================================
+ */
+
+import { resolveEngine, buildApiUrl, logError, logInfo } from '../utils';
+
+/** Maximum characters of a single text chunk fed to the embedding model. */
+const EMBED_INPUT_CAP = 4000;
+/** Maximum texts per embedding API call. */
+const BATCH_SIZE = 16;
+/** Request timeout for one embedding batch. */
+const REQ_TIMEOUT_MS = 30000;
+
+/** Connection settings shared by `embedTexts` and `embedQuery`. */
+export interface EmbeddingCallOptions {
+    /**
+     * OpenAI-compatible base URL (e.g. http://127.0.0.1:1234 for LM Studio).
+     * Used both to pick the engine and to build the embeddings endpoint URL.
+     */
+    baseUrl: string;
+    /**
+     * Embedding model name as registered in LM Studio / Ollama. A blank value
+     * disables embedding: `embedTexts` throws and `embedQuery` returns `undefined`.
+     */
+    model: string;
+    /** AbortSignal for cancellation propagation. */
+    signal?: AbortSignal;
+}
+
+/**
+ * Embed a list of texts, `BATCH_SIZE` at a time, and return one vector per
+ * input in input order.
+ *
+ * Every text is clipped with `clipForEmbedding` before it is sent. Each batch
+ * request is bounded by `REQ_TIMEOUT_MS` and is also aborted as soon as
+ * `opts.signal` fires, so a caller-supplied signal no longer disables the timeout.
+ *
+ * The engine (LM Studio vs Ollama) is chosen by `resolveEngine(opts.baseUrl)`
+ * and only affects the endpoint URL; the request payload is the same for both.
+ *
+ * @param texts Texts to embed; an empty list returns `[]` without a request.
+ * @param opts Endpoint, model and optional cancellation signal.
+ * @returns One vector per input text, aligned by index.
+ * @throws Error when the model is blank, the endpoint answers with a non-2xx
+ *         status, the request is aborted or times out, or a batch does not
+ *         yield exactly one vector per input. Callers wrap the call in
+ *         try/catch and fall back to TF-IDF.
+ */
+export async function embedTexts(texts: string[], opts: EmbeddingCallOptions): Promise<number[][]> {
+    if (!opts.model.trim()) throw new Error('Embedding model not configured.');
+    if (!texts || texts.length === 0) return [];
+    const engine = resolveEngine(opts.baseUrl);
+    const url = buildApiUrl(opts.baseUrl, engine, 'embeddings');
+    const out: number[][] = [];
+    for (let i = 0; i < texts.length; i += BATCH_SIZE) {
+        const batch = texts.slice(i, i + BATCH_SIZE).map((t) => clipForEmbedding(t));
+        // Array input is sent to both engines (the original code notes Ollama 0.1.30+ accepts it).
+        const body = { model: opts.model, input: batch };
+        // One controller per batch carries both the timeout and the caller's cancellation.
+        const controller = new AbortController();
+        const forwardAbort = () => controller.abort();
+        if (opts.signal?.aborted) controller.abort();
+        else opts.signal?.addEventListener('abort', forwardAbort, { once: true });
+        const timer = setTimeout(() => controller.abort(), REQ_TIMEOUT_MS);
+        try {
+            const response = await fetch(url, {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify(body),
+                signal: controller.signal,
+            });
+            if (!response.ok) {
+                const errText = await response.text().catch(() => '');
+                throw new Error(`Embedding endpoint returned ${response.status}: ${errText.slice(0, 200)}`);
+            }
+            const vectors = vectorsFromEmbeddingResponse(await response.json());
+            // Results are consumed by position, so a short or long batch would silently
+            // attach vectors to the wrong texts. Fail loudly instead.
+            if (vectors.length !== batch.length) {
+                throw new Error(`Embedding endpoint returned ${vectors.length} vectors for ${batch.length} inputs.`);
+            }
+            out.push(...vectors);
+        } finally {
+            clearTimeout(timer);
+            opts.signal?.removeEventListener('abort', forwardAbort);
+        }
+    }
+    return out;
+}
+
+/**
+ * Pull the vectors out of an embeddings response body. Recognised shapes:
+ *   - OpenAI-compatible: `{ data: [{ embedding: [...] }, ...] }`
+ *   - Ollama (newer):    `{ embeddings: [[...], ...] }`
+ *   - Ollama (single):   `{ embedding: [...] }`
+ * Rows that are not arrays are dropped; the caller detects that via the count check.
+ */
+function vectorsFromEmbeddingResponse(json: unknown): number[][] {
+    const payload = (json ?? {}) as { data?: unknown; embeddings?: unknown; embedding?: unknown };
+    if (Array.isArray(payload.data)) {
+        return payload.data
+            .map((row: unknown) => (row as { embedding?: unknown } | null)?.embedding)
+            .filter((v: unknown): v is number[] => Array.isArray(v));
+    }
+    if (Array.isArray(payload.embeddings)) {
+        return payload.embeddings.filter((v: unknown): v is number[] => Array.isArray(v));
+    }
+    if (Array.isArray(payload.embedding)) return [payload.embedding as number[]];
+    return [];
+}
+
+/**
+ * Cosine similarity of two vectors of the same dimension.
+ *
+ * Returns 0 when the vectors cannot be compared meaningfully: either one is
+ * missing or empty, their lengths differ (e.g. vectors from different models),
+ * either has zero magnitude, or the result is not a finite number.
+ *
+ * @param a First vector.
+ * @param b Second vector.
+ * @returns A value in [-1, 1], or 0 for incomparable input.
+ */
+export function cosineSimilarity(a: number[], b: number[]): number {
+    if (!a || !b || a.length === 0 || a.length !== b.length) return 0;
+    let dot = 0, na = 0, nb = 0;
+    for (let i = 0; i < a.length; i++) {
+        const va = a[i], vb = b[i];
+        dot += va * vb;
+        na += va * va;
+        nb += vb * vb;
+    }
+    if (na === 0 || nb === 0) return 0;
+    const cos = dot / (Math.sqrt(na) * Math.sqrt(nb));
+    // NaN/Infinity components would otherwise leak NaN into score sorting.
+    return Number.isFinite(cos) ? cos : 0;
+}
+
+/** Truncate `text` to at most `EMBED_INPUT_CAP` characters; empty or missing input yields ''. */
+function clipForEmbedding(text: string): string {
+    // slice() returns the string unchanged when it is already within the cap.
+    return text ? text.slice(0, EMBED_INPUT_CAP) : '';
+}
+
+/**
+ * Tiny LRU for query embeddings: typing the same query twice (or retrying)
+ * shouldn't re-hit the embedding endpoint. Keyed on `model + text`.
+ *
+ * Capped at QUERY_CACHE_MAX entries; the least recently used entry is evicted.
+ * Recency is tracked through the Map's insertion order, so both reads and
+ * writes re-insert the key to move it to the "most recent" end. Strictly
+ * process-local (no disk persistence).
+ */
+const QUERY_CACHE_MAX = 32;
+const _queryCache = new Map<string, number[]>();
+/** Build the cache key for a (model, query text) pair. */
+function queryCacheKey(model: string, text: string): string { return `${model}|${text}`; }
+/**
+ * Return the cached vector for `text` under `model`, marking it as most
+ * recently used, or `undefined` on a miss.
+ */
+export function getCachedQueryEmbedding(model: string, text: string): number[] | undefined {
+    const k = queryCacheKey(model, text);
+    const v = _queryCache.get(k);
+    if (!v) return undefined;
+    // refresh recency
+    _queryCache.delete(k);
+    _queryCache.set(k, v);
+    return v;
+}
+/**
+ * Store (or replace) the vector for `text` under `model` as the most recently
+ * used entry, evicting the least recently used one when the cap is exceeded.
+ */
+export function setCachedQueryEmbedding(model: string, text: string, vec: number[]): void {
+    const k = queryCacheKey(model, text);
+    // Map.set on an existing key keeps its old position; delete first so an
+    // overwritten entry is not mistaken for the oldest one.
+    _queryCache.delete(k);
+    _queryCache.set(k, vec);
+    if (_queryCache.size > QUERY_CACHE_MAX) {
+        const oldest = _queryCache.keys().next().value;
+        if (oldest !== undefined) _queryCache.delete(oldest);
+    }
+}
+
+/**
+ * Embed one query string, consulting the in-process query cache first.
+ *
+ * A blank model or blank text short-circuits to `undefined`. Any failure of
+ * the embedding call is logged and also reported as `undefined`, which
+ * callers treat as "no semantic scoring this turn, use TF-IDF only".
+ *
+ * @param text Query text; also used verbatim as part of the cache key.
+ * @param opts Endpoint, model and optional cancellation signal.
+ * @returns The query vector, or `undefined` when none could be produced.
+ */
+export async function embedQuery(text: string, opts: EmbeddingCallOptions): Promise<number[] | undefined> {
+    const model = opts.model;
+    if (model.trim() === '' || text.trim() === '') return undefined;
+    const remembered = getCachedQueryEmbedding(model, text);
+    if (remembered) return remembered;
+    try {
+        const vectors = await embedTexts([text], opts);
+        const first = vectors[0];
+        if (first && first.length > 0) {
+            setCachedQueryEmbedding(model, text, first);
+            logInfo('Query embedding computed.', { model, dim: first.length });
+            return first;
+        }
+    } catch (e: any) {
+        logError('Query embedding failed.', { model, error: e?.message ?? String(e) });
+    }
+    return undefined;
+}
@@ -19,15 +19,32 @@ import { findBrainFiles, summarizeText } from '../utils';
 import { isInside } from '../lib/paths';
 import { MemoryManager } from '../memory';
 import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
-import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
+import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt, extractBestSection } from './scoring';
 import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
-import { getBrainTokenIndex } from './brainIndex';
+import { getBrainTokenIndex, getBrainEmbeddings } from './brainIndex';
+import { extractLessonEssence } from './lessonHelpers';
+import { cosineSimilarity } from './embeddings';

 export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
 export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
 export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
 export * from './types';

+/** Compact summary of a past chat session for medium-term memory retrieval. */
+export interface RecentSessionSummary {
+    /** Session identifier; embedded in the id of the retrieval chunk built from this summary. */
+    id: string;
+    /** Session title; always part of the text scored against the query and used as the chunk heading. */
+    title: string;
+    /** First user message of the session; scored and shown only when `summary` is absent. */
+    firstUserMsg: string;
+    /** Excerpt of the last assistant reply; scored and shown only when `summary` is absent. */
+    lastAssistantExcerpt: string;
+    /**
+     * Optional LLM-compressed recap stored at session end (~200 chars).
+     * When present, retrieval uses this instead of the firstUserMsg+tail
+     * fragment because it actually captures the decision/outcome.
+     */
+    summary?: string;
+    /**
+     * Epoch milliseconds (it is compared against `Date.now()`). Drives the
+     * recency boost and the date label; a falsy value disables both.
+     */
+    timestamp: number;
+}
+
 interface RetrievalOptions {
    brain: BrainProfile;
    memoryManager: MemoryManager;
@@ -44,6 +61,26 @@ interface RetrievalOptions {
     * silently dropped by the caller (see `agentKnowledgeMap.resolveScopeForAgent`).
     */
    scopeFolders?: string[];
+    /**
+     * Compact summaries of recently-touched chat sessions (excluding the
+     * active one). Scored against the query and the top `mediumTermLimit`
+     * are injected as medium-term memory chunks. Caller pre-computes these
+     * to avoid threading vscode/ExtensionContext through this module.
+     */
+    recentSessions?: RecentSessionSummary[];
+    /** Max number of medium-term session chunks to include after scoring. */
+    mediumTermLimit?: number;
+    /**
+     * Optional query embedding for hybrid (sparse+dense) brain search. When
+     * provided, each candidate file's cached embedding is cosine-matched and
+     * blended with the TF-IDF score by `embeddingBlendAlpha`. Caller computes
+     * this once per turn so we don't pay the embedding RTT inside scoring.
+     */
+    queryEmbedding?: number[];
+    /** Embedding model name (used as a cache key on the brain index side). */
+    embeddingModel?: string;
+    /** Blend weight: 0 = TF-IDF only, 1 = cosine only. Default 0.5. */
+    embeddingBlendAlpha?: number;
 }

 export class RetrievalOrchestrator {
@@ -60,7 +97,7 @@ export class RetrievalOrchestrator {
        fusionLog.push(`Query tokens: [${queryTokens.slice(0, 10).join(', ')}]`);
        fusionLog.push(`Expanded tokens: [${expandedTokens.slice(0, 15).join(', ')}]`);

-        // ── ① Brain File Search (TF-IDF enhanced) ──
+        // ── ① Brain File Search (TF-IDF enhanced, optionally hybrid with embeddings) ──
        const scopeFolders = options.scopeFolders ?? [];
        const brainChunks = this.searchBrainFiles(
            query,
@@ -68,7 +105,10 @@ export class RetrievalOrchestrator {
            options.brain,
            options.brainFileLimit || 8,
            options.includeRawConversations || false,
-            scopeFolders
+            scopeFolders,
+            options.queryEmbedding,
+            options.embeddingModel,
+            options.embeddingBlendAlpha
        );
        allChunks.push(...brainChunks);
        fusionLog.push(
@@ -87,6 +127,15 @@ export class RetrievalOrchestrator {
        allChunks.push(...memoryChunks);
        fusionLog.push(`Memory search: ${memoryChunks.length} chunks found`);

+        // ── ②-b Medium-Term Memory (recent sessions) ──
+        const mediumChunks = this.scoreRecentSessions(
+            expandedTokens,
+            options.recentSessions || [],
+            options.mediumTermLimit ?? 0
+        );
+        allChunks.push(...mediumChunks);
+        fusionLog.push(`Medium-term sessions: ${mediumChunks.length} chunks selected`);
+
        // ── ③ Result Fusion — normalize scores across sources ──
        this.normalizeScores(allChunks);
        fusionLog.push(`Total chunks before budget: ${allChunks.length}`);
@@ -129,7 +178,10 @@ export class RetrievalOrchestrator {
        brain: BrainProfile,
        limit: number,
        includeRaw: boolean,
-        scopeFolders: string[] = []
+        scopeFolders: string[] = [],
+        queryEmbedding?: number[],
+        embeddingModel?: string,
+        embeddingBlendAlpha?: number,
    ): RetrievalChunk[] {
        try {
            const scoped = (file: string) => scopeFolders.length === 0
@@ -155,6 +207,34 @@ export class RetrievalOrchestrator {
                }))
            );

+            // Hybrid blend: when the caller provided a query embedding and an
+            // embedding model, fetch the cached file vectors and add a cosine
+            // similarity term to each score. We normalise TF-IDF scores by the
+            // top observed value so the two terms live on the same scale before
+            // blending. Files without a cached embedding keep their pure TF-IDF
+            // score so adding/missing embeddings doesn't hurt retrieval.
+            if (queryEmbedding && embeddingModel && (embeddingBlendAlpha ?? 0) > 0) {
+                const alpha = Math.max(0, Math.min(1, embeddingBlendAlpha!));
+                const filePaths = indexed.map((d) => d.filePath);
+                const embeddings = getBrainEmbeddings(brain.localBrainPath, filePaths, embeddingModel);
+                if (embeddings.size > 0) {
+                    const maxTfidf = scored.reduce((m, s) => s.score > m ? s.score : m, 0) || 1;
+                    let hits = 0;
+                    for (const s of scored) {
+                        const fp = indexed[s.index].filePath;
+                        const vec = embeddings.get(fp);
+                        if (!vec) continue;
+                        const cos = cosineSimilarity(queryEmbedding, vec); // [-1, 1] in theory; positive for typical embedding spaces
+                        const tfidfNorm = s.score / maxTfidf;
+                        s.score = (1 - alpha) * tfidfNorm + alpha * Math.max(0, cos);
+                        hits++;
+                    }
+                    if (hits > 0) {
+                        // Re-sort downstream is handled by the .filter().sort() that follows.
+                    }
+                }
+            }
+
            // Always consider lesson cards for the top slots even if they didn't crack the raw-score top-`limit`:
            // they're short, high-signal, and we want them surfaced when relevant. We keep the regular top-`limit`
            // and additively pull in up to a few lesson cards (deduped by index).
@@ -180,12 +260,20 @@ export class RetrievalOrchestrator {
                // Only the chosen files are actually read off disk (for excerpt extraction).
                let content = '';
                try { content = fs.readFileSync(doc.filePath, 'utf8'); } catch { /* deleted just now — skip */ continue; }
-                // Lesson cards: hand back the whole card (they're meant to be short) so the Prevention Checklist
-                // survives; fall back to a generous excerpt for long ones. Regular notes: the usual 400-char excerpt.
+                // Lesson cards: extract just the high-signal sections (Mistake / Root Cause / Fix /
+                // Prevention Checklist) instead of dumping the whole 2500-char card. Old lessons
+                // without those headings fall back to a query-targeted excerpt. Cuts retrieval tokens
+                // by ~70% per lesson without losing the guardrail content.
+                //
+                // Regular notes: pick the best heading-bounded section for the query (markdown
+                // section retrieval) so that long notes don't dump their intro/setup blocks just
+                // because they happen to be in the top 400 chars. Falls back to keyword-window
+                // extraction inside the section, or whole-doc extraction when there are no
+                // headings at all.
                const excerpt = isLesson
-                    ? (content.length <= 2500 ? content.trim() : extractBestExcerpt(content, expandedTokens, 1500))
-                    : extractBestExcerpt(content, expandedTokens, 400);
-                const cap = isLesson ? 2500 : 400;
+                    ? extractLessonEssence(content, 1200) || extractBestExcerpt(content, expandedTokens, 1200)
+                    : extractBestSection(content, expandedTokens, 600);
+                const cap = isLesson ? 1200 : 600;
                topResults.push({
                    id: `brain-${s.index}`,
                    source: 'brain-memory' as const,
@@ -287,6 +375,70 @@ export class RetrievalOrchestrator {
        return chunks;
    }

+    // ─── Medium-Term: Recent Sessions ───
+
+    /**
+     * Rank caller-supplied session summaries against the query and convert the
+     * best `limit` of them into medium-term memory chunks.
+     *
+     * Scoring is a plain token-overlap count (every document token found in the
+     * expanded query counts once per occurrence) plus a recency bonus of at most
+     * 0.1 that decays linearly to 0 over 7 days. Sessions scoring 0 are dropped;
+     * note that a session newer than 7 days has a positive bonus and therefore
+     * passes the filter even with no token overlap.
+     *
+     * The scored text and the chunk body use the LLM-written `summary` when it
+     * exists, otherwise the first user message plus the last assistant excerpt.
+     *
+     * @param expandedTokens Expanded query tokens; tokens shorter than 2 chars are ignored.
+     * @param sessions Candidate session summaries.
+     * @param limit Maximum number of chunks to return; `<= 0` returns nothing.
+     * @returns Chunks sorted by descending score.
+     */
+    private scoreRecentSessions(
+        expandedTokens: string[],
+        sessions: RecentSessionSummary[],
+        limit: number,
+    ): RetrievalChunk[] {
+        if (!sessions || sessions.length === 0 || limit <= 0) return [];
+        const queryTerms = new Set(expandedTokens.filter((term) => term.length >= 2));
+        const ranked: Array<{ session: RecentSessionSummary; score: number }> = [];
+        for (const session of sessions) {
+            // Match against the recap when there is one; otherwise against the head/tail fragments.
+            const haystack = session.summary
+                ? `${session.title}\n${session.summary}`
+                : `${session.title}\n${session.firstUserMsg}\n${session.lastAssistantExcerpt}`;
+            const overlap = tokenize(haystack).reduce((hits, tok) => (queryTerms.has(tok) ? hits + 1 : hits), 0);
+            // Sessions without a timestamp are treated as very old (no bonus).
+            const ageDays = session.timestamp ? Math.max(0, (Date.now() - session.timestamp) / 86400000) : 999;
+            const recency = ageDays < 7 ? (7 - ageDays) / 70 : 0;
+            const score = overlap + recency;
+            if (score > 0) ranked.push({ session, score });
+        }
+        ranked.sort((a, b) => b.score - a.score);
+        return ranked.slice(0, limit).map(({ session, score }, idx) => {
+            const dateLabel = session.timestamp
+                ? ` (${new Date(session.timestamp).toISOString().slice(0, 10)})`
+                : '';
+            const headline = `**${session.title}**${dateLabel}`;
+            // Recap when available; raw fragments (empty ones omitted) otherwise.
+            const lines = session.summary
+                ? [headline, session.summary]
+                : [
+                    headline,
+                    session.firstUserMsg ? `사용자 요청: ${session.firstUserMsg}` : '',
+                    session.lastAssistantExcerpt ? `이전 답변 마지막 부분: …${session.lastAssistantExcerpt}` : '',
+                ].filter(Boolean);
+            const body = lines.join('\n');
+            return {
+                id: `mtm-${idx}-${session.id}`,
+                source: 'medium-term-memory',
+                title: session.title || '(untitled session)',
+                content: body,
+                score,
+                tokenEstimate: estimateTokens(body),
+                metadata: { category: 'medium-term', lastUpdated: session.timestamp },
+            };
+        });
+    }
+
    // ─── Score Normalization ───

    /**
@@ -315,6 +467,7 @@ export class RetrievalOrchestrator {
            'project-memory': 0.85,
            'long-term-memory': 0.8,
            'procedural-memory': 0.95,  // Procedural is highly specific
+            'medium-term-memory': 0.78, // recent sessions: useful when the user references "last time / yesterday"
            'episodic-memory': 0.7,
            'project-scan': 0.6,
            'recent-knowledge': 0.75
@@ -47,6 +47,54 @@ function parseFrontmatterType(content: string): string {
    return m ? m[1].trim().toLowerCase() : '';
 }

+/**
+ * Return the body of the first markdown section whose heading matches
+ * `headingRe`: everything after the matched heading text up to (not including)
+ * the next line that starts with 1-6 `#` followed by whitespace, trimmed.
+ * Yields '' when the heading is not present.
+ */
+function extractSection(content: string, headingRe: RegExp): string {
+    const hit = content.match(headingRe);
+    if (hit === null || hit.index === undefined) return '';
+    const rest = content.slice(hit.index + hit[0].length);
+    const nextHeading = rest.search(/\n#{1,6}\s/);
+    return (nextHeading < 0 ? rest : rest.slice(0, nextHeading)).trim();
+}
+
+/**
+ * Reduce a lesson card to its guardrail sections: Mistake / Risk, Root Cause,
+ * Fix and Prevention Checklist (English or Korean headings, any heading level).
+ * Each section found is re-emitted under a canonical `## ...` heading, in that
+ * fixed order; a section whose whole body is a single `<placeholder>` is skipped.
+ *
+ * When none of those sections exist, the original content itself is returned
+ * (clipped to `maxLen`), which keeps old cards without the template usable.
+ * Consequently the result is never '' for non-empty content, so a
+ * `extractLessonEssence(...) || fallback` expression never reaches its fallback.
+ *
+ * @param content Raw lesson card markdown.
+ * @param maxLen Character budget; longer output is cut and suffixed with "\n…".
+ * @returns The slimmed card, or '' for empty input.
+ */
+export function extractLessonEssence(content: string, maxLen = 1200): string {
+    if (!content) return '';
+    const clip = (text: string): string => text.slice(0, maxLen).trim() + '\n…';
+    const wanted: ReadonlyArray<readonly [string, RegExp]> = [
+        ['## Mistake / Risk', /^#{1,6}\s*(?:mistake\s*\/?\s*risk|mistake|risk|실수|문제)\s*$/im],
+        ['## Root Cause', /^#{1,6}\s*(?:root\s*cause|근본\s*원인|원인)\s*$/im],
+        ['## Fix', /^#{1,6}\s*(?:fix|해결|수정)\s*$/im],
+        ['## Prevention Checklist', /^#{1,6}\s*(?:prevention\s*checklist|prevention|체크리스트|예방\s*체크리스트)\s*$/im],
+    ];
+    const parts: string[] = [];
+    for (const [label, pattern] of wanted) {
+        const body = extractSection(content, pattern);
+        // Skip missing sections and untouched template placeholders such as "<describe here>".
+        if (body === '' || /^<[^>]+>$/.test(body)) continue;
+        parts.push(`${label}\n${body}`);
+    }
+    if (parts.length === 0) {
+        return content.length > maxLen ? clip(content) : content.trim();
+    }
+    const assembled = parts.join('\n\n');
+    return assembled.length > maxLen ? clip(assembled) : assembled;
+}
+
 /** Extract the "## Prevention Checklist" bullet list from a lesson card, if present. */
 export function extractPreventionChecklist(content: string): string[] {
    if (!content) return [];
@@ -316,6 +316,121 @@ export function scoreTfIdfPreTokenized(
    });
 }

+/**
+ * Split markdown content into sections at every ATX heading (`#` … `######`).
+ *
+ * Returned sections are `{ heading, body }`: `heading` is the trimmed heading
+ * line itself (level preserved) and `body` is the trimmed text up to the next
+ * heading of ANY level. Text above the first heading becomes a preamble
+ * section with an empty `heading`.
+ *
+ * Pre-processing:
+ *   - A leading byte-order mark is removed so it cannot hide a first-line heading.
+ *   - Front-matter (a leading `--- … ---` block) is dropped because it is not
+ *     query-relevant; an unterminated block is left in place.
+ *   - Lines inside fenced code blocks (``` or ~~~) are never treated as
+ *     headings, so shell comments such as `# install` do not split a section.
+ *
+ * A document with no headings returns one synthetic section
+ * `{ heading: '', body: <trimmed text> }` so callers can treat the result
+ * uniformly. Empty input returns `[]`.
+ *
+ * Why this exists: retrieval excerpts of long notes were often the file's
+ * intro/setup rather than the part that matched the query. Section-level
+ * retrieval lets the caller pick the relevant heading directly.
+ */
+export interface MarkdownSection {
+    heading: string;
+    body: string;
+}
+export function splitMarkdownSections(content: string): MarkdownSection[] {
+    if (!content) return [];
+    let text = content.charCodeAt(0) === 0xFEFF ? content.slice(1) : content;
+    // Strip frontmatter
+    const opening = /^---\s*\n/.exec(text);
+    if (opening) {
+        // Start at the newline that ends the opening fence so an empty block ("---\n---") closes too.
+        const end = text.indexOf('\n---', opening[0].length - 1);
+        if (end >= 0) text = text.slice(end + 4).replace(/^\s*\n/, '');
+    }
+    const lines = text.split(/\r?\n/);
+    const headingIdx: Array<{ line: number; level: number }> = [];
+    // Currently open code fence (marker character and run length), if any.
+    let fence: { ch: string; len: number } | null = null;
+    for (let i = 0; i < lines.length; i++) {
+        const fenceMatch = /^\s{0,3}(`{3,}|~{3,})/.exec(lines[i]);
+        if (fenceMatch) {
+            const run = fenceMatch[1];
+            if (fence === null) fence = { ch: run[0], len: run.length };
+            else if (run[0] === fence.ch && run.length >= fence.len) fence = null;
+            continue;
+        }
+        if (fence !== null) continue;
+        const m = /^(#{1,6})\s+\S/.exec(lines[i]);
+        if (m) headingIdx.push({ line: i, level: m[1].length });
+    }
+    if (headingIdx.length === 0) {
+        return [{ heading: '', body: text.trim() }];
+    }
+    const sections: MarkdownSection[] = [];
+    // Capture any leading content above the first heading as a "preamble" section.
+    if (headingIdx[0].line > 0) {
+        const preamble = lines.slice(0, headingIdx[0].line).join('\n').trim();
+        if (preamble) sections.push({ heading: '', body: preamble });
+    }
+    for (let i = 0; i < headingIdx.length; i++) {
+        const start = headingIdx[i].line;
+        const end = i + 1 < headingIdx.length ? headingIdx[i + 1].line : lines.length;
+        const heading = lines[start].trim();
+        const body = lines.slice(start + 1, end).join('\n').trim();
+        sections.push({ heading, body });
+    }
+    return sections;
+}
+
+/**
+ * Pick the best heading-bounded section of a markdown document for a query,
+ * then fall back to keyword-window extraction inside that section if the
+ * section itself is still too long.
+ *
+ * Strategy:
+ *   1. Split into sections by heading (`splitMarkdownSections`).
+ *   2. Score each section's heading + body by query token overlap; weight
+ *      heading matches 3× so "## Foo" beats a body mention of "foo".
+ *   3. If no section contains any query term, return a whole-document
+ *      `extractBestExcerpt` instead of arbitrarily picking the first section.
+ *   4. If the top section's text fits, return it as-is (heading + body).
+ *   5. Otherwise, run `extractBestExcerpt` inside the top section's body and
+ *      prepend the heading.
+ *
+ * A document without headings (a single synthetic section) goes straight to
+ * `extractBestExcerpt`.
+ *
+ * Caps:
+ *   - Output is always ≤ `maxLength` (final excerpt is sliced as a safety net).
+ *   - Sections whose heading + body total fewer than 24 chars are skipped;
+ *     they're usually empty headings the author left as placeholders.
+ *
+ * @param content Raw markdown document.
+ * @param queryTokens Query tokens; they are expanded again with `expandQuery` for section scoring.
+ * @param maxLength Maximum length of the returned excerpt.
+ * @returns The chosen excerpt.
+ */
+export function extractBestSection(
+    content: string,
+    queryTokens: string[],
+    maxLength = 600
+): string {
+    const sections = splitMarkdownSections(content);
+    if (sections.length === 0) return content.slice(0, maxLength);
+    if (sections.length === 1 && !sections[0].heading) {
+        return extractBestExcerpt(sections[0].body || content, queryTokens, maxLength);
+    }
+    const expanded = expandQuery(queryTokens);
+    const expandedSet = new Set(expanded);
+    const scoreText = (text: string) => {
+        if (!text) return 0;
+        const toks = tokenize(text);
+        let hits = 0;
+        for (const t of toks) if (expandedSet.has(t)) hits++;
+        return hits;
+    };
+    // Start at score 0 with a strict ">" so only sections with at least one hit can win.
+    let best = { idx: -1, score: 0 };
+    for (let i = 0; i < sections.length; i++) {
+        const s = sections[i];
+        if ((s.heading.length + s.body.length) < 24) continue;
+        const score = scoreText(s.heading) * 3 + scoreText(s.body);
+        if (score > best.score) best = { idx: i, score };
+    }
+    if (best.idx < 0) {
+        // No section contained any query terms — fall back to a whole-doc excerpt.
+        return extractBestExcerpt(content, queryTokens, maxLength);
+    }
+    const picked = sections[best.idx];
+    const headingLine = picked.heading ? `${picked.heading}\n` : '';
+    // Keep at least 64 chars for the body even under a very long heading; the final slice enforces the cap.
+    const room = Math.max(64, maxLength - headingLine.length);
+    if (picked.body.length <= room) {
+        return (headingLine + picked.body).slice(0, maxLength).trim();
+    }
+    const inner = extractBestExcerpt(picked.body, queryTokens, room);
+    return (headingLine + inner).slice(0, maxLength).trim();
+}
+
 /**
 * 텍스트에서 가장 관련성 높은 구간(excerpt)을 추출합니다.
 * 단순 paragraph 단위가 아니라, 키워드 밀도가 높은 윈도우를 찾습니다.
@@ -7,15 +7,16 @@
 * ============================================================
 */

-export type RetrievalSource = 
-    | 'brain-trace'       // Second Brain Trace
-    | 'brain-memory'      // findRelevantBrainMemory (legacy)
-    | 'long-term-memory'  // Long-Term Memory
-    | 'project-memory'    // Project Memory
-    | 'procedural-memory' // Procedural Memory
-    | 'episodic-memory'   // Episodic Memory
-    | 'project-scan'      // Local Project Path scan
-    | 'recent-knowledge'; // Recent Project Knowledge record
+export type RetrievalSource =
+    | 'brain-trace'        // Second Brain Trace
+    | 'brain-memory'       // findRelevantBrainMemory (legacy)
+    | 'long-term-memory'   // Long-Term Memory
+    | 'medium-term-memory' // Recent session summaries (memoryMediumTermSessions)
+    | 'project-memory'     // Project Memory
+    | 'procedural-memory'  // Procedural Memory
+    | 'episodic-memory'    // Episodic Memory
+    | 'project-scan'       // Local Project Path scan
+    | 'recent-knowledge';  // Recent Project Knowledge record

 export type ConflictSeverity = 'NONE' | 'LOW' | 'MEDIUM' | 'HIGH';