// connectai/src/retrieval/index.ts

/**
 * ============================================================
 * RetrievalOrchestrator — Unified RAG Pipeline
 *
 * Astra의 모든 검색 소스를 통합 관리하는 오케스트레이터입니다.
 *
 * 검색 흐름:
 * ① Query Planning   — 의도 분류 + 검색 전략 결정
 * ② Parallel Search   — Brain + Memory + Project + Episode 동시 검색
 * ③ Result Fusion     — 통합 스코어링 + 중복 제거
 * ④ Context Budget    — 토큰 예산 내에서 최종 선택
 * ============================================================
 */

import * as fs from 'fs';
import * as path from 'path';
import { BrainProfile } from '../config';
import { findBrainFiles, summarizeText } from '../utils';
import { isInside } from '../lib/paths';
import { MemoryManager } from '../memory';
import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt, extractBestSection } from './scoring';
import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
import { getBrainTokenIndex, getBrainEmbeddings } from './brainIndex';
import { extractLessonEssence } from './lessonHelpers';
import { cosineSimilarity } from './embeddings';

export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
export * from './types';

/**
 * Compact summary of a past chat session for medium-term memory retrieval.
 *
 * Instances are scored against the current query by token overlap and the
 * best matches are turned into `medium-term-memory` chunks.
 */
export interface RecentSessionSummary {
    /** Session identifier; embedded in the generated chunk id. */
    id: string;
    /** Session title; matched against the query and shown as the chunk heading. */
    title: string;
    /** First user message of the session; used for matching/display only when `summary` is absent. */
    firstUserMsg: string;
    /** Tail of the last assistant reply; used for matching/display only when `summary` is absent. */
    lastAssistantExcerpt: string;
    /**
     * Optional LLM-compressed recap stored at session end (~200 chars).
     * When present, retrieval uses this instead of the firstUserMsg+tail
     * fragment because it actually captures the decision/outcome.
     */
    summary?: string;
    /**
     * Epoch milliseconds (it is compared against `Date.now()`). A falsy value
     * is treated as "age unknown": no recency boost and no date in the heading.
     */
    timestamp: number;
}

/** Per-call inputs for `RetrievalOrchestrator.retrieve`. */
interface RetrievalOptions {
    /** Brain profile to search; its `localBrainPath` is the root of the brain-file search. */
    brain: BrainProfile;
    /** Provides the long-term, project, procedural and episodic memory layers. */
    memoryManager: MemoryManager;
    /** Workspace root. Project memory is consulted only when this is set. */
    workspacePath?: string;
    /** Current conversation turns, forwarded to the memory-layer search. */
    chatHistory?: Array<{ role: string; content: string }>;
    /** Overrides for the context budget, forwarded to `selectWithinBudget`. */
    contextBudget?: Partial<ContextBudgetConfig>;
    /** Max brain files to return. Defaults to 8 when omitted; 0 skips brain search entirely. */
    brainFileLimit?: number;
    /** When true, files under raw-conversation/transcript folders are searched too. Off when omitted. */
    includeRawConversations?: boolean;
    /**
     * Optional absolute folder paths constraining brain-file search to those
     * subtrees. When provided and non-empty, only brain files inside one of
     * the folders are considered. Empty / undefined preserves whole-brain
     * search (legacy behavior). Folders that escape the brain root are
     * silently dropped by the caller (see `agentKnowledgeMap.resolveScopeForAgent`).
     */
    scopeFolders?: string[];
    /**
     * Compact summaries of recently-touched chat sessions (excluding the
     * active one). Scored against the query and the top `mediumTermLimit`
     * are injected as medium-term memory chunks. Caller pre-computes these
     * to avoid threading vscode/ExtensionContext through this module.
     */
    recentSessions?: RecentSessionSummary[];
    /**
     * Max number of medium-term session chunks to include after scoring.
     * When omitted it is treated as 0, i.e. no medium-term chunks are included.
     */
    mediumTermLimit?: number;
    /**
     * Optional query embedding for hybrid (sparse+dense) brain search. When
     * provided, each candidate file's cached embedding is cosine-matched and
     * blended with the TF-IDF score by `embeddingBlendAlpha`. Caller computes
     * this once per turn so we don't pay the embedding RTT inside scoring.
     */
    queryEmbedding?: number[];
    /** Embedding model name (used as a cache key on the brain index side). */
    embeddingModel?: string;
    /** Blend weight: 0 = TF-IDF only, 1 = cosine only. Default 0.5. */
    embeddingBlendAlpha?: number;
}

/**
 * Coordinates retrieval across brain files, the memory layers and recent
 * session summaries, then fuses the results and trims them to the context
 * budget.
 */
export class RetrievalOrchestrator {
    /**
     * Runs the full retrieval pipeline for one query.
     *
     * Steps: tokenize + expand the query, search brain files, collect the
     * memory-layer contexts, score recent sessions, normalise scores across
     * sources, and finally let `selectWithinBudget` pick what fits.
     *
     * @param query   Raw user query.
     * @param options Sources and tuning knobs for this call.
     * @returns The selected/dropped chunks, lesson chunks (split out of the
     *          selection), token usage and a human-readable fusion log.
     */
    public retrieve(query: string, options: RetrievalOptions): RetrievalResult {
        const fusionLog: string[] = [];
        const allChunks: RetrievalChunk[] = [];
        const queryTokens = tokenize(query);
        const expandedTokens = expandQuery(queryTokens);

        fusionLog.push(`Query tokens: [${queryTokens.slice(0, 10).join(', ')}]`);
        fusionLog.push(`Expanded tokens: [${expandedTokens.slice(0, 15).join(', ')}]`);

        // ── ① Brain File Search (TF-IDF enhanced, optionally hybrid with embeddings) ──
        // `brainFileLimit === 0` is meaningful (Knowledge Mix "model knowledge only"
        // mode), so use `??` rather than `||`. When the caller explicitly passes 0,
        // we skip retrieval entirely instead of falling back to the default of 8.
        const scopeFolders = options.scopeFolders ?? [];
        const brainFileLimit = options.brainFileLimit ?? 8;
        const brainChunks = brainFileLimit > 0
            ? this.searchBrainFiles(
                query,
                expandedTokens,
                options.brain,
                brainFileLimit,
                options.includeRawConversations || false,
                scopeFolders,
                options.queryEmbedding,
                options.embeddingModel,
                options.embeddingBlendAlpha
            )
            : [];
        allChunks.push(...brainChunks);
        fusionLog.push(
            brainFileLimit === 0
                ? 'Brain search: skipped (Knowledge Mix weight = 0)'
                : scopeFolders.length > 0
                    ? `Brain search (scoped to ${scopeFolders.length} folder(s)): ${brainChunks.length} chunks`
                    : `Brain search: ${brainChunks.length} chunks found`
        );

        // ── ② Memory Layers ──
        const memoryChunks = this.searchMemoryLayers(
            query,
            options.memoryManager,
            options.chatHistory || [],
            options.workspacePath
        );
        allChunks.push(...memoryChunks);
        fusionLog.push(`Memory search: ${memoryChunks.length} chunks found`);

        // ── ②-b Medium-Term Memory (recent sessions) ──
        const mediumChunks = this.scoreRecentSessions(
            expandedTokens,
            options.recentSessions || [],
            options.mediumTermLimit ?? 0
        );
        allChunks.push(...mediumChunks);
        fusionLog.push(`Medium-term sessions: ${mediumChunks.length} chunks selected`);

        // ── ③ Result Fusion — normalize scores across sources ──
        this.normalizeScores(allChunks);
        fusionLog.push(`Total chunks before budget: ${allChunks.length}`);

        // ── ④ Context Budget Selection ──
        const { selected, dropped, tokensUsed } = selectWithinBudget(
            allChunks,
            options.contextBudget
        );
        // Pull lesson/playbook/qa-finding chunks out so callers can inject them as a prominent
        // "verify before finalizing" block rather than burying them in the brain-knowledge section.
        const lessonChunks = selected.filter((c) => c.metadata.isLesson);
        const selectedChunks = selected.filter((c) => !c.metadata.isLesson);
        fusionLog.push(`Selected: ${selectedChunks.length} (+${lessonChunks.length} lesson), Dropped: ${dropped.length}, Tokens: ${tokensUsed}`);

        return {
            query,
            totalChunks: allChunks.length,
            selectedChunks,
            droppedChunks: dropped,
            lessonChunks,
            totalTokensUsed: tokensUsed,
            contextBudget: options.contextBudget?.totalBudget || 8000,
            fusionLog
        };
    }

    /**
     * Renders a retrieval result as the final context string.
     * Lesson chunks are not included — `retrieve` returns them separately so
     * the caller can inject them as their own block.
     */
    public buildContextString(result: RetrievalResult): string {
        return assembleContext(result.selectedChunks);
    }

    // ─── Brain File Search ───

    /**
     * Scores the brain files against the expanded query and returns the best
     * ones as chunks.
     *
     * @param query               Raw query (currently unused; scoring works on `expandedTokens`).
     * @param expandedTokens      Expanded query tokens used for scoring and excerpt extraction.
     * @param brain               Brain profile; `localBrainPath` is the search root.
     * @param limit               Number of top-ranked files to keep (lesson cards may add up to 3 more).
     * @param includeRaw          When false, files under raw-conversation folders are skipped.
     * @param scopeFolders        When non-empty, only files inside one of these folders are considered.
     * @param queryEmbedding      Optional query vector enabling the hybrid TF-IDF + cosine blend.
     * @param embeddingModel      Embedding model name passed to the brain embedding cache lookup.
     * @param embeddingBlendAlpha Blend weight in [0, 1]; 0 disables blending, omitted means 0.5.
     * @returns Chunks in descending score order; an empty array when nothing matches or on any error.
     */
    private searchBrainFiles(
        query: string,
        expandedTokens: string[],
        brain: BrainProfile,
        limit: number,
        includeRaw: boolean,
        scopeFolders: string[] = [],
        queryEmbedding?: number[],
        embeddingModel?: string,
        embeddingBlendAlpha?: number,
    ): RetrievalChunk[] {
        try {
            const scoped = (file: string) => scopeFolders.length === 0
                || scopeFolders.some((folder) => isInside(folder, file));
            const allFiles = findBrainFiles(brain.localBrainPath)
                .filter(scoped)
                .filter((file) => includeRaw || !this.isRawConversation(path.relative(brain.localBrainPath, file)));

            if (allFiles.length === 0) return [];

            // Tokenized docs from the persistent mtime-keyed index — unchanged files are not re-read
            // or re-tokenized, so per-query work over a large brain drops from O(total content) to O(files) stats.
            const indexed = getBrainTokenIndex(brain.localBrainPath, allFiles);
            if (indexed.length === 0) return [];

            const scored = scoreTfIdfPreTokenized(
                expandedTokens,
                indexed.map((d) => ({
                    tokens: d.tokens,
                    titleTokens: d.titleTokens,
                    lastModified: d.mtimeMs,
                    conflictCount: d.conflictCount,
                }))
            );

            // Hybrid blend: when the caller provided a query embedding and an embedding model,
            // fetch the cached file vectors and mix a cosine-similarity term into each score.
            // An omitted alpha falls back to the documented default of 0.5; NaN disables blending.
            const DEFAULT_BLEND_ALPHA = 0.5;
            const requestedAlpha = embeddingBlendAlpha ?? DEFAULT_BLEND_ALPHA;
            const alpha = Number.isNaN(requestedAlpha) ? 0 : Math.max(0, Math.min(1, requestedAlpha));
            if (queryEmbedding && embeddingModel && alpha > 0) {
                const filePaths = indexed.map((d) => d.filePath);
                const embeddings = getBrainEmbeddings(brain.localBrainPath, filePaths, embeddingModel);
                if (embeddings.size > 0) {
                    // TF-IDF is normalised by the top observed value so both terms live on the same
                    // scale. Once blending is active EVERY candidate is moved onto that scale:
                    // files without a usable embedding keep their pure (normalised) TF-IDF score,
                    // otherwise raw and blended scores would be sorted against each other.
                    const maxTfidf = scored.reduce((m, s) => s.score > m ? s.score : m, 0) || 1;
                    for (const s of scored) {
                        const tfidfNorm = s.score / maxTfidf;
                        const vec = embeddings.get(indexed[s.index].filePath);
                        // [-1, 1] in theory; positive for typical embedding spaces. A non-finite
                        // value (e.g. mismatched vectors) is treated like a missing embedding so
                        // the file is not silently dropped by the `score > 0` filter below.
                        const cos = vec ? cosineSimilarity(queryEmbedding, vec) : NaN;
                        s.score = Number.isFinite(cos)
                            ? (1 - alpha) * tfidfNorm + alpha * Math.max(0, cos)
                            : tfidfNorm;
                    }
                }
            }

            // Always consider lesson cards for the top slots even if they didn't crack the raw-score top-`limit`:
            // they're short, high-signal, and we want them surfaced when relevant. We keep the regular top-`limit`
            // and additively pull in up to a few lesson cards (deduped by index).
            const ranked = scored.filter((x) => x.score > 0).sort((a, b) => b.score - a.score);
            const pickedIdx = new Set<number>();
            for (const s of ranked.slice(0, limit)) pickedIdx.add(s.index);
            const LESSON_EXTRA = 3;
            let lessonExtra = 0;
            for (const s of ranked) {
                if (lessonExtra >= LESSON_EXTRA) break;
                if (pickedIdx.has(s.index)) continue;
                // Only documents with a non-empty `kind` count as lesson cards.
                if ((indexed[s.index].kind || '') === '') continue;
                pickedIdx.add(s.index);
                lessonExtra++;
            }
            // Preserve rank order for the chosen set.
            const chosen = ranked.filter((s) => pickedIdx.has(s.index));

            const topResults: RetrievalChunk[] = [];
            for (const s of chosen) {
                const doc = indexed[s.index];
                const isLesson = (doc.kind || '') !== '';
                // Only the chosen files are actually read off disk (for excerpt extraction).
                let content = '';
                try { content = fs.readFileSync(doc.filePath, 'utf8'); } catch { /* deleted just now — skip */ continue; }
                // Lesson cards: extract just the high-signal sections (Mistake / Root Cause / Fix /
                // Prevention Checklist) instead of dumping the whole 2500-char card. Old lessons
                // without those headings fall back to a query-targeted excerpt. Cuts retrieval tokens
                // by ~70% per lesson without losing the guardrail content.
                //
                // Regular notes: pick the best heading-bounded section for the query (markdown
                // section retrieval) so that long notes don't dump their intro/setup blocks just
                // because they happen to be in the top 400 chars. Falls back to keyword-window
                // extraction inside the section, or whole-doc extraction when there are no
                // headings at all.
                const cap = isLesson ? 1200 : 600;
                const excerpt = isLesson
                    ? extractLessonEssence(content, cap) || extractBestExcerpt(content, expandedTokens, cap)
                    : extractBestSection(content, expandedTokens, cap);
                // Estimate tokens from the text that is actually emitted, so the budget
                // selection sees the same string the model will.
                const chunkContent = summarizeText(excerpt, cap);
                topResults.push({
                    id: `brain-${s.index}`,
                    source: 'brain-memory' as const,
                    title: doc.relativePath,
                    content: chunkContent,
                    score: s.score,
                    tokenEstimate: estimateTokens(chunkContent),
                    metadata: {
                        filePath: doc.filePath,
                        category: this.inferCategory(doc.relativePath),
                        isProjectEvidence: this.isProjectEvidence(doc.relativePath, content),
                        lastUpdated: doc.mtimeMs,
                        // Phase 5: Scoring Intelligence Integration
                        conflictDetected: s.conflictDetected,
                        conflictSeverity: s.conflictSeverity,
                        queryCoverage: s.queryCoverage,
                        ...(isLesson ? { isLesson: true, lessonKind: doc.kind } : {}),
                    },
                });
            }
            return topResults;
        } catch {
            // Deliberately best-effort: a failing brain search yields no brain chunks
            // instead of aborting the whole retrieval.
            return [];
        }
    }

    // ─── Memory Layer Search ───

    /**
     * Asks each memory layer for a context block for the query and wraps every
     * non-empty answer in a chunk (at most one chunk per layer).
     *
     * @param query         Raw user query handed to each layer's `buildContext`.
     * @param memoryManager Source of the long-term, project, procedural and episodic layers.
     * @param chatHistory   Current conversation turns (not read by this implementation).
     * @param workspacePath Workspace root; project memory is skipped when absent.
     * @returns Chunks in the order long-term, project, procedural, episodic.
     */
    private searchMemoryLayers(
        query: string,
        memoryManager: MemoryManager,
        chatHistory: Array<{ role: string; content: string }>,
        workspacePath?: string
    ): RetrievalChunk[] {
        const chunks: RetrievalChunk[] = [];

        // Long-Term Memory
        const ltm = memoryManager.getLongTermMemory();
        const ltmContext = ltm.buildContext(query);
        if (ltmContext) {
            chunks.push({
                id: 'ltm-context',
                source: 'long-term-memory',
                title: ltmContext.label,
                content: ltmContext.content,
                score: ltmContext.relevance,
                tokenEstimate: estimateTokens(ltmContext.content),
                metadata: { category: 'long-term' }
            });
        }

        // Project Memory
        if (workspacePath) {
            const pm = memoryManager.getProjectMemory(workspacePath);
            const pmContext = pm.buildContext(query);
            if (pmContext) {
                chunks.push({
                    id: 'pm-context',
                    source: 'project-memory',
                    title: pmContext.label,
                    content: pmContext.content,
                    score: pmContext.relevance,
                    tokenEstimate: estimateTokens(pmContext.content),
                    metadata: { category: 'project', isProjectEvidence: true }
                });
            }
        }

        // Procedural Memory
        const proc = memoryManager.getProceduralMemory();
        const procContext = proc.buildContext(query);
        if (procContext) {
            chunks.push({
                id: 'proc-context',
                source: 'procedural-memory',
                title: procContext.label,
                content: procContext.content,
                score: procContext.relevance,
                tokenEstimate: estimateTokens(procContext.content),
                metadata: { category: 'procedural' }
            });
        }

        // Episodic Memory
        const ep = memoryManager.getEpisodicMemory();
        const epContext = ep.buildContext(query);
        if (epContext) {
            chunks.push({
                id: 'ep-context',
                source: 'episodic-memory',
                title: epContext.label,
                content: epContext.content,
                score: epContext.relevance,
                tokenEstimate: estimateTokens(epContext.content),
                metadata: { category: 'episodic' }
            });
        }

        return chunks;
    }

    // ─── Medium-Term: Recent Sessions ───

    /**
     * Score the user-provided session summaries against the current query
     * (lightweight token overlap — sessions are small so we skip the TF-IDF
     * machinery) and return up to `limit` as chunks. Each chunk packs the
     * title + first user message + last assistant excerpt — enough for the
     * model to recall the thread without re-injecting the whole transcript.
     *
     * Why include recent sessions at all: short-term covers "this conversation",
     * long-term covers "stable brain notes", but there's a gap for "what we
     * worked on yesterday/last week" that the user expects me to remember.
     *
     * @param expandedTokens Expanded query tokens; tokens shorter than 2 chars are ignored.
     * @param sessions       Candidate session summaries.
     * @param limit          Max chunks to return; a value <= 0 returns nothing.
     * @returns Up to `limit` chunks, highest score first.
     */
    private scoreRecentSessions(
        expandedTokens: string[],
        sessions: RecentSessionSummary[],
        limit: number,
    ): RetrievalChunk[] {
        if (!sessions || sessions.length === 0 || limit <= 0) return [];
        const qSet = new Set(expandedTokens.filter((t) => t.length >= 2));
        const scored = sessions.map((s) => {
            // Prefer the LLM-compressed summary when present — it's a real
            // 2-3 sentence recap of the session, so query matches against it
            // are far more meaningful than against an arbitrary head/tail.
            const text = s.summary
                ? `${s.title}\n${s.summary}`
                : `${s.title}\n${s.firstUserMsg}\n${s.lastAssistantExcerpt}`;
            const docTokens = tokenize(text);
            let overlap = 0;
            for (const t of docTokens) if (qSet.has(t)) overlap++;
            // Tiny recency boost so equal-overlap sessions prefer the more
            // recent one (most users mean "what we just discussed"). +0.1 max
            // for sessions <7 days old, decays to 0 beyond that.
            const ageDays = s.timestamp ? Math.max(0, (Date.now() - s.timestamp) / 86400000) : 999;
            const recency = ageDays < 7 ? (7 - ageDays) / 70 : 0;
            return { s, score: overlap + recency };
        }).filter((x) => x.score > 0);
        scored.sort((a, b) => b.score - a.score);
        const picked = scored.slice(0, limit);
        return picked.map(({ s, score }, idx) => {
            // Same fallback for heading and chunk title, so an untitled session
            // does not render as an empty `****` heading.
            const title = s.title || '(untitled session)';
            // `toISOString()` throws on an invalid Date; an out-of-range timestamp
            // must not abort the whole retrieval, so it simply yields no date.
            const stamp = s.timestamp ? new Date(s.timestamp) : undefined;
            const dateStr = stamp && !Number.isNaN(stamp.getTime()) ? stamp.toISOString().slice(0, 10) : '';
            const heading = `**${title}**${dateStr ? ` (${dateStr})` : ''}`;
            // Prefer the LLM-compressed summary; fall back to the raw fragments
            // when the session ended before the summarizer could run (or was
            // too short to summarize, < 3 visible messages).
            const body = s.summary
                ? [heading, s.summary].join('\n')
                : [
                    heading,
                    s.firstUserMsg ? `사용자 요청: ${s.firstUserMsg}` : '',
                    s.lastAssistantExcerpt ? `이전 답변 마지막 부분: …${s.lastAssistantExcerpt}` : '',
                ].filter(Boolean).join('\n');
            return {
                id: `mtm-${idx}-${s.id}`,
                source: 'medium-term-memory',
                title,
                content: body,
                score,
                tokenEstimate: estimateTokens(body),
                metadata: { category: 'medium-term', lastUpdated: s.timestamp },
            };
        });
    }

    // ─── Score Normalization ───

    /**
     * Puts scores from sources with different scales onto a comparable scale,
     * mutating the chunks in place.
     *
     * Each source group is divided by its own maximum (floored at 0.001), then
     * multiplied by a per-source priority weight (0.5 for unknown sources).
     * Lesson chunks receive an extra 1.4× on top, so their final score can
     * exceed 1.
     */
    private normalizeScores(chunks: RetrievalChunk[]): void {
        // Group by source
        const groups = new Map<string, RetrievalChunk[]>();
        for (const chunk of chunks) {
            if (!groups.has(chunk.source)) groups.set(chunk.source, []);
            groups.get(chunk.source)!.push(chunk);
        }

        // Normalize each group independently
        for (const [, group] of groups) {
            const maxScore = Math.max(...group.map((c) => c.score), 0.001);
            for (const chunk of group) {
                chunk.score = chunk.score / maxScore;
            }
        }

        // Source priority boost (some sources are inherently more valuable for RAG)
        const sourceBoost: Record<string, number> = {
            'brain-trace': 1.0,
            'brain-memory': 0.9,
            'project-memory': 0.85,
            'long-term-memory': 0.8,
            'procedural-memory': 0.95,  // Procedural is highly specific
            'medium-term-memory': 0.78, // recent sessions: useful when the user references "last time / yesterday"
            'episodic-memory': 0.7,
            'project-scan': 0.6,
            'recent-knowledge': 0.75
        };

        for (const chunk of chunks) {
            const boost = sourceBoost[chunk.source] || 0.5;
            chunk.score *= boost;
            // Lesson cards are short, high-signal guardrails — nudge relevant ones above ordinary brain notes
            // so they survive the budget. Modest (1.4×) so they don't crowd everything out when many match.
            if (chunk.metadata.isLesson) chunk.score *= 1.4;
        }
    }

    // ─── Helpers ───

    /**
     * True when any path segment is a raw-conversation folder
     * (`00_Raw`, `raw-data`, `conversation(s)`, `transcript(s)`; case-insensitive).
     */
    private isRawConversation(relativePath: string): boolean {
        return /(^|[\\/])(00_Raw|raw-data|conversations?|transcripts?)([\\/]|$)/i.test(relativePath);
    }

    /**
     * Maps a brain-relative path to a coarse category by keyword match.
     * The checks run in order and the first match wins; the match is a plain
     * substring test anywhere in the path.
     */
    private inferCategory(relativePath: string): string {
        const normalized = relativePath.toLowerCase();
        if (/(decisions?|adr|planning)/i.test(normalized)) return 'decision';
        if (/(records|development|bugs)/i.test(normalized)) return 'project-record';
        if (/(architecture|design|pattern)/i.test(normalized)) return 'architecture';
        if (/(knowledge|wiki|topics)/i.test(normalized)) return 'knowledge';
        return 'general';
    }

    /**
     * True when the path looks like project evidence: a records/planning/
     * development/bugs/retrospectives/projectchronicle path, an `adr-<n>` file,
     * or a `decision(s)` folder segment.
     *
     * @param relativePath Brain-relative file path.
     * @param content      File content (not inspected by this implementation).
     */
    private isProjectEvidence(relativePath: string, content: string): boolean {
        const normalized = relativePath.toLowerCase();
        if (/(records|planning|development|bugs|retrospectives|projectchronicle)/i.test(normalized)) return true;
        if (/adr-\d+|(^|[\\/])decisions?([\\/]|$)/i.test(normalized)) return true;
        return false;
    }
}