connectai/src/retrieval/index.ts

/**
 * ============================================================
 * RetrievalOrchestrator — Unified RAG Pipeline
 *
 * Orchestrator that manages all of Astra's retrieval sources in one place.
 *
 * Retrieval flow:
 * ① Query Planning   — intent classification + search strategy selection
 * ② Parallel Search   — search Brain + Memory + Project + Episode concurrently
 * ③ Result Fusion     — unified scoring + de-duplication
 * ④ Context Budget    — final selection within the token budget
 * ============================================================
 */

import * as fs from 'fs';
import * as path from 'path';
import { BrainProfile } from '../config';
import { findBrainFiles, summarizeText } from '../utils';
import { isInside } from '../lib/paths';
import { MemoryManager } from '../memory';
import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
import { getBrainTokenIndex } from './brainIndex';

export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
export * from './types';

/**
 * Per-call options accepted by `RetrievalOrchestrator.retrieve`.
 */
interface RetrievalOptions {
    /** Brain profile to search; its `localBrainPath` is the root directory scanned for brain files. */
    brain: BrainProfile;
    /** Provider of the long-term, project, procedural and episodic memory layers queried for context. */
    memoryManager: MemoryManager;
    /** Workspace path; project memory is only queried when this is set. */
    workspacePath?: string;
    /**
     * Recent chat turns. Forwarded to the memory-layer search, which does not
     * currently read it.
     */
    chatHistory?: Array<{ role: string; content: string }>;
    /**
     * Budget overrides forwarded to `selectWithinBudget`. The result's
     * `contextBudget` field reports `totalBudget`, or 8000 when it is unset or 0.
     */
    contextBudget?: Partial<ContextBudgetConfig>;
    /**
     * Maximum number of top-scoring brain files to return (8 when omitted or 0).
     * Up to three additional lesson cards may be returned on top of this limit.
     */
    brainFileLimit?: number;
    /**
     * When true, files under raw-conversation folders (`00_Raw`, `raw-data`,
     * `conversation(s)`, `transcript(s)`) are searched too; they are excluded
     * by default.
     */
    includeRawConversations?: boolean;
    /**
     * Optional absolute folder paths constraining brain-file search to those
     * subtrees. When provided and non-empty, only brain files inside one of
     * the folders are considered. Empty / undefined preserves whole-brain
     * search (legacy behavior). Folders that escape the brain root are
     * silently dropped by the caller (see `agentKnowledgeMap.resolveScopeForAgent`).
     */
    scopeFolders?: string[];
}

/**
 * Runs one retrieval pass over every configured source (brain files plus the
 * memory layers), fuses the scores onto a common scale and selects the chunks
 * that fit the context budget.
 */
export class RetrievalOrchestrator {
    /** Maximum number of lesson cards pulled in on top of the regular top-`limit` brain results. */
    private static readonly LESSON_EXTRA = 3;

    /** Multiplier applied to lesson chunks after source boosting so relevant ones survive the budget. */
    private static readonly LESSON_BOOST = 1.4;

    /** Boost used for sources that have no entry in `SOURCE_BOOST`. */
    private static readonly DEFAULT_SOURCE_BOOST = 0.5;

    /** Source priority boost (some sources are inherently more valuable for RAG). */
    private static readonly SOURCE_BOOST: Record<string, number> = {
        'brain-trace': 1.0,
        'brain-memory': 0.9,
        'project-memory': 0.85,
        'long-term-memory': 0.8,
        'procedural-memory': 0.95,  // Procedural is highly specific
        'episodic-memory': 0.7,
        'project-scan': 0.6,
        'recent-knowledge': 0.75
    };

    /**
     * Performs the unified retrieval: search all sources → normalize/boost
     * scores → select within the context budget.
     *
     * @param query   Raw user query; tokenized and expanded before brain search.
     * @param options Sources and limits for this call (see `RetrievalOptions`).
     * @returns The selected and dropped chunks, lesson chunks split out into
     *          their own list, token usage, and a human-readable fusion log.
     */
    public retrieve(query: string, options: RetrievalOptions): RetrievalResult {
        const fusionLog: string[] = [];
        const allChunks: RetrievalChunk[] = [];
        const queryTokens = tokenize(query);
        const expandedTokens = expandQuery(queryTokens);

        fusionLog.push(`Query tokens: [${queryTokens.slice(0, 10).join(', ')}]`);
        fusionLog.push(`Expanded tokens: [${expandedTokens.slice(0, 15).join(', ')}]`);

        // ── ① Brain File Search (TF-IDF enhanced) ──
        const scopeFolders = options.scopeFolders ?? [];
        const brainChunks = this.searchBrainFiles(
            query,
            expandedTokens,
            options.brain,
            // `||` is deliberate: an omitted or zero limit falls back to 8.
            options.brainFileLimit || 8,
            options.includeRawConversations || false,
            scopeFolders
        );
        allChunks.push(...brainChunks);
        fusionLog.push(
            scopeFolders.length > 0
                ? `Brain search (scoped to ${scopeFolders.length} folder(s)): ${brainChunks.length} chunks`
                : `Brain search: ${brainChunks.length} chunks found`
        );

        // ── ② Memory Layers ──
        const memoryChunks = this.searchMemoryLayers(
            query,
            options.memoryManager,
            options.chatHistory || [],
            options.workspacePath
        );
        allChunks.push(...memoryChunks);
        fusionLog.push(`Memory search: ${memoryChunks.length} chunks found`);

        // ── ③ Result Fusion — normalize scores across sources (mutates the chunks in place) ──
        this.normalizeScores(allChunks);
        fusionLog.push(`Total chunks before budget: ${allChunks.length}`);

        // ── ④ Context Budget Selection ──
        const { selected, dropped, tokensUsed } = selectWithinBudget(
            allChunks,
            options.contextBudget
        );
        // Pull lesson/playbook/qa-finding chunks out so callers can inject them as a prominent
        // "verify before finalizing" block rather than burying them in the brain-knowledge section.
        const lessonChunks = selected.filter((c) => c.metadata.isLesson);
        const selectedChunks = selected.filter((c) => !c.metadata.isLesson);
        fusionLog.push(`Selected: ${selectedChunks.length} (+${lessonChunks.length} lesson), Dropped: ${dropped.length}, Tokens: ${tokensUsed}`);

        return {
            query,
            totalChunks: allChunks.length,
            selectedChunks,
            droppedChunks: dropped,
            lessonChunks,
            // Token usage as reported by selectWithinBudget for everything it selected.
            totalTokensUsed: tokensUsed,
            contextBudget: options.contextBudget?.totalBudget || 8000,
            fusionLog
        };
    }

    /**
     * Converts a retrieval result into the final context string. Lesson chunks
     * are excluded — they are meant to be injected as a separate block.
     *
     * @param result A result previously returned by `retrieve`.
     * @returns The assembled context built from `result.selectedChunks`.
     */
    public buildContextString(result: RetrievalResult): string {
        return assembleContext(result.selectedChunks);
    }

    // ─── Brain File Search ───

    /**
     * Scores the brain files against the expanded query and returns the best
     * matches as retrieval chunks.
     *
     * @param query          Raw query (currently unused; scoring uses `expandedTokens`).
     * @param expandedTokens Expanded query tokens used for scoring and excerpt extraction.
     * @param brain          Brain profile; `localBrainPath` is the search root.
     * @param limit          Maximum number of regular top-ranked files; negative values are treated as 0.
     * @param includeRaw     When false, raw-conversation folders are skipped.
     * @param scopeFolders   When non-empty, only files inside one of these folders are considered.
     * @returns Chunks in descending score order; an empty array when nothing
     *          matches or when any step throws (best-effort search).
     */
    private searchBrainFiles(
        query: string,
        expandedTokens: string[],
        brain: BrainProfile,
        limit: number,
        includeRaw: boolean,
        scopeFolders: string[] = []
    ): RetrievalChunk[] {
        try {
            const scoped = (file: string) => scopeFolders.length === 0
                || scopeFolders.some((folder) => isInside(folder, file));
            const allFiles = findBrainFiles(brain.localBrainPath)
                .filter(scoped)
                .filter((file) => includeRaw || !this.isRawConversation(path.relative(brain.localBrainPath, file)));

            if (allFiles.length === 0) return [];

            // Tokenized docs from the persistent mtime-keyed index — unchanged files are not re-read
            // or re-tokenized, so per-query work over a large brain drops from O(total content) to O(files) stats.
            const indexed = getBrainTokenIndex(brain.localBrainPath, allFiles);
            if (indexed.length === 0) return [];

            const scored = scoreTfIdfPreTokenized(
                expandedTokens,
                indexed.map((d) => ({
                    tokens: d.tokens,
                    titleTokens: d.titleTokens,
                    lastModified: d.mtimeMs,
                    conflictCount: d.conflictCount,
                }))
            );

            // Always consider lesson cards for the top slots even if they didn't crack the raw-score top-`limit`:
            // they're short, high-signal, and we want them surfaced when relevant. We keep the regular top-`limit`
            // and additively pull in up to a few lesson cards (deduped by index).
            const ranked = scored.filter((x) => x.score > 0).sort((a, b) => b.score - a.score);
            // A negative end index would make slice() return "all but the last N"; clamp so it means "none".
            const topLimit = Math.max(0, limit);
            const pickedIdx = new Set<number>();
            for (const s of ranked.slice(0, topLimit)) pickedIdx.add(s.index);
            let lessonExtra = 0;
            for (const s of ranked) {
                if (lessonExtra >= RetrievalOrchestrator.LESSON_EXTRA) break;
                if (pickedIdx.has(s.index)) continue;
                // A document counts as a lesson card when its index entry carries a non-empty `kind`.
                if (!indexed[s.index].kind) continue;
                pickedIdx.add(s.index);
                lessonExtra++;
            }
            // Preserve rank order for the chosen set.
            const chosen = ranked.filter((s) => pickedIdx.has(s.index));

            const topResults: RetrievalChunk[] = [];
            for (const s of chosen) {
                const doc = indexed[s.index];
                const isLesson = Boolean(doc.kind);
                // Only the chosen files are actually read off disk (for excerpt extraction).
                let content = '';
                try { content = fs.readFileSync(doc.filePath, 'utf8'); } catch { /* deleted just now — skip */ continue; }
                // Lesson cards: hand back the whole card (they're meant to be short) so the Prevention Checklist
                // survives; fall back to a generous excerpt for long ones. Regular notes: the usual 400-char excerpt.
                const excerpt = isLesson
                    ? (content.length <= 2500 ? content.trim() : extractBestExcerpt(content, expandedTokens, 1500))
                    : extractBestExcerpt(content, expandedTokens, 400);
                const cap = isLesson ? 2500 : 400;
                // Estimate tokens from the text that is actually emitted, so the budget
                // accounting matches what ends up in the assembled context.
                const emitted = summarizeText(excerpt, cap);
                topResults.push({
                    id: `brain-${s.index}`,
                    source: 'brain-memory' as const,
                    title: doc.relativePath,
                    content: emitted,
                    score: s.score,
                    tokenEstimate: estimateTokens(emitted),
                    metadata: {
                        filePath: doc.filePath,
                        category: this.inferCategory(doc.relativePath),
                        isProjectEvidence: this.isProjectEvidence(doc.relativePath, content),
                        lastUpdated: doc.mtimeMs,
                        // Phase 5: Scoring Intelligence Integration
                        conflictDetected: s.conflictDetected,
                        conflictSeverity: s.conflictSeverity,
                        informationDensity: s.informationDensity,
                        ...(isLesson ? { isLesson: true, lessonKind: doc.kind } : {}),
                    },
                });
            }
            return topResults;
        } catch {
            // Deliberate best-effort: a failing brain search must not break retrieval
            // from the memory layers, so any error yields "no brain chunks".
            return [];
        }
    }

    // ─── Memory Layer Search ───

    /**
     * Asks each memory layer to build a context for the query and wraps every
     * non-empty answer in a retrieval chunk.
     *
     * @param query         Raw query passed to each layer's `buildContext`.
     * @param memoryManager Provider of the memory layers.
     * @param chatHistory   Recent chat turns (currently unused).
     * @param workspacePath When set, project memory for this workspace is queried too.
     * @returns Zero or one chunk per layer, in the order long-term, project,
     *          procedural, episodic.
     */
    private searchMemoryLayers(
        query: string,
        memoryManager: MemoryManager,
        chatHistory: Array<{ role: string; content: string }>,
        workspacePath?: string
    ): RetrievalChunk[] {
        const chunks: RetrievalChunk[] = [];

        // Long-Term Memory
        const ltm = memoryManager.getLongTermMemory();
        const ltmContext = ltm.buildContext(query);
        if (ltmContext) {
            chunks.push({
                id: 'ltm-context',
                source: 'long-term-memory',
                title: ltmContext.label,
                content: ltmContext.content,
                score: ltmContext.relevance,
                tokenEstimate: estimateTokens(ltmContext.content),
                metadata: { category: 'long-term' }
            });
        }

        // Project Memory
        if (workspacePath) {
            const pm = memoryManager.getProjectMemory(workspacePath);
            const pmContext = pm.buildContext(query);
            if (pmContext) {
                chunks.push({
                    id: 'pm-context',
                    source: 'project-memory',
                    title: pmContext.label,
                    content: pmContext.content,
                    score: pmContext.relevance,
                    tokenEstimate: estimateTokens(pmContext.content),
                    metadata: { category: 'project', isProjectEvidence: true }
                });
            }
        }

        // Procedural Memory
        const proc = memoryManager.getProceduralMemory();
        const procContext = proc.buildContext(query);
        if (procContext) {
            chunks.push({
                id: 'proc-context',
                source: 'procedural-memory',
                title: procContext.label,
                content: procContext.content,
                score: procContext.relevance,
                tokenEstimate: estimateTokens(procContext.content),
                metadata: { category: 'procedural' }
            });
        }

        // Episodic Memory
        const ep = memoryManager.getEpisodicMemory();
        const epContext = ep.buildContext(query);
        if (epContext) {
            chunks.push({
                id: 'ep-context',
                source: 'episodic-memory',
                title: epContext.label,
                content: epContext.content,
                score: epContext.relevance,
                tokenEstimate: estimateTokens(epContext.content),
                metadata: { category: 'episodic' }
            });
        }

        return chunks;
    }

    // ─── Score Normalization ───

    /**
     * Brings scores from sources with different scales onto a common scale,
     * then applies the per-source priority boost and the lesson boost.
     * Mutates `chunk.score` in place.
     *
     * Each source group is divided by its own maximum score (floored at 0.001
     * to avoid dividing by zero), so the best chunk of every source lands at
     * 1.0 before boosting.
     *
     * @param chunks All chunks gathered for the current query.
     */
    private normalizeScores(chunks: RetrievalChunk[]): void {
        // Group by source
        const groups = new Map<string, RetrievalChunk[]>();
        for (const chunk of chunks) {
            // One NaN/Infinity score would make Math.max return NaN/Infinity and corrupt
            // every chunk of that source; treat such scores as "no relevance" instead.
            if (!Number.isFinite(chunk.score)) chunk.score = 0;
            if (!groups.has(chunk.source)) groups.set(chunk.source, []);
            groups.get(chunk.source)!.push(chunk);
        }

        // Normalize each group independently
        for (const [, group] of groups) {
            const maxScore = Math.max(...group.map((c) => c.score), 0.001);
            for (const chunk of group) {
                chunk.score = chunk.score / maxScore;
            }
        }

        for (const chunk of chunks) {
            const boost = RetrievalOrchestrator.SOURCE_BOOST[chunk.source]
                || RetrievalOrchestrator.DEFAULT_SOURCE_BOOST;
            chunk.score *= boost;
            // Lesson cards are short, high-signal guardrails — nudge relevant ones above ordinary brain notes
            // so they survive the budget. Modest (1.4×) so they don't crowd everything out when many match.
            if (chunk.metadata.isLesson) chunk.score *= RetrievalOrchestrator.LESSON_BOOST;
        }
    }

    // ─── Helpers ───

    /**
     * Reports whether a brain-relative path has a path segment named like a
     * raw-conversation folder (`00_Raw`, `raw-data`, `conversation(s)`,
     * `transcript(s)`; case-insensitive, either path separator).
     */
    private isRawConversation(relativePath: string): boolean {
        return /(^|[\\/])(00_Raw|raw-data|conversations?|transcripts?)([\\/]|$)/i.test(relativePath);
    }

    /**
     * Derives a coarse category from keywords anywhere in the path. The first
     * matching rule wins; paths matching none are `'general'`.
     * Note: these are substring matches, so e.g. `adr` also matches inside longer words.
     */
    private inferCategory(relativePath: string): string {
        const normalized = relativePath.toLowerCase();
        if (/(decisions?|adr|planning)/i.test(normalized)) return 'decision';
        if (/(records|development|bugs)/i.test(normalized)) return 'project-record';
        if (/(architecture|design|pattern)/i.test(normalized)) return 'architecture';
        if (/(knowledge|wiki|topics)/i.test(normalized)) return 'knowledge';
        return 'general';
    }

    /**
     * Reports whether a brain file counts as project evidence, judged by its
     * path only: project-record style keywords, an `adr-<number>` name, or a
     * `decision(s)` path segment.
     *
     * @param relativePath Brain-relative path of the file.
     * @param content      File content (currently unused).
     */
    private isProjectEvidence(relativePath: string, content: string): boolean {
        const normalized = relativePath.toLowerCase();
        if (/(records|planning|development|bugs|retrospectives|projectchronicle)/i.test(normalized)) return true;
        if (/adr-\d+|(^|[\\/])decisions?([\\/]|$)/i.test(normalized)) return true;
        return false;
    }
}