feat(retrieval): 청킹/평가 하니스 + 검색 인덱스 개선

- src/retrieval/chunker.ts: 문서 청킹 로직 추가
- src/retrieval/evalHarness.ts + src/extension/evalCommands.ts: 검색 품질 평가 하니스
- brainIndex.ts / retrieval/index.ts / memoryContext.ts: 인덱싱·컨텍스트 빌더 개선
- config.ts / extension.ts / sidebarProvider.ts / package.json 갱신
- ADR-0030~0032 및 개발 기록, .astra 런타임 상태 동기화

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-08 19:27:10 +09:00
parent b94e6ad1da
commit d39eb27c90
26 changed files with 1471 additions and 208 deletions
@@ -21,7 +21,7 @@ import { MemoryManager } from '../memory';
 import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
 import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt, extractBestSection } from './scoring';
 import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
-import { getBrainTokenIndex, getBrainEmbeddings } from './brainIndex';
+import { getBrainTokenIndex, getBrainEmbeddings, getBrainChunkIndex, getBrainChunkEmbeddings } from './brainIndex';
 import { extractLessonEssence } from './lessonHelpers';
 import { cosineSimilarity } from './embeddings';
 import { applyActionabilityBoost, WorkStateSignals, ActionabilityWeights } from './actionabilityScoring';
@@ -97,6 +97,14 @@ interface RetrievalOptions {
    hierarchicalReweightEnabled?: boolean;
    /** Hierarchical 가중치 override. undefined 면 default. */
    hierarchicalWeights?: HierarchicalWeights;
+    /**
+     * Section-level chunking (Phase 1-가). true 면 brain 검색이 파일이 아니라 섹션 청크
+     * 단위로 색인·스코어링하고, 매치된 *섹션* 을 그대로 주입한다. false/undefined 면 기존
+     * 파일 단위 동작.
+     */
+    chunkLevelRetrieval?: boolean;
+    /** 섹션 청크 목표 길이(문자). 기본 1200. chunkLevelRetrieval 일 때만 사용. */
+    chunkTargetChars?: number;
 }

 export class RetrievalOrchestrator {
@@ -129,7 +137,9 @@ export class RetrievalOrchestrator {
                scopeFolders,
                options.queryEmbedding,
                options.embeddingModel,
-                options.embeddingBlendAlpha
+                options.embeddingBlendAlpha,
+                options.chunkLevelRetrieval || false,
+                options.chunkTargetChars ?? 1200,
            )
            : [];
        allChunks.push(...brainChunks);
@@ -213,6 +223,58 @@ export class RetrievalOrchestrator {
        return assembleContext(result.selectedChunks);
    }

+    /**
+     * Evaluation-only helper: returns the brain *file* ranking for one query, taken
+     * before any context-budget selection. Meant for recall@k / MRR computation.
+     *
+     * The ranking is produced by delegating to `searchBrainFiles`, so the numbers
+     * measured here come from the same scoring code as regular brain search.
+     * Results keep the order in which `searchBrainFiles` returned them (expected to
+     * be score-descending) and are de-duplicated by file path, so the score reported
+     * for a file is the score of its first (best-ranked) hit.
+     *
+     * @param query Raw query text; it is tokenized and expanded here.
+     * @param brain Brain profile whose `localBrainPath` is searched and used as the
+     *   base for `relativePath`.
+     * @param opts Optional overrides. `limit` defaults to 20, `scopeFolders` to `[]`,
+     *   `includeRawConversations` and `chunkLevelRetrieval` to `false`, and
+     *   `chunkTargetChars` to 1200. Embedding options are forwarded unchanged.
+     * @returns At most `limit` distinct files. An empty array when `limit` is not positive.
+     */
+    public rankBrainForEval(
+        query: string,
+        brain: BrainProfile,
+        opts: {
+            limit?: number;
+            scopeFolders?: string[];
+            includeRawConversations?: boolean;
+            queryEmbedding?: number[];
+            embeddingModel?: string;
+            embeddingBlendAlpha?: number;
+            chunkLevelRetrieval?: boolean;
+            chunkTargetChars?: number;
+        } = {},
+    ): Array<{ relativePath: string; filePath: string; score: number }> {
+        const limit = opts.limit ?? 20;
+        // A non-positive limit asks for nothing. Without this guard the push-then-check
+        // loop below would still emit one entry.
+        if (limit <= 0) return [];
+
+        const chunkLevel = opts.chunkLevelRetrieval || false;
+        const expandedTokens = expandQuery(tokenize(query));
+        // Chunk mode can return several chunks per file. Recall is measured per file,
+        // so over-fetch and de-duplicate below to still reach `limit` distinct files.
+        const internalLimit = chunkLevel ? limit * 3 : limit;
+        const chunks = this.searchBrainFiles(
+            query,
+            expandedTokens,
+            brain,
+            internalLimit,
+            opts.includeRawConversations ?? false,
+            opts.scopeFolders ?? [],
+            opts.queryEmbedding,
+            opts.embeddingModel,
+            opts.embeddingBlendAlpha,
+            chunkLevel,
+            opts.chunkTargetChars ?? 1200,
+        );
+
+        // De-duplicate by file while preserving the incoming order → file-level ranking.
+        const out: Array<{ relativePath: string; filePath: string; score: number }> = [];
+        const seen = new Set<string>();
+        const brainRoot = brain.localBrainPath;
+        for (const c of chunks) {
+            const rawPath: unknown = c.metadata.filePath;
+            // Hits without a usable file path cannot be attributed to a file; skip them.
+            if (typeof rawPath !== 'string' || rawPath === '' || seen.has(rawPath)) continue;
+            seen.add(rawPath);
+            // `path.relative` yields '' when the file path equals the brain root; fall back to the title.
+            const relativePath = path.relative(brainRoot, rawPath) || c.title;
+            out.push({ relativePath, filePath: rawPath, score: c.score });
+            if (out.length >= limit) break;
+        }
+        return out;
+    }
+
    // ─── Brain File Search ───

    private searchBrainFiles(
@@ -225,16 +287,29 @@ export class RetrievalOrchestrator {
        queryEmbedding?: number[],
        embeddingModel?: string,
        embeddingBlendAlpha?: number,
+        chunkLevel: boolean = false,
+        chunkTargetChars: number = 1200,
    ): RetrievalChunk[] {
        try {
            const scoped = (file: string) => scopeFolders.length === 0
                || scopeFolders.some((folder) => isInside(folder, file));
            const allFiles = findBrainFiles(brain.localBrainPath)
                .filter(scoped)
-                .filter((file) => includeRaw || !this.isRawConversation(path.relative(brain.localBrainPath, file)));
+                .filter((file) => {
+                    const rel = path.relative(brain.localBrainPath, file);
+                    return (includeRaw || !this.isRawConversation(rel)) && !this.isOperationalPath(rel);
+                });

            if (allFiles.length === 0) return [];

+            // Phase 1-가: 섹션 청크 단위 검색 경로. 파일 단위와 분리해 회귀 위험 격리.
+            if (chunkLevel) {
+                return this.searchBrainChunks(
+                    expandedTokens, brain, allFiles, limit, chunkTargetChars,
+                    queryEmbedding, embeddingModel, embeddingBlendAlpha,
+                );
+            }
+
            // Tokenized docs from the persistent mtime-keyed index — unchanged files are not re-read
            // or re-tokenized, so per-query work over a large brain drops from O(total content) to O(files) stats.
            const indexed = getBrainTokenIndex(brain.localBrainPath, allFiles);
@@ -343,6 +418,118 @@ export class RetrievalOrchestrator {
        }
    }

+    // ─── Brain Chunk Search (Phase 1-가) ───
+
+    /**
+     * Section-chunk search. Applies the same pre-tokenized TF-IDF scorer to section
+     * *chunks* of `allFiles` and injects the matched section text itself, so no
+     * read-time best-section extraction is needed.
+     *
+     * Dense blending (only when a query embedding, a model name and a positive
+     * alpha are all given): each chunk uses its own vector, keyed
+     * `${filePath}#${chunkIndex}`, and falls back to the file-level vector. Once
+     * blending is active every score is put on the max-normalized TF-IDF scale, so
+     * chunks without any vector stay comparable to blended ones instead of keeping
+     * a raw, differently scaled TF-IDF score.
+     *
+     * At most `PER_FILE_CAP` chunks per file are kept so one document cannot take
+     * every top slot.
+     *
+     * @param expandedTokens Query tokens after expansion.
+     * @param brain Brain profile; `localBrainPath` locates the chunk and embedding indexes.
+     * @param allFiles Candidate files, already filtered by the caller.
+     * @param limit Maximum number of chunks to return; non-positive yields `[]`.
+     * @param chunkTargetChars Target chunk length passed to the chunk index.
+     * @param queryEmbedding Optional query vector for dense blending.
+     * @param embeddingModel Embedding model name used to look up stored vectors.
+     * @param embeddingBlendAlpha Dense weight; values above 1 are clamped to 1.
+     * @returns Chunks in descending score order; unreadable or empty files are skipped.
+     */
+    private searchBrainChunks(
+        expandedTokens: string[],
+        brain: BrainProfile,
+        allFiles: string[],
+        limit: number,
+        chunkTargetChars: number,
+        queryEmbedding?: number[],
+        embeddingModel?: string,
+        embeddingBlendAlpha?: number,
+    ): RetrievalChunk[] {
+        // The selection loop pushes before checking the limit, so guard explicitly.
+        if (limit <= 0) return [];
+
+        const chunks = getBrainChunkIndex(brain.localBrainPath, allFiles, chunkTargetChars);
+        if (chunks.length === 0) return [];
+
+        const scored = scoreTfIdfPreTokenized(
+            expandedTokens,
+            chunks.map((c) => ({
+                tokens: c.tokens,
+                titleTokens: c.headingTokens,
+                lastModified: c.mtimeMs,
+                conflictCount: 0,
+            })),
+        );
+
+        // Hybrid scoring: chunk vector first, file vector as fallback. When neither
+        // index has any vector the scores are left as plain TF-IDF.
+        const blendAlpha = embeddingBlendAlpha ?? 0;
+        if (queryEmbedding && embeddingModel && blendAlpha > 0) {
+            const alpha = Math.min(1, blendAlpha);
+            const chunkEmb = getBrainChunkEmbeddings(brain.localBrainPath, embeddingModel);
+            const filePaths = Array.from(new Set(chunks.map((c) => c.filePath)));
+            const fileEmb = getBrainEmbeddings(brain.localBrainPath, filePaths, embeddingModel);
+            if (chunkEmb.size > 0 || fileEmb.size > 0) {
+                const maxTfidf = scored.reduce((m, s) => (s.score > m ? s.score : m), 0) || 1;
+                for (const s of scored) {
+                    const c = chunks[s.index];
+                    const lexical = s.score / maxTfidf;
+                    const vec = chunkEmb.get(`${c.filePath}#${c.chunkIndex}`) || fileEmb.get(c.filePath);
+                    // No vector for this chunk: keep it lexical-only, but on the same
+                    // normalized scale as the blended scores.
+                    s.score = vec
+                        ? (1 - alpha) * lexical + alpha * Math.max(0, cosineSimilarity(queryEmbedding, vec))
+                        : lexical;
+                }
+            }
+        }
+
+        const ranked = scored.filter((x) => x.score > 0).sort((a, b) => b.score - a.score);
+
+        // Per-file chunk cap, so a single document cannot monopolize the top slots.
+        const PER_FILE_CAP = 3;
+        const perFile = new Map<string, number>();
+        const chosen: typeof ranked = [];
+        for (const s of ranked) {
+            const fp = chunks[s.index].filePath;
+            const n = perFile.get(fp) ?? 0;
+            if (n >= PER_FILE_CAP) continue;
+            perFile.set(fp, n + 1);
+            chosen.push(s);
+            if (chosen.length >= limit) break;
+        }
+
+        // Each file is read at most once per call; a failed read is cached as ''.
+        const fileContentCache = new Map<string, string>();
+        const readFile = (fp: string): string => {
+            let text = fileContentCache.get(fp);
+            if (text === undefined) {
+                try { text = fs.readFileSync(fp, 'utf8'); } catch { text = ''; }
+                fileContentCache.set(fp, text);
+            }
+            return text;
+        };
+
+        const topResults: RetrievalChunk[] = [];
+        for (const s of chosen) {
+            const c = chunks[s.index];
+            const content = readFile(c.filePath);
+            if (!content) continue;
+            // Chunks that carry a `kind` are treated as lesson cards.
+            const isLesson = Boolean(c.kind);
+            const section = content.slice(c.charStart, c.charEnd);
+            // Regular notes: the matched section verbatim. Lesson cards: keep essence
+            // extraction over the file, falling back to the chunk span.
+            const rawBody = isLesson ? (extractLessonEssence(content, 1200) || section) : section;
+            const cap = isLesson ? 1200 : 700;
+            // Heading breadcrumb goes first so the model can tell which context the section is from.
+            const crumb = !isLesson && c.headingPath.length ? `〔${c.headingPath.join(' › ')}〕\n` : '';
+            const body = crumb + rawBody.trim();
+            // Estimate tokens on the text that is actually emitted, not on the uncapped body.
+            const injected = summarizeText(body, cap + crumb.length);
+            topResults.push({
+                id: `brain-chunk-${s.index}`,
+                source: 'brain-memory' as const,
+                title: c.relativePath,
+                content: injected,
+                score: s.score,
+                tokenEstimate: estimateTokens(injected),
+                metadata: {
+                    filePath: c.filePath,
+                    category: this.inferCategory(c.relativePath),
+                    isProjectEvidence: this.isProjectEvidence(c.relativePath, content),
+                    lastUpdated: c.mtimeMs,
+                    conflictDetected: s.conflictDetected,
+                    conflictSeverity: s.conflictSeverity,
+                    queryCoverage: s.queryCoverage,
+                    ...(isLesson ? { isLesson: true, lessonKind: c.kind } : {}),
+                },
+            });
+        }
+        return topResults;
+    }
+
    // ─── Memory Layer Search ───

    private searchMemoryLayers(
@@ -531,6 +718,17 @@ export class RetrievalOrchestrator {
        return /(^|[\\/])(00_Raw|raw-data|conversations?|transcripts?)([\\/]|$)/i.test(relativePath);
    }

+    /**
+     * Reports whether a brain-relative path points at operational material
+     * (session / memory / project logs) rather than knowledge. Such paths are
+     * excluded from knowledge search, the same way raw conversations are. This is
+     * not expected to raise recall metrics; it keeps logs from being surfaced as
+     * "knowledge" and avoids spending index space and tokens on them.
+     *
+     * A path matches when any of these holds (all case-insensitive):
+     *  - it has a directory segment named one of the listed folder fragments
+     *    (the segment must be followed by a path separator);
+     *  - it contains `docs/records` followed by a separator or the end of the path
+     *    (the `docs` part is not anchored to the start of a segment);
+     *  - it contains `Harness_Research_` anywhere.
+     *
+     * @param relativePath Path relative to the brain root; `/` and `\` both count as separators.
+     * @returns `true` when the path should be left out of knowledge search.
+     */
+    private isOperationalPath(relativePath: string): boolean {
+        const operationalPatterns: RegExp[] = [
+            /(^|[\\/])(sessions|_agents|_company|memory|Project_Logs|_Archive_Orphans|Post_Drafts|UX_Scenarios)([\\/])/i,
+            /docs[\\/]records([\\/]|$)/i,
+            /Harness_Research_/i,
+        ];
+        return operationalPatterns.some((pattern) => pattern.test(relativePath));
+    }
+
    private inferCategory(relativePath: string): string {
        const normalized = relativePath.toLowerCase();
        if (/(decisions?|adr|planning)/i.test(normalized)) return 'decision';