// connectai/tests/retrievalEvalCompare.test.ts

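/**
 * Retrieval-mode A/B measurement — file-level (baseline) vs. section chunks (Phase 1-가).
 *
 * Skipped in regular test runs (it needs a real brain folder and indexing of
 * thousands of files, which makes it unsuitable for CI/packaging). To run it manually:
 *
 *   ASTRA_EVAL_BRAIN="E:/Wiki/2nd/10_Wiki/Topics" npx jest tests/retrievalEvalCompare.test.ts --verbose
 *
 * Measures recall@k / MRR against the golden set (<brain>/.astra/eval/golden.jsonl)
 * in both modes and prints a comparison table to the console. Only the TF-IDF path
 * is measured (embeddings are excluded because they depend on the LM server — the
 * effect of chunking shows up first in the sparse term and in excerpt quality).
 */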
import * as fs from 'fs';
import { RetrievalOrchestrator } from '../src/retrieval';
import { loadGoldenSet, runRetrievalEval, type EvalReport } from '../src/retrieval/evalHarness';
import { findBrainFiles } from '../src/utils';
import { getBrainTokenIndex } from '../src/retrieval/brainIndex';

/** Brain folder to evaluate, taken from ASTRA_EVAL_BRAIN; empty string when the variable is unset. */
const BRAIN = (process.env.ASTRA_EVAL_BRAIN ?? '').trim();
/** Cut-off values k used for the recall@k rows of the report. */
const KS = [1, 3, 5];

/** `describe` when BRAIN is set and exists on disk, otherwise `describe.skip`. */
const maybe = BRAIN !== '' && fs.existsSync(BRAIN) ? describe : describe.skip;

/**
 * Manual A/B suite: evaluates the golden set once with file-level retrieval and
 * once with section-chunk retrieval, then prints a recall@k / MRR comparison.
 * Registered through `maybe`, so it only runs when the brain folder is available.
 */
maybe('retrieval A/B — file vs chunk', () => {
    // Ten minutes, expressed in milliseconds.
    const suiteTimeoutMs = 10 * 60_000;
    jest.setTimeout(suiteTimeoutMs);

    test('golden set comparison', async () => {
        const { entries, parseErrors } = loadGoldenSet(BRAIN);
        expect(entries.length).toBeGreaterThan(0);

        // Warm up the index (a precondition shared by both modes).
        const brainFiles = findBrainFiles(BRAIN);
        getBrainTokenIndex(BRAIN, brainFiles);

        const orchestrator = new RetrievalOrchestrator();
        const brain = { id: 'eval', name: 'EvalBrain', localBrainPath: BRAIN } as any;

        // Ask for a few more results than the largest k.
        const rankLimit = Math.max(...KS) + 5;

        /** Runs the whole golden set with chunk-level retrieval switched on or off. */
        const evaluate = (chunkMode: boolean): Promise<EvalReport> => {
            return runRetrievalEval({
                entries,
                ks: KS,
                ranker: async (query: string) => {
                    const ranked = orchestrator.rankBrainForEval(query, brain, {
                        limit: rankLimit,
                        chunkLevelRetrieval: chunkMode,
                        chunkTargetChars: 1200,
                    });
                    return ranked.map(hit => hit.relativePath);
                },
            });
        };

        // Baseline (file-level) first, then chunk mode.
        const fileReport = await evaluate(false);
        const chunkReport = await evaluate(true);

        /** Formats a ratio as a percentage with one decimal place. */
        const asPercent = (ratio: number): string => `${(ratio * 100).toFixed(1)}%`;
        /** Explicit '+' prefix for non-negative deltas; negative values carry their own '-'. */
        const plusPrefix = (delta: number): string => (delta >= 0 ? '+' : '');

        // Header rows of the comparison table.
        const report: string[] = [
            '',
            `══ 검색 A/B (질의 ${entries.length}건, 파싱오류 ${parseErrors}) ══`,
            `지표        | 파일 단위 | 섹션 청크 | Δ`,
        ];

        // One recall@k row per configured k.
        for (const k of KS) {
            const fileRecall = fileReport.recallAtK[k];
            const chunkRecall = chunkReport.recallAtK[k];
            const delta = chunkRecall - fileRecall;
            report.push(`recall@${k}    | ${asPercent(fileRecall).padStart(7)} | ${asPercent(chunkRecall).padStart(7)} | ${plusPrefix(delta)}${asPercent(delta)}`);
        }

        const mrrDelta = chunkReport.mrr - fileReport.mrr;
        report.push(`MRR         | ${fileReport.mrr.toFixed(3).padStart(7)} | ${chunkReport.mrr.toFixed(3).padStart(7)} | ${plusPrefix(mrrDelta)}${mrrDelta.toFixed(3)}`);

        // Queries whose first-hit rank differs between the two modes (including hit <-> miss).
        const rankChanges: string[] = [];
        fileReport.perQuery.forEach((fileQuery, index) => {
            const fileRank = fileQuery.firstHitRank;
            const chunkRank = chunkReport.perQuery[index].firstHitRank;
            if (fileRank === chunkRank) {
                return;
            }
            rankChanges.push(`  · "${fileQuery.query.slice(0, 38)}" 파일=#${fileRank ?? 'miss'} → 청크=#${chunkRank ?? 'miss'}`);
        });
        if (rankChanges.length > 0) {
            report.push('순위 변동:', ...rankChanges);
        }

        // Miss diagnostics for chunk mode: show the top three returned paths per missed query.
        const chunkMisses = chunkReport.perQuery.filter(q => q.firstHitRank === null);
        if (chunkMisses.length > 0) {
            report.push(`청크 모드 miss ${chunkMisses.length}건:`);
            report.push(...chunkMisses.map(m => `  ✗ "${m.query.slice(0, 38)}" → 상위: ${m.topPaths.slice(0, 3).join(' · ')}`));
        }

        // eslint-disable-next-line no-console
        console.log(report.join('\n'));

        // Sanity check: both runs evaluated the same number of queries.
        expect(chunkReport.total).toBe(fileReport.total);
    });
});