// connectai/tests/retrievalEvalEmbedding.test.ts

/**
 * 하이브리드(sparse+dense) 검색 측정 — 청크 TF-IDF vs 청크+임베딩 (alpha sweep).
 *
 * 평소 테스트 런에서는 skip (실제 두뇌 + 로컬 임베딩 서버 필요). 수동 실행:
 *
 *   ASTRA_EVAL_BRAIN="E:/Wiki/2nd/10_Wiki/Topics" \
 *   ASTRA_EVAL_EMBED_MODEL="text-embedding-nomic-embed-text-v1.5" \
 *   npx jest tests/retrievalEvalEmbedding.test.ts --verbose
 *
 * (서버 URL 은 ASTRA_EVAL_EMBED_URL, 기본 http://127.0.0.1:1234 — LM Studio)
 *
 * 측정 전에 두뇌 전체 청크 임베딩을 백필한다 — 결과 벡터는 brain-index 캐시에
 * 영속되므로 이 테스트 1회 실행이 곧 런타임 초기 색인을 겸한다.
 */
import * as fs from 'fs';
import { RetrievalOrchestrator } from '../src/retrieval';
import { loadGoldenSet, runRetrievalEval, type EvalReport } from '../src/retrieval/evalHarness';
import { findBrainFiles } from '../src/utils';
import { getBrainTokenIndex, backfillBrainChunkEmbeddings } from '../src/retrieval/brainIndex';
import { embedTexts, embedQuery } from '../src/retrieval/embeddings';

const BRAIN = (process.env.ASTRA_EVAL_BRAIN || '').trim();
const EMBED_MODEL = (process.env.ASTRA_EVAL_EMBED_MODEL || '').trim();
const EMBED_URL = (process.env.ASTRA_EVAL_EMBED_URL || 'http://127.0.0.1:1234').trim();
const KS = [1, 3, 5];
const ALPHAS = [0.3, 0.5, 0.7];
const CHUNK_TARGET = 1200;

const maybe = BRAIN && EMBED_MODEL && fs.existsSync(BRAIN) ? describe : describe.skip;

maybe('retrieval A/B — chunk TF-IDF vs chunk+embedding', () => {
    jest.setTimeout(40 * 60_000);

    test('golden set hybrid comparison', async () => {
        const { entries, parseErrors } = loadGoldenSet(BRAIN);
        expect(entries.length).toBeGreaterThan(0);

        const allFiles = findBrainFiles(BRAIN);
        getBrainTokenIndex(BRAIN, allFiles);

        // ── 전체 청크 임베딩 백필 (이미 벡터 있는 청크는 건너뜀 → 재실행 저렴) ──
        const embed = (texts: string[]) => embedTexts(texts, { baseUrl: EMBED_URL, model: EMBED_MODEL });
        const SLICE = 300;
        let embedded = 0;
        for (let i = 0; i < allFiles.length; i += SLICE) {
            embedded += await backfillBrainChunkEmbeddings(BRAIN, allFiles.slice(i, i + SLICE), EMBED_MODEL, embed, CHUNK_TARGET);
            // eslint-disable-next-line no-console
            console.log(`백필 진행 ${Math.min(i + SLICE, allFiles.length)}/${allFiles.length} 파일 · 신규 벡터 ${embedded}`);
        }

        const brain = { id: 'eval', name: 'EvalBrain', localBrainPath: BRAIN } as any;
        const orchestrator = new RetrievalOrchestrator();

        // 질의 임베딩은 alpha 무관하게 동일 — 1회만 계산해 재사용.
        const queryVecs = new Map<string, number[] | undefined>();
        for (const e of entries) {
            queryVecs.set(e.query, await embedQuery(e.query, { baseUrl: EMBED_URL, model: EMBED_MODEL }));
        }

        const run = (alpha: number): Promise<EvalReport> =>
            runRetrievalEval({
                entries,
                ks: KS,
                ranker: async (query: string) =>
                    orchestrator
                        .rankBrainForEval(query, brain, {
                            limit: Math.max(...KS) + 5,
                            chunkLevelRetrieval: true,
                            chunkTargetChars: CHUNK_TARGET,
                            queryEmbedding: alpha > 0 ? queryVecs.get(query) : undefined,
                            embeddingModel: alpha > 0 ? EMBED_MODEL : undefined,
                            embeddingBlendAlpha: alpha,
                        })
                        .map(r => r.relativePath),
            });

        const base = await run(0);
        const hybrids: Array<{ alpha: number; report: EvalReport }> = [];
        for (const a of ALPHAS) hybrids.push({ alpha: a, report: await run(a) });

        const pct = (x: number) => (x * 100).toFixed(1) + '%';
        const lines: string[] = [];
        lines.push('');
        lines.push(`══ 하이브리드 검색 측정 (질의 ${entries.length}건, 파싱오류 ${parseErrors}, 신규 벡터 ${embedded}) ══`);
        lines.push(`지표        | TF-IDF만 | ${ALPHAS.map(a => `α=${a}`.padStart(7)).join(' | ')}`);
        for (const k of KS) {
            lines.push(`recall@${k}    | ${pct(base.recallAtK[k]).padStart(7)} | ${hybrids.map(h => pct(h.report.recallAtK[k]).padStart(7)).join(' | ')}`);
        }
        lines.push(`MRR         | ${base.mrr.toFixed(3).padStart(7)} | ${hybrids.map(h => h.report.mrr.toFixed(3).padStart(7)).join(' | ')}`);
        // 최고 alpha 기준 miss/flip 진단
        const best = hybrids.reduce((p, c) => (c.report.mrr > p.report.mrr ? c : p), hybrids[0]);
        lines.push(`-- α=${best.alpha} 기준 순위 변동 --`);
        base.perQuery.forEach((bq, i) => {
            const hq = best.report.perQuery[i];
            if ((bq.firstHitRank === null) !== (hq.firstHitRank === null) || bq.firstHitRank !== hq.firstHitRank) {
                lines.push(`  · "${bq.query.slice(0, 38)}" sparse=#${bq.firstHitRank ?? 'miss'} → hybrid=#${hq.firstHitRank ?? 'miss'}`);
            }
        });
        const misses = best.report.perQuery.filter(q => q.firstHitRank === null);
        for (const m of misses) lines.push(`  ✗ miss "${m.query.slice(0, 38)}" → 상위: ${m.topPaths.slice(0, 3).join(' · ')}`);
        // eslint-disable-next-line no-console
        console.log(lines.join('\n'));

        expect(base.total).toBe(entries.length);
    });
});