// connectai/tests/scoring.test.ts

import { tokenize, expandQuery, scoreTfIdf, extractBestExcerpt, clearScoringCache } from '../src/retrieval/scoring';

describe('Scoring Engine Unit Tests (v2.72.0)', () => {
    // Call clearScoringCache() ahead of every test (presumably so cached scoring
    // state from one case cannot leak into the next).
    beforeEach((): void => {
        clearScoringCache();
    });

    // Tokenizes a sentence mixing Korean and English words with an embedded
    // zero-width space (U+200B) and checks that every expected word is present.
    test('Bilingual Tokenization: should handle mixed KO/EN text and zero-width characters', () => {
        const sample = 'Astra의 성능 최적화\u200B 전략 performance strategy.';
        const produced = tokenize(sample);

        const expectedTokens = ['astra', '성능', '최적화', '전략', 'performance', 'strategy'];
        for (const expected of expectedTokens) {
            expect(produced).toContain(expected);
        }

        // The zero-width space must not survive inside any token.
        expect(produced.some(t => t.includes('\u200B'))).toBe(false);
    });

    // Passes the single Korean token '성능' to expandQuery and checks that the
    // original term and both English expansions appear in the result.
    test('Synonym Expansion: should expand "성능" to "performance"', () => {
        const expanded = expandQuery(['성능']);

        const requiredTerms = ['성능', 'performance', 'optimization'];
        requiredTerms.forEach(term => {
            expect(expanded).toContain(term);
        });
    });

    // Scores three documents against the query '설계' and checks the
    // conflictSeverity reported for each one: NONE, LOW, then HIGH.
    test('Conflict Detection & Severity: should flag documents with tiered severity', () => {
        const queryTerms = ['설계'];
        const corpus = [
            { title: '정상 설계 문서', content: '이 시스템은 효율적으로 설계되었습니다.' },
            { title: '상충 발생 문서 (LOW)', content: '이 설계는 기존 아키텍처와 충돌 위험이 있습니다.' },
            { title: '강한 상충 문서 (HIGH)', content: '이 설계는 오류가 많고 논란이 크며 반대 의견과 반박이 거셉니다.' }
        ];

        const queryTokens = tokenize(queryTerms.join(' '));
        const scored = scoreTfIdf(queryTokens, corpus);

        // NOTE(review): the index-based checks assume scoreTfIdf returns results
        // in the same order as the input documents - confirm against the implementation.
        const expectedSeverities = ['NONE', 'LOW', 'HIGH'];
        expectedSeverities.forEach((severity, index) => {
            expect(scored[index].conflictSeverity).toBe(severity);
        });
    });

    // Scores a two-document corpus where only the first document contains the
    // query term, and checks that the scores are ordered and numerically stable.
    test('IDF Smoothing: should provide stable scores for small datasets', () => {
        const query = tokenize('특이값');
        const docs = [
            { title: '문서 1', content: '특이값 발견' },
            { title: '문서 2', content: '일반 내용' }
        ];

        const results = scoreTfIdf(query, docs);
        const firstScore = results[0].score;
        const secondScore = results[1].score;

        expect(firstScore).toBeGreaterThan(0);
        expect(secondScore).toBeLessThan(firstScore);
        // Neither score may be Infinity, -Infinity or NaN. The ordering checks above
        // would still pass for firstScore = Infinity or secondScore = -Infinity, so
        // both values are checked explicitly. Number.isFinite does not coerce its
        // argument, unlike the global isFinite.
        expect(Number.isFinite(firstScore)).toBe(true);
        expect(Number.isFinite(secondScore)).toBe(true);
    });

    // Extracts an excerpt from a passage whose middle line repeats the query
    // keywords, and checks the excerpt has the keywords but not the opening sentence.
    test('Excerpt Density Filtering: should pick high-density sentence window', () => {
        const passage = `
            이것은 첫 번째 문장입니다. 키워드가 전혀 없습니다.
            Astra의 성능 최적화 전략은 매우 중요합니다. 성능 향상을 위해 최적화가 필요합니다.
            마지막 문장도 키워드가 거의 없습니다.
        `;
        const keywords = ['성능', '최적화'];
        // Pass a copy so the assertions below iterate over the untouched keyword list.
        const excerpt = extractBestExcerpt(passage, [...keywords], 100);

        for (const keyword of keywords) {
            expect(excerpt).toContain(keyword);
        }
        expect(excerpt).not.toContain('첫 번째 문장');
    });

    // Tokenizes a string that alternates scripts, digits, punctuation and an emoji,
    // then a sentence with technical names, and checks which tokens are kept.
    test('Edge Case Tokenization: should handle extreme mixed strings and symbols', () => {
        const mixedTokens = tokenize('A한B글C1!@#$ D.E.F_G 🚀Astra_v2.0');

        // Splitting at language boundaries must cope with alternating characters.
        expect(mixedTokens).toContain('astra');
        // [Structural Fix] A version number containing a dot (.) must be preserved.
        expect(mixedTokens).toContain('v2.0');
        expect(mixedTokens).toContain('한');
        expect(mixedTokens).toContain('글');

        // Tokens made only of symbols must be filtered out (except the preserved ones).
        const symbolOnly = /^[!@#$]+$/;
        expect(mixedTokens.some(t => symbolOnly.test(t))).toBe(false);

        // [New Feature] Technical symbols are preserved (C++, C#, .net).
        const techTokens = tokenize('I love C++ and C# programming on .net platform.');
        for (const preserved of ['c++', 'c#', '.net']) {
            expect(techTokens).toContain(preserved);
        }
    });

    // Times a single tokenize() call on a long repeated-word string
    // (2000 x '성능 ' followed by 2000 x '최적화 ', 14,000 characters in total).
    test('Long String Performance: should handle 10k character content', () => {
        const longContent = ['성능 '.repeat(2000), '최적화 '.repeat(2000)].join('');

        const startedAt = Date.now();
        const tokens = tokenize(longContent);
        const elapsedMs = Date.now() - startedAt;

        expect(tokens.length).toBeGreaterThan(0);
        // The tokenizer should stay fast on long text; the 200 ms ceiling is
        // deliberately loose to absorb CI timing variance.
        expect(elapsedMs).toBeLessThan(200);
    });

    // Extracts an excerpt from a three-line passage whose middle line holds the
    // keywords, and checks that the lines before and after it are included too.
    test('Contextual Completeness: should include adjacent sentences for semantic padding', () => {
        const passage = `
            도입부 문장입니다.
            핵심 키워드 성능 최적화가 포함된 문장입니다.
            마무리 문장입니다.
        `;
        const excerpt = extractBestExcerpt(passage, ['성능', '최적화'], 200);

        // Padding should pull in the introduction and conclusion around the keyword line.
        const expectedFragments = ['도입부 문장', '핵심 키워드', '마무리 문장'];
        expectedFragments.forEach(fragment => {
            expect(excerpt).toContain(fragment);
        });
    });

    // Times one scoreTfIdf() call over 100 generated documents, logs the
    // elapsed time and checks it stays under 200 ms.
    test('Performance Benchmark: should process 100 documents within threshold', () => {
        const query = tokenize('performance optimization');

        // Build one synthetic document for a given index.
        const buildDoc = (i: number) => ({
            title: `Document ${i}`,
            content: `Content of document ${i} with performance and optimization keywords repeated.`
        });
        const largeDocs = Array.from({ length: 100 }).map((_, i) => buildDoc(i));

        const startedAt = Date.now();
        scoreTfIdf(query, largeDocs);
        const elapsedMs = Date.now() - startedAt;

        console.log(`[Benchmark] 100 docs processing time: ${elapsedMs}ms`);
        expect(elapsedMs).toBeLessThan(200);
    });
});