From 563e49932484d6469d25c5b7d8455291ddec40d2 Mon Sep 17 00:00:00 2001 From: g1nation Date: Tue, 5 May 2026 11:10:31 +0900 Subject: [PATCH] feat(scoring): added comprehensive unit tests and refined bilingual tokenization v2.72.0 --- package-lock.json | 4 +- package.json | 2 +- src/retrieval/scoring.ts | 13 ++++-- tests/scoring.test.ts | 86 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 7 deletions(-) create mode 100644 tests/scoring.test.ts diff --git a/package-lock.json b/package-lock.json index e02c4ab..799427f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "g1nation", - "version": "2.71.0", + "version": "2.72.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "g1nation", - "version": "2.71.0", + "version": "2.72.0", "license": "MIT", "dependencies": { "marked": "^18.0.2" diff --git a/package.json b/package.json index 6a529c4..9641c5f 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "astra", "displayName": "Astra", "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.", - "version": "2.71.0", + "version": "2.72.0", "publisher": "g1nation", "license": "MIT", "icon": "assets/icon.png", diff --git a/src/retrieval/scoring.ts b/src/retrieval/scoring.ts index cb410dc..47b77fa 100644 --- a/src/retrieval/scoring.ts +++ b/src/retrieval/scoring.ts @@ -80,11 +80,13 @@ export function tokenize(text: string): string[] { const normalized = text .toLowerCase() .replace(/[\u200B-\u200D\uFEFF]/g, '') - .replace(/[^\w\s가-힣_.-]/g, ' ') - .trim(); + .replace(/[^\w\s가-힣_.-]/g, ' '); - const tokens = normalized - .split(/[^a-z0-9가-힣_.-]+/g) + // [Refinement] 영문/숫자와 한글 경계에서 분리하도록 개선 + const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2'); + + const tokens = splitText + .split(/[^a-z0-9가-힣_]+/g) .map((t) => t.trim()) .filter((t) => t.length >= 2) .filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t)); @@ -297,6 +299,9 @@ export function extractBestExcerpt( let bestLen = 0; for (let i = 0; i < scored.length; i++) { + // [Refinement] 정보 밀도가 낮은 문장은 윈도우의 시작점이 될 수 없음 + if (scored[i].density < SCORING_CONFIG.DENSITY_THRESHOLD) continue; + let windowText = ''; let windowScore = 0; let j = i; diff --git a/tests/scoring.test.ts b/tests/scoring.test.ts new file mode 100644 index 0000000..a7c3792 --- /dev/null +++ b/tests/scoring.test.ts @@ -0,0 +1,86 @@ +import { tokenize, expandQuery, scoreTfIdf, extractBestExcerpt, clearScoringCache } from '../src/retrieval/scoring'; + +describe('Scoring Engine Unit Tests (v2.72.0)', () => { + beforeEach(() => { + clearScoringCache(); + }); + + test('Bilingual Tokenization: should handle mixed KO/EN text and zero-width characters', () => { + const text = 'Astra의 성능 최적화\u200B 전략 performance strategy.'; + const tokens = tokenize(text); + + expect(tokens).toContain('astra'); + expect(tokens).toContain('성능'); + expect(tokens).toContain('최적화'); + expect(tokens).toContain('전략'); + expect(tokens).toContain('performance'); + expect(tokens).toContain('strategy'); + // Zero-width space should be gone and not cause issues + expect(tokens.every(t => !t.includes('\u200B'))).toBe(true); + }); + + test('Synonym Expansion: should expand "성능" to "performance"', () => { + const tokens = ['성능']; + const expanded = expandQuery(tokens); + + expect(expanded).toContain('성능'); + expect(expanded).toContain('performance'); + expect(expanded).toContain('optimization'); + }); + + test('Conflict Detection: should flag documents with controversial terms', () => { + const query = ['설계']; + const docs = [ + { title: '정상 설계 문서', content: '이 시스템은 효율적으로 설계되었습니다.' }, + { title: '상충 발생 문서', content: '이 설계는 기존 아키텍처와 충돌 논란이 있습니다.' } + ]; + + const results = scoreTfIdf(tokenize(query.join(' ')), docs); + + expect(results[0].conflictDetected).toBe(false); + expect(results[1].conflictDetected).toBe(true); + }); + + test('IDF Smoothing: should provide stable scores for small datasets', () => { + const query = tokenize('특이값'); + const docs = [ + { title: '문서 1', content: '특이값 발견' }, + { title: '문서 2', content: '일반 내용' } + ]; + + const results = scoreTfIdf(query, docs); + expect(results[0].score).toBeGreaterThan(0); + expect(results[1].score).toBeLessThan(results[0].score); + // Should not be Infinity or NaN + expect(isFinite(results[0].score)).toBe(true); + }); + + test('Excerpt Density Filtering: should pick high-density sentence window', () => { + const content = ` + 이것은 첫 번째 문장입니다. 키워드가 전혀 없습니다. + Astra의 성능 최적화 전략은 매우 중요합니다. 성능 향상을 위해 최적화가 필요합니다. + 마지막 문장도 키워드가 거의 없습니다. + `; + const query = ['성능', '최적화']; + const excerpt = extractBestExcerpt(content, query, 100); + + expect(excerpt).toContain('성능'); + expect(excerpt).toContain('최적화'); + expect(excerpt).not.toContain('첫 번째 문장'); + }); + + test('Performance Benchmark: should process 100 documents within threshold', () => { + const query = tokenize('performance optimization'); + const largeDocs = Array.from({ length: 100 }, (_, i) => ({ + title: `Document ${i}`, + content: `Content of document ${i} with performance and optimization keywords repeated.` + })); + + const start = Date.now(); + scoreTfIdf(query, largeDocs); + const duration = Date.now() - start; + + console.log(`[Benchmark] 100 docs processing time: ${duration}ms`); + expect(duration).toBeLessThan(200); // Should be very fast due to caching + }); +});