From 563e49932484d6469d25c5b7d8455291ddec40d2 Mon Sep 17 00:00:00 2001
From: g1nation <g1nation@users.noreply.github.com>
Date: Tue, 5 May 2026 11:10:31 +0900
Subject: [PATCH] feat(scoring): add comprehensive unit tests and refine
 bilingual tokenization (v2.72.0)

---
 package-lock.json        |  4 +-
 package.json             |  2 +-
 src/retrieval/scoring.ts | 13 ++++--
 tests/scoring.test.ts    | 86 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 7 deletions(-)
 create mode 100644 tests/scoring.test.ts

diff --git a/package-lock.json b/package-lock.json
index e02c4ab..799427f 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "g1nation",
-  "version": "2.71.0",
+  "version": "2.72.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "g1nation",
-      "version": "2.71.0",
+      "version": "2.72.0",
       "license": "MIT",
       "dependencies": {
         "marked": "^18.0.2"
diff --git a/package.json b/package.json
index 6a529c4..9641c5f 100644
--- a/package.json
+++ b/package.json
@@ -2,7 +2,7 @@
   "name": "astra",
   "displayName": "Astra",
   "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
-  "version": "2.71.0",
+  "version": "2.72.0",
   "publisher": "g1nation",
   "license": "MIT",
   "icon": "assets/icon.png",
diff --git a/src/retrieval/scoring.ts b/src/retrieval/scoring.ts
index cb410dc..47b77fa 100644
--- a/src/retrieval/scoring.ts
+++ b/src/retrieval/scoring.ts
@@ -80,11 +80,13 @@ export function tokenize(text: string): string[] {
     const normalized = text
         .toLowerCase()
         .replace(/[\u200B-\u200D\uFEFF]/g, '')
-        .replace(/[^\w\s가-힣_.-]/g, ' ')
-        .trim();
+        .replace(/[^\w\s가-힣_.-]/g, ' ');
 
-    const tokens = normalized
-        .split(/[^a-z0-9가-힣_.-]+/g)
+    // [Refinement] Split tokens at boundaries between Latin letters/digits and Hangul.
+    const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
+    
+    const tokens = splitText
+        .split(/[^a-z0-9가-힣_]+/g)
         .map((t) => t.trim())
         .filter((t) => t.length >= 2)
         .filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
@@ -297,6 +299,9 @@ export function extractBestExcerpt(
     let bestLen = 0;
 
     for (let i = 0; i < scored.length; i++) {
+        // [Refinement] A sentence with low information density cannot be the starting point of a window.
+        if (scored[i].density < SCORING_CONFIG.DENSITY_THRESHOLD) continue;
+
         let windowText = '';
         let windowScore = 0;
         let j = i;
diff --git a/tests/scoring.test.ts b/tests/scoring.test.ts
new file mode 100644
index 0000000..a7c3792
--- /dev/null
+++ b/tests/scoring.test.ts
@@ -0,0 +1,86 @@
+import { tokenize, expandQuery, scoreTfIdf, extractBestExcerpt, clearScoringCache } from '../src/retrieval/scoring';
+
+describe('Scoring Engine Unit Tests (v2.72.0)', () => { // Suite covering tokenize, expandQuery, scoreTfIdf and extractBestExcerpt.
+    beforeEach(() => {
+        clearScoringCache(); // Called before every test; presumably resets cached scoring state so tests stay independent — confirm.
+    });
+
+    test('Bilingual Tokenization: should handle mixed KO/EN text and zero-width characters', () => {
+        const text = 'Astra의 성능 최적화\u200B 전략 performance strategy.';
+        const tokens = tokenize(text);
+        // 'Astra의' has no space between Latin and Hangul, so expecting 'astra' relies on tokenize lowercasing and splitting at that boundary.
+        expect(tokens).toContain('astra');
+        expect(tokens).toContain('성능');
+        expect(tokens).toContain('최적화');
+        expect(tokens).toContain('전략');
+        expect(tokens).toContain('performance');
+        expect(tokens).toContain('strategy');
+        // No returned token may still contain the zero-width space (U+200B) that follows '최적화' in the input.
+        expect(tokens.every(t => !t.includes('\u200B'))).toBe(true);
+    });
+
+    test('Synonym Expansion: should expand "성능" to "performance"', () => {
+        const tokens = ['성능'];
+        const expanded = expandQuery(tokens);
+        // The original token must be kept. NOTE(review): the title names only "performance", but 'optimization' is asserted as well.
+        expect(expanded).toContain('성능');
+        expect(expanded).toContain('performance');
+        expect(expanded).toContain('optimization');
+    });
+
+    test('Conflict Detection: should flag documents with controversial terms', () => {
+        const query = ['설계'];
+        const docs = [
+            { title: '정상 설계 문서', content: '이 시스템은 효율적으로 설계되었습니다.' },
+            { title: '상충 발생 문서', content: '이 설계는 기존 아키텍처와 충돌 논란이 있습니다.' }
+        ];
+        
+        const results = scoreTfIdf(tokenize(query.join(' ')), docs);
+        // NOTE(review): results are read by position; this assumes scoreTfIdf returns one result per doc in input order — confirm it does not sort by score.
+        expect(results[0].conflictDetected).toBe(false);
+        expect(results[1].conflictDetected).toBe(true);
+    });
+
+    test('IDF Smoothing: should provide stable scores for small datasets', () => {
+        const query = tokenize('특이값');
+        const docs = [
+            { title: '문서 1', content: '특이값 발견' },
+            { title: '문서 2', content: '일반 내용' }
+        ];
+        
+        const results = scoreTfIdf(query, docs);
+        expect(results[0].score).toBeGreaterThan(0);
+        expect(results[1].score).toBeLessThan(results[0].score);
+        // Should not be Infinity or NaN (only the first result's score is checked for finiteness).
+        expect(isFinite(results[0].score)).toBe(true);
+    });
+
+    test('Excerpt Density Filtering: should pick high-density sentence window', () => {
+        const content = `
+            이것은 첫 번째 문장입니다. 키워드가 전혀 없습니다.
+            Astra의 성능 최적화 전략은 매우 중요합니다. 성능 향상을 위해 최적화가 필요합니다.
+            마지막 문장도 키워드가 거의 없습니다.
+        `;
+        const query = ['성능', '최적화'];
+        const excerpt = extractBestExcerpt(content, query, 100); // NOTE(review): the third argument is presumably a maximum excerpt length — confirm.
+        // Both query terms occur only in the middle line of the content; the excerpt must contain them and omit the opening sentence.
+        expect(excerpt).toContain('성능');
+        expect(excerpt).toContain('최적화');
+        expect(excerpt).not.toContain('첫 번째 문장');
+    });
+
+    test('Performance Benchmark: should process 100 documents within threshold', () => {
+        const query = tokenize('performance optimization');
+        const largeDocs = Array.from({ length: 100 }, (_, i) => ({
+            title: `Document ${i}`,
+            content: `Content of document ${i} with performance and optimization keywords repeated.`
+        }));
+        // Times a single scoreTfIdf call over the 100 generated documents using wall-clock milliseconds from Date.now().
+        const start = Date.now();
+        scoreTfIdf(query, largeDocs);
+        const duration = Date.now() - start;
+        
+        console.log(`[Benchmark] 100 docs processing time: ${duration}ms`);
+        expect(duration).toBeLessThan(200); // NOTE(review): wall-clock limit may be flaky on slow machines; the cache is cleared in beforeEach and scoreTfIdf runs once here, so any caching benefit is not evident from this test — confirm.
+    });
+});