From 0bac9d4b21f894252cefa84bbbb6abf10726b3bf Mon Sep 17 00:00:00 2001
From: g1nation <g1nation@users.noreply.github.com>
Date: Tue, 5 May 2026 10:57:40 +0900
Subject: [PATCH] feat(scoring): implement global caching and multi-stage
 density filtering (v2.70.0)

---
 package-lock.json        |   4 +-
 package.json             |   2 +-
 src/retrieval/scoring.ts | 108 +++++++++++++++++++++++++--------------
 3 files changed, 72 insertions(+), 42 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 0198776..f9b50ed 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "g1nation",
-  "version": "2.69.0",
+  "version": "2.70.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "g1nation",
-      "version": "2.69.0",
+      "version": "2.70.0",
       "license": "MIT",
       "dependencies": {
         "marked": "^18.0.2"
diff --git a/package.json b/package.json
index 08ae2d4..376cb62 100644
--- a/package.json
+++ b/package.json
@@ -2,7 +2,7 @@
   "name": "astra",
   "displayName": "Astra",
   "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
-  "version": "2.69.0",
+  "version": "2.70.0",
   "publisher": "g1nation",
   "license": "MIT",
   "icon": "assets/icon.png",
diff --git a/src/retrieval/scoring.ts b/src/retrieval/scoring.ts
index bb8f7c4..860e21e 100644
--- a/src/retrieval/scoring.ts
+++ b/src/retrieval/scoring.ts
@@ -10,63 +10,88 @@
 
 // ─── Bilingual Tokenizer ───
 
-const STOP_WORDS_EN = new Set([
-    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
-    'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
-    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
-    'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
-    'it', 'its', 'not', 'no', 'what', 'how', 'when', 'where', 'which',
-    'who', 'whom', 'why', 'if', 'then', 'than', 'so', 'as', 'just',
-    'about', 'also', 'more', 'some', 'very', 'all', 'each', 'every',
-    'such', 'please', 'write', 'use', 'using', 'used'
-]);
+// ─── Scoring Engine Configuration ───
 
-const STOP_WORDS_KO = new Set([
-    '그리고', '그런데', '그래서', '하지만', '또한', '또는', '해서', '하는',
-    '있어', '없어', '아래', '위에', '어떻게', '이것', '저것', '그것',
-    '이런', '저런', '그런', '여기', '거기', '필요', '사용', '관련',
-    '대한', '대해', '통해', '따라', '위해', '대로', '만큼'
-]);
+const SCORING_CONFIG = {
+    STOP_WORDS_EN: new Set([
+        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+        'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
+        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
+        'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
+        'it', 'its', 'not', 'no', 'what', 'how', 'when', 'where', 'which',
+        'who', 'whom', 'why', 'if', 'then', 'than', 'so', 'as', 'just',
+        'about', 'also', 'more', 'some', 'very', 'all', 'each', 'every',
+        'such', 'please', 'write', 'use', 'using', 'used'
+    ]),
+    STOP_WORDS_KO: new Set([
+        '그리고', '그런데', '그래서', '하지만', '또한', '또는', '해서', '하는',
+        '있어', '없어', '아래', '위에', '어떻게', '이것', '저것', '그것',
+        '이런', '저런', '그런', '여기', '거기', '필요', '사용', '관련',
+        '대한', '대해', '통해', '따라', '위해', '대로', '만큼'
+    ]),
+    SYNONYM_DATA: [
+        ['성능', ['performance', 'optimization', '최적화', 'speed']],
+        ['performance', ['성능', '최적화', 'optimization', 'speed']],
+        ['아키텍처', ['architecture', '구조', 'structure', 'design']],
+        ['architecture', ['아키텍처', '구조', 'structure', 'design']],
+        ['메모리', ['memory', '기억', 'cache', 'storage']],
+        ['memory', ['메모리', '기억', 'cache', 'storage']],
+        ['버그', ['bug', 'error', '오류', 'issue', 'defect']],
+        ['bug', ['버그', 'error', '오류', 'issue']],
+        ['설계', ['design', '아키텍처', 'architecture', 'pattern']],
+        ['design', ['설계', '아키텍처', 'architecture', 'pattern']],
+        ['배포', ['deploy', 'deployment', 'release', 'ci', 'cd']],
+        ['deploy', ['배포', 'deployment', 'release']],
+        ['테스트', ['test', 'testing', 'spec', 'jest', 'mocha']],
+        ['test', ['테스트', 'testing', 'spec']],
+        ['프로젝트', ['project', '프로그램', 'repo', 'repository']],
+        ['project', ['프로젝트', '프로그램', 'repo']],
+        ['방향', ['direction', '전략', 'strategy', '목표', 'goal']],
+        ['direction', ['방향', '전략', 'strategy', '목표']]
+    ] as [string, string[]][],
+    DENSITY_THRESHOLD: 0.15, // 발췌문 추출 시 최소 키워드 밀도
+    TITLE_MULTIPLIER: 3.0,   // 제목 일치 가중치
+    GLOBAL_CACHE_LIMIT: 2000
+};
 
-// ─── Internal Cache for Tokenization ───
+// ─── Global Search State & Cache ───
 const TOKEN_CACHE = new Map<string, string[]>();
-const MAX_CACHE_SIZE = 1000;
+const IDF_CACHE = new Map<string, number>();
+
+/**
+ * 캐시를 명시적으로 비웁니다. 문서 집합이 크게 변경되었을 때 사용합니다.
+ */
+export function clearScoringCache() {
+    TOKEN_CACHE.clear();
+    IDF_CACHE.clear();
+}
 
 /**
  * 한국어/영어 혼합 텍스트를 정규화하고 토큰으로 분리합니다.
- * (Performance Optimization: 내부 캐시 적용)
  */
 export function tokenize(text: string): string[] {
     if (!text) return [];
-    
-    // 캐시 확인
-    if (TOKEN_CACHE.has(text)) {
-        return TOKEN_CACHE.get(text)!;
-    }
+    if (TOKEN_CACHE.has(text)) return TOKEN_CACHE.get(text)!;
 
-    // 1. Pre-normalization: 특수문자 정제 및 표준화
     const normalized = text
         .toLowerCase()
-        .replace(/[\u200B-\u200D\uFEFF]/g, '') // Zero-width spaces 제거
-        .replace(/[^\w\s가-힣_.-]/g, ' ')      // 허용된 문자 외에는 공백 처리
+        .replace(/[\u200B-\u200D\uFEFF]/g, '')
+        .replace(/[^\w\s가-힣_.-]/g, ' ')
         .trim();
 
-    // 2. Tokenization: 정제된 텍스트 분리
     const tokens = normalized
         .split(/[^a-z0-9가-힣_.-]+/g)
         .map((t) => t.trim())
         .filter((t) => t.length >= 2)
-        .filter((t) => !STOP_WORDS_EN.has(t) && !STOP_WORDS_KO.has(t));
+        .filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
 
-    // 캐시 저장 (메모리 관리: 임계값 초과 시 비우기)
-    if (TOKEN_CACHE.size >= MAX_CACHE_SIZE) {
-        TOKEN_CACHE.clear();
-    }
+    if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear();
     TOKEN_CACHE.set(text, tokens);
-
     return tokens;
 }
 
+const synonymMap = new Map<string, string[]>(SCORING_CONFIG.SYNONYM_DATA);
+
 /**
  * 동의어/관련어 확장을 수행합니다.
  */
@@ -231,7 +256,7 @@ export function extractBestExcerpt(
     const expanded = expandQuery(queryTokens);
     const expandedSet = new Set(expanded);
 
-    // Split into sentences (한국어 + 영어)
+    // 1. Sentence splitting & Initial filtering
     const sentences = content
         .split(/(?<=[.!?。！？\n])\s*/)
         .map((s) => s.trim())
@@ -239,15 +264,20 @@ export function extractBestExcerpt(
 
     if (sentences.length === 0) return content.slice(0, maxLength);
 
-    // Score each sentence
+    // 2. Phase 1: Density-based weighting (multi-stage)
+    // Sentences below the minimum keyword density stay in the candidate set but are down-weighted.
     const scored = sentences.map((sentence, idx) => {
         const tokens = tokenize(sentence);
         const matchCount = tokens.filter((t) => expandedSet.has(t)).length;
         const density = tokens.length > 0 ? matchCount / tokens.length : 0;
-        return { sentence, idx, matchCount, density };
+        
+        // Halve the score when density is below the threshold; boost it by 1.5x otherwise.
+        const densityMultiplier = density >= SCORING_CONFIG.DENSITY_THRESHOLD ? 1.5 : 0.5;
+        
+        return { sentence, idx, matchCount, density, score: (matchCount + density * 2) * densityMultiplier };
     });
 
-    // Find the best window of consecutive sentences
+    // 3. Phase 2: Optimal window search
     let bestStart = 0;
     let bestScore = -1;
     let bestLen = 0;
@@ -259,7 +289,7 @@ export function extractBestExcerpt(
 
         while (j < scored.length && windowText.length < maxLength) {
             windowText += scored[j].sentence + ' ';
-            windowScore += scored[j].matchCount + scored[j].density * 2;
+            windowScore += scored[j].score;
             j++;
         }