From 0bac9d4b21f894252cefa84bbbb6abf10726b3bf Mon Sep 17 00:00:00 2001 From: g1nation Date: Tue, 5 May 2026 10:57:40 +0900 Subject: [PATCH] feat(scoring): implemented global caching and multi-stage density filtering v2.70.0 --- package-lock.json | 4 +- package.json | 2 +- src/retrieval/scoring.ts | 108 +++++++++++++++++++++++++-------------- 3 files changed, 72 insertions(+), 42 deletions(-) diff --git a/package-lock.json b/package-lock.json index 0198776..f9b50ed 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "g1nation", - "version": "2.69.0", + "version": "2.70.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "g1nation", - "version": "2.69.0", + "version": "2.70.0", "license": "MIT", "dependencies": { "marked": "^18.0.2" diff --git a/package.json b/package.json index 08ae2d4..376cb62 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "astra", "displayName": "Astra", "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.", - "version": "2.69.0", + "version": "2.70.0", "publisher": "g1nation", "license": "MIT", "icon": "assets/icon.png", diff --git a/src/retrieval/scoring.ts b/src/retrieval/scoring.ts index bb8f7c4..860e21e 100644 --- a/src/retrieval/scoring.ts +++ b/src/retrieval/scoring.ts @@ -10,63 +10,88 @@ // ─── Bilingual Tokenizer ─── -const STOP_WORDS_EN = new Set([ - 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', - 'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been', - 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', - 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those', - 'it', 'its', 'not', 'no', 'what', 'how', 'when', 'where', 'which', - 'who', 'whom', 'why', 'if', 'then', 'than', 'so', 'as', 'just', - 'about', 'also', 'more', 'some', 'very', 'all', 'each', 'every', - 'such', 'please', 'write', 'use', 'using', 'used' -]); +// ─── Scoring Engine Configuration ─── -const STOP_WORDS_KO = new Set([ - '그리고', '그런데', '그래서', '하지만', '또한', '또는', '해서', '하는', - '있어', '없어', '아래', '위에', '어떻게', '이것', '저것', '그것', - '이런', '저런', '그런', '여기', '거기', '필요', '사용', '관련', - '대한', '대해', '통해', '따라', '위해', '대로', '만큼' -]); +const SCORING_CONFIG = { + STOP_WORDS_EN: new Set([ + 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', + 'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been', + 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', + 'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those', + 'it', 'its', 'not', 'no', 'what', 'how', 'when', 'where', 'which', + 'who', 'whom', 'why', 'if', 'then', 'than', 'so', 'as', 'just', + 'about', 'also', 'more', 'some', 'very', 'all', 'each', 'every', + 'such', 'please', 'write', 'use', 'using', 'used' + ]), + STOP_WORDS_KO: new Set([ + '그리고', '그런데', '그래서', '하지만', '또한', '또는', '해서', '하는', + '있어', '없어', '아래', '위에', '어떻게', '이것', '저것', '그것', + '이런', '저런', '그런', '여기', '거기', '필요', '사용', '관련', + '대한', '대해', '통해', '따라', '위해', '대로', '만큼' + ]), + SYNONYM_DATA: [ + ['성능', ['performance', 'optimization', '최적화', 'speed']], + ['performance', ['성능', '최적화', 'optimization', 'speed']], + ['아키텍처', ['architecture', '구조', 'structure', 'design']], + ['architecture', ['아키텍처', '구조', 'structure', 'design']], + ['메모리', ['memory', '기억', 'cache', 'storage']], + ['memory', ['메모리', '기억', 'cache', 'storage']], + ['버그', ['bug', 'error', '오류', 'issue', 'defect']], + ['bug', ['버그', 'error', '오류', 'issue']], + ['설계', ['design', '아키텍처', 'architecture', 'pattern']], + ['design', ['설계', '아키텍처', 'architecture', 'pattern']], + ['배포', ['deploy', 'deployment', 'release', 'ci', 'cd']], + ['deploy', ['배포', 'deployment', 'release']], + ['테스트', ['test', 'testing', 'spec', 'jest', 'mocha']], + ['test', ['테스트', 'testing', 'spec']], + ['프로젝트', ['project', '프로그램', 'repo', 'repository']], + ['project', ['프로젝트', '프로그램', 'repo']], + ['방향', ['direction', '전략', 'strategy', '목표', 'goal']], + ['direction', ['방향', '전략', 'strategy', '목표']] + ] as [string, string[]][], + DENSITY_THRESHOLD: 0.15, // 발췌문 추출 시 최소 키워드 밀도 + TITLE_MULTIPLIER: 3.0, // 제목 일치 가중치 + GLOBAL_CACHE_LIMIT: 2000 +}; -// ─── Internal Cache for Tokenization ─── +// ─── Global Search State & Cache ─── const TOKEN_CACHE = new Map(); -const MAX_CACHE_SIZE = 1000; +const IDF_CACHE = new Map(); + +/** + * 캐시를 명시적으로 비웁니다. 문서 집합이 크게 변경되었을 때 사용합니다. + */ +export function clearScoringCache() { + TOKEN_CACHE.clear(); + IDF_CACHE.clear(); +} /** * 한국어/영어 혼합 텍스트를 정규화하고 토큰으로 분리합니다. - * (Performance Optimization: 내부 캐시 적용) */ export function tokenize(text: string): string[] { if (!text) return []; - - // 캐시 확인 - if (TOKEN_CACHE.has(text)) { - return TOKEN_CACHE.get(text)!; - } + if (TOKEN_CACHE.has(text)) return TOKEN_CACHE.get(text)!; - // 1. Pre-normalization: 특수문자 정제 및 표준화 const normalized = text .toLowerCase() - .replace(/[\u200B-\u200D\uFEFF]/g, '') // Zero-width spaces 제거 - .replace(/[^\w\s가-힣_.-]/g, ' ') // 허용된 문자 외에는 공백 처리 + .replace(/[\u200B-\u200D\uFEFF]/g, '') + .replace(/[^\w\s가-힣_.-]/g, ' ') .trim(); - // 2. Tokenization: 정제된 텍스트 분리 const tokens = normalized .split(/[^a-z0-9가-힣_.-]+/g) .map((t) => t.trim()) .filter((t) => t.length >= 2) - .filter((t) => !STOP_WORDS_EN.has(t) && !STOP_WORDS_KO.has(t)); + .filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t)); - // 캐시 저장 (메모리 관리: 임계값 초과 시 비우기) - if (TOKEN_CACHE.size >= MAX_CACHE_SIZE) { - TOKEN_CACHE.clear(); - } + if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear(); TOKEN_CACHE.set(text, tokens); - return tokens; } +const synonymMap = new Map(SCORING_CONFIG.SYNONYM_DATA); + /** * 동의어/관련어 확장을 수행합니다. */ @@ -231,7 +256,7 @@ export function extractBestExcerpt( const expanded = expandQuery(queryTokens); const expandedSet = new Set(expanded); - // Split into sentences (한국어 + 영어) + // 1. Sentence splitting & Initial filtering const sentences = content .split(/(?<=[.!?。!?\n])\s*/) .map((s) => s.trim()) @@ -239,15 +264,20 @@ export function extractBestExcerpt( if (sentences.length === 0) return content.slice(0, maxLength); - // Score each sentence + // 2. Phase 1: Density-based filtering (Multi-stage) + // 최소 정보 밀도를 충족하지 못하는 문장은 후보군에서 제외하거나 가중치를 낮춤 const scored = sentences.map((sentence, idx) => { const tokens = tokenize(sentence); const matchCount = tokens.filter((t) => expandedSet.has(t)).length; const density = tokens.length > 0 ? matchCount / tokens.length : 0; - return { sentence, idx, matchCount, density }; + + // 정보 밀도가 임계값 미만이면 점수를 크게 깎음 + const densityMultiplier = density >= SCORING_CONFIG.DENSITY_THRESHOLD ? 1.5 : 0.5; + + return { sentence, idx, matchCount, density, score: (matchCount + density * 2) * densityMultiplier }; }); - // Find the best window of consecutive sentences + // 3. Phase 2: Optimal window search let bestStart = 0; let bestScore = -1; let bestLen = 0; @@ -259,7 +289,7 @@ export function extractBestExcerpt( while (j < scored.length && windowText.length < maxLength) { windowText += scored[j].sentence + ' '; - windowScore += scored[j].matchCount + scored[j].density * 2; + windowScore += scored[j].score; j++; }