feat: ConnectAI structural hardening and retrieval precision improvements

2026-05-05 21:37:45 +09:00
parent c2f17cfb03
commit 466e9e4d5f
17 changed files with 424 additions and 160 deletions
@@ -52,12 +52,16 @@ export function selectWithinBudget(
    // 1. Sort by score descending
    const sorted = [...chunks].sort((a, b) => b.score - a.score);

-    // 2. Deduplicate by filePath
-    const seen = new Set<string>();
-    const deduped = sorted.filter((chunk) => {
+    // 2. [Structural Fix] 파일당 청크 수 제한 완화 (Deduplication -> Multi-context)
+    const fileChunkCounts = new Map<string, number>();
+    const filtered = sorted.filter((chunk) => {
        const key = chunk.metadata.filePath || chunk.id;
-        if (seen.has(key)) return false;
-        seen.add(key);
+        const count = fileChunkCounts.get(key) || 0;
+        
+        // 파일당 최대 3개까지의 주요 맥락 허용 (정보 유실 방지)
+        if (count >= 3) return false;
+        
+        fileChunkCounts.set(key, count + 1);
        return true;
    });

@@ -66,7 +70,7 @@ export function selectWithinBudget(
    const dropped: RetrievalChunk[] = [];
    let tokensUsed = 0;

-    for (const chunk of deduped) {
+    for (const chunk of filtered) {
        const chunkTokens = chunk.tokenEstimate || estimateTokens(chunk.content);

        if (selected.length >= cfg.maxChunks) {
@@ -65,14 +65,12 @@ const SCORING_CONFIG = {

 // ─── Global Search State & Cache ───
 const TOKEN_CACHE = new Map<string, string[]>();
-const IDF_CACHE = new Map<string, number>();

 /**
 * Explicitly clears the scoring cache. Use this when the document set has changed significantly.
 */
 export function clearScoringCache() {
    TOKEN_CACHE.clear();
-    IDF_CACHE.clear();
 }

 /**
@@ -85,16 +83,18 @@ export function tokenize(text: string): string[] {
    const normalized = text
        .toLowerCase()
        .replace(/[\u200B-\u200D\uFEFF]/g, '')
-        .replace(/[^\w\s가-힣_.-]/g, ' ');
+        .replace(/[^\w\s가-힣_+#.-]/g, ' ');

    // [Refinement] 영문/숫자와 한글 경계에서 분리하도록 개선
    const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
    
    const tokens = splitText
-        .split(/[^a-z0-9가-힣]+/g)
-        .map((t) => t.trim())
+        .split(/[^a-z0-9가-힣+#.-]+/g) // [Structural Fix] C++, C#, .net 등 특수 기호 보존
+        .map((t) => t.trim().replace(/[.,]$/g, '')) // [Refinement] 문장 끝 마침표/쉼표 제거
        .filter((t) => {
            if (!t) return false;
+            // 특수문자만 남은 토큰 제거 (단일 + 나 . 등)
+            if (/^[+#.-]+$/.test(t)) return false;
            // 한글이 포함된 경우 한 글자라도 허용, 그 외(영문/숫자)는 2글자 이상
            if (/[가-힣]/.test(t)) return t.length >= 1;
            return t.length >= 2;
@@ -110,29 +110,9 @@ const synonymMap = new Map<string, string[]>(SCORING_CONFIG.SYNONYM_DATA);

 /**
 * 동의어/관련어 확장을 수행합니다.
+ * SCORING_CONFIG의 중앙 데이터를 참조합니다.
 */
 export function expandQuery(tokens: string[]): string[] {
-    const synonymMap = new Map<string, string[]>([
-        ['성능', ['performance', 'optimization', '최적화', 'speed']],
-        ['performance', ['성능', '최적화', 'optimization', 'speed']],
-        ['아키텍처', ['architecture', '구조', 'structure', 'design']],
-        ['architecture', ['아키텍처', '구조', 'structure', 'design']],
-        ['메모리', ['memory', '기억', 'cache', 'storage']],
-        ['memory', ['메모리', '기억', 'cache', 'storage']],
-        ['버그', ['bug', 'error', '오류', 'issue', 'defect']],
-        ['bug', ['버그', 'error', '오류', 'issue']],
-        ['설계', ['design', '아키텍처', 'architecture', 'pattern']],
-        ['design', ['설계', '아키텍처', 'architecture', 'pattern']],
-        ['배포', ['deploy', 'deployment', 'release', 'ci', 'cd']],
-        ['deploy', ['배포', 'deployment', 'release']],
-        ['테스트', ['test', 'testing', 'spec', 'jest', 'mocha']],
-        ['test', ['테스트', 'testing', 'spec']],
-        ['프로젝트', ['project', '프로그램', 'repo', 'repository']],
-        ['project', ['프로젝트', '프로그램', 'repo']],
-        ['방향', ['direction', '전략', 'strategy', '목표', 'goal']],
-        ['direction', ['방향', '전략', 'strategy', '목표']]
-    ]);
-
    const expanded = new Set(tokens);
    for (const token of tokens) {
        const synonyms = synonymMap.get(token);
@@ -213,7 +193,7 @@ export function scoreTfIdf(
    // Expand query with synonyms
    const expandedQuery = expandQuery(queryTokens);

-    // Compute IDF for each query term
+    // Compute IDF for each query term (the cache is created fresh on every call, not shared globally)
    const idfCache = new Map<string, number>();
    for (const term of expandedQuery) {
        if (!idfCache.has(term)) {
@@ -271,15 +251,28 @@ export function scoreTfIdf(
        // Title match bonus for exact query term presence
        const titleBoost = queryTokens.some((t) => titleTokens.has(t)) ? 0.2 : 0;

+        // [Structural Fix] Conflict Penalty 및 음수 점수 방지 (Floor Zero 정책)
+        const conflictPenalty = conflictSeverity === 'HIGH' ? -1.0
+                              : conflictSeverity === 'MEDIUM' ? -0.5
+                              : conflictSeverity === 'LOW' ? -0.2
+                              : 0;
+        
+        const finalScore = Math.max(0, score + recencyBoost + titleBoost + conflictPenalty);
+
+        // [Structural Fix] Information Density: 쿼리 커버리지 기반으로 계산 방식 정상화
+        const queryCoverage = expandedQuery.length > 0 
+            ? new Set(matchedTerms).size / expandedQuery.length 
+            : 0;
+
        return {
            index,
-            score: score + recencyBoost + titleBoost,
+            score: finalScore,
            titleBoost,
            recencyBoost,
            matchedTerms: [...new Set(matchedTerms)],
            conflictDetected,
            conflictSeverity,
-            informationDensity
+            informationDensity: queryCoverage // 밀도를 쿼리 커버리지로 대체
        };
    });
 }
@@ -343,6 +336,17 @@ export function extractBestExcerpt(
        }
    }

+    // [Structural Fix] 임계값을 충족하는 윈도우가 없을 경우 Fallback (빈 컨텍스트 방지)
+    if (bestScore <= 0) {
+        const fallbackSentences = [...scored] // [Structural Fix] 원본 배열 변이 방지 (Shallow Copy)
+            .sort((a, b) => b.score - a.score)
+            .slice(0, 2) // 가장 관련성 높은 문장 2개만 추출
+            .map((s) => s.sentence);
+        
+        const fallbackResult = fallbackSentences.join(' ');
+        return fallbackResult.length > maxLength ? fallbackResult.slice(0, maxLength - 3) + '...' : fallbackResult;
+    }
+
    // 4. Result construction with semantic context padding
    let finalStart = bestStart;
    let finalEnd = bestStart + bestLen;