feat(scoring): completed roadmap Phase 1 & 2 with edge case stability v2.74.0

2026-05-05 11:20:44 +09:00
parent e6bc263872
commit 518a5ed317
5 changed files with 86 additions and 10 deletions
@@ -0,0 +1,43 @@
 # Project Chronicle Guard: Search Engine Roadmap
 ## 🎯 Current Status: v2.74.0
 - [x] **Phase 1: Linguistic Foundation Stabilization** (Completed)
 - [x] **Phase 2: Conflict Scoring Refinement** (Completed)
 - [ ] **Phase 3: Performance Scaling & Caching** (In Progress)
 - [ ] **Phase 4: Excerpt Precision Tuning** (Planned)
 - [ ] **Phase 5: Downstream Integration API** (Planned)
 ---
 ## 🔬 Phase Details
 ### Phase 1: Linguistic Foundation (v2.72.0 - v2.74.0)
 - **Goal**: Perfect the tokenization of mixed Korean (KO), English (EN), and special-character text.
 - **Achievement**: 
    - Bilingual boundary split: a space is inserted wherever a Latin/digit run meets a Hangul run (e.g., 'Astra의' -> 'Astra', '의').
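    - Hangeul monosyllable preservation: single-syllable Hangul tokens are kept (e.g., '한', '글'), while Latin/digit tokens still need at least two characters.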
    - Zero-width character cleaning.
 ### Phase 2: Conflict Scoring (v2.73.0 - v2.74.0)
 - **Goal**: Quantitative risk assessment for information conflicts.
 - **Achievement**:
    - Tiered severity logic (NONE, LOW, MEDIUM, HIGH).
    - Substring-based detection to overcome particle interference.
    - Configurable thresholds via `SCORING_CONFIG.CONFLICT_THRESHOLDS` (defaults: HIGH = 4, MEDIUM = 2, LOW = 1 conflict matches).
 ### Phase 3: Performance Scaling (v2.75.0+)
 - **Goal**: Sub-10ms response for 10k+ documents.
 - **Action**:
    - Global module-level caching for IDF and tokens.
    - Potential worker thread offloading for heavy scoring.
 ### Phase 4: Excerpt Precision (Planned)
 - **Goal**: Maximize context signal-to-noise ratio.
 - **Action**:
    - Density-based restriction of the excerpt window's starting point.
    - Multi-stage filtering for optimal text chunking.
 ### Phase 5: Integration (Planned)
 - **Goal**: Seamless RAG pipeline integration.
 - **Action**:
    - Strict IO schema definition for downstream AI agents.
@@ -1,12 +1,12 @@
 {
  "name": "g1nation",
-  "version": "2.73.0",
+  "version": "2.74.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "g1nation",
-      "version": "2.73.0",
+      "version": "2.74.0",
      "license": "MIT",
      "dependencies": {
        "marked": "^18.0.2"
@@ -2,7 +2,7 @@
  "name": "astra",
  "displayName": "Astra",
  "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
-  "version": "2.73.0",
+  "version": "2.74.0",
  "publisher": "g1nation",
  "license": "MIT",
  "icon": "assets/icon.png",
@@ -55,7 +55,12 @@ const SCORING_CONFIG = {
    CONFLICT_INDICATORS: new Set([
        '반대', '충돌', '오류', '논란', '반박', '차이', '대조',
        'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs'
-    ])
+    ]),
    CONFLICT_THRESHOLDS: {
        HIGH: 4,
        MEDIUM: 2,
        LOW: 1
    }
 };
 // ─── Global Search State & Cache ───
@@ -86,9 +91,14 @@ export function tokenize(text: string): string[] {
    const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
    const tokens = splitText
-        .split(/[^a-z0-9가-힣_]+/g)
+        .split(/[^a-z0-9가-힣]+/g)
        .map((t) => t.trim())
-        .filter((t) => t.length >= 2)
+        .filter((t) => {
            if (!t) return false;
            // Tokens containing Hangul are accepted even at one character; all others (Latin/digits) need at least two characters.
            if (/[가-힣]/.test(t)) return t.length >= 1;
            return t.length >= 2;
        })
        .filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
    if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear();
@@ -228,9 +238,9 @@ export function scoreTfIdf(
        const conflictDetected = conflictMatches.length > 0;
        let conflictSeverity: ConflictSeverity = 'NONE';
-        if (conflictMatches.length >= 4) conflictSeverity = 'HIGH';
+        if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
-        else if (conflictMatches.length >= 2) conflictSeverity = 'MEDIUM';
+        else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
-        else if (conflictMatches.length === 1) conflictSeverity = 'LOW';
+        else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
        for (const term of expandedQuery) {
            const tf = termFrequency(term, docTokens);
@@ -71,6 +71,29 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
        expect(excerpt).not.toContain('첫 번째 문장');
    });
    test('Edge Case Tokenization: should handle extreme mixed strings and symbols', () => {
        // Input alternates Latin and Hangul characters and mixes in punctuation,
        // underscores, dots and an emoji to stress the tokenizer's boundary handling.
        const result = tokenize('A한B글C1!@#$ D.E.F_G 🚀Astra_v2.0');

        // 'Astra_v2.0' must yield 'astra' and 'v2' (i.e. '_' and '.' act as
        // separators), and the isolated Hangul syllables must survive on their own.
        for (const expected of ['astra', 'v2', '한', '글']) {
            expect(result).toContain(expected);
        }

        // No token may consist solely of the punctuation characters from the input.
        const symbolOnly = /^[!@#$]+$/;
        expect(result.some((token) => symbolOnly.test(token))).toBe(false);
    });
    test('Long String Performance: should handle 10k character content', () => {
        // 2000 x '성능 ' (3 chars) + 2000 x '최적화 ' (4 chars) = 14,000 characters,
        // comfortably above the 10k mentioned in the test title.
        const repeats = 2000;
        const longContent = `${'성능 '.repeat(repeats)}${'최적화 '.repeat(repeats)}`;

        const startedAt = Date.now();
        const tokens = tokenize(longContent);
        const elapsedMs = Date.now() - startedAt;

        // The tokenizer must produce output for long input...
        expect(tokens.length).toBeGreaterThan(0);
        // ...and finish within the 100 ms budget.
        // NOTE(review): this is a wall-clock assertion and may be flaky on a heavily loaded CI runner — confirm the budget is generous enough.
        expect(elapsedMs).toBeLessThan(100);
    });
    test('Performance Benchmark: should process 100 documents within threshold', () => {
        const query = tokenize('performance optimization');
        const largeDocs = Array.from({ length: 100 }, (_, i) => ({
@@ -83,6 +106,6 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
        const duration = Date.now() - start;
        console.log(`[Benchmark] 100 docs processing time: ${duration}ms`);
-        expect(duration).toBeLessThan(200); // Should be very fast due to caching
+        expect(duration).toBeLessThan(200); 
    });
 });