From 518a5ed31744c3f2d419cec3f8ff80230e1ea18c Mon Sep 17 00:00:00 2001
From: g1nation <g1nation@users.noreply.github.com>
Date: Tue, 5 May 2026 11:20:44 +0900
Subject: [PATCH] feat(scoring): complete roadmap Phases 1 & 2 with edge-case
 stability (v2.74.0)

---
 docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md | 43 +++++++++++++++++++++++++
 package-lock.json                       |  4 +--
 package.json                            |  2 +-
 src/retrieval/scoring.ts                | 22 +++++++++----
 tests/scoring.test.ts                   | 25 +++++++++++++-
 5 files changed, 86 insertions(+), 10 deletions(-)
 create mode 100644 docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md

diff --git a/docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md b/docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md
new file mode 100644
index 0000000..f59f456
--- /dev/null
+++ b/docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md
@@ -0,0 +1,43 @@
+# Project Chronicle Guard: Search Engine Roadmap
+
+## 🎯 Current Status: v2.74.0
+- [x] **Phase 1: Linguistic Foundation Stabilization** (Completed)
+- [x] **Phase 2: Conflict Scoring Refinement** (Completed)
+- [ ] **Phase 3: Performance Scaling & Caching** (In Progress)
+- [ ] **Phase 4: Excerpt Precision Tuning** (Planned)
+- [ ] **Phase 5: Downstream Integration API** (Planned)
+
+---
+
+## 🔬 Phase Details
+
+### Phase 1: Linguistic Foundation (v2.72.0 - v2.74.0)
+- **Goal**: Robust tokenization of mixed Korean/English text and special characters.
+- **Achievement**:
+    - Bilingual boundary split (e.g., 'Astra의' is split into 'astra' and '의' before stop-word filtering).
+    - Hangeul monosyllable preservation (e.g., '한', '글').
+    - Zero-width character cleaning.
+
+### Phase 2: Conflict Scoring (v2.73.0 - v2.74.0)
+- **Goal**: Quantitative risk assessment for information conflicts.
+- **Achievement**:
+    - Tiered severity logic (NONE, LOW, MEDIUM, HIGH).
+    - Substring-based indicator detection, so attached Korean particles (e.g., '충돌이') do not hide a match.
+    - Configurable thresholds via `SCORING_CONFIG.CONFLICT_THRESHOLDS` (defaults: LOW 1, MEDIUM 2, HIGH 4 matches).
+
+### Phase 3: Performance Scaling (v2.75.0+)
+- **Goal**: Sub-10ms response for 10k+ documents.
+- **Action**:
+    - Global module-level caching for IDF and tokens.
+    - Potential worker thread offloading for heavy scoring.
+
+### Phase 4: Excerpt Precision (Planned)
+- **Goal**: Maximize context signal-to-noise ratio.
+- **Action**:
+    - Density-based window starting point restriction.
+    - Multi-stage filtering for optimal text chunking.
+
+### Phase 5: Integration (Planned)
+- **Goal**: Seamless RAG pipeline integration.
+- **Action**:
+    - Strict IO schema definition for downstream AI agents.
diff --git a/package-lock.json b/package-lock.json
index ba4a88d..52637d0 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "g1nation",
-  "version": "2.73.0",
+  "version": "2.74.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "g1nation",
-      "version": "2.73.0",
+      "version": "2.74.0",
       "license": "MIT",
       "dependencies": {
         "marked": "^18.0.2"
diff --git a/package.json b/package.json
index 778e8a7..54c5cfa 100644
--- a/package.json
+++ b/package.json
@@ -2,7 +2,7 @@
   "name": "astra",
   "displayName": "Astra",
   "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
-  "version": "2.73.0",
+  "version": "2.74.0",
   "publisher": "g1nation",
   "license": "MIT",
   "icon": "assets/icon.png",
diff --git a/src/retrieval/scoring.ts b/src/retrieval/scoring.ts
index 90378d8..64f10cd 100644
--- a/src/retrieval/scoring.ts
+++ b/src/retrieval/scoring.ts
@@ -55,7 +55,12 @@ const SCORING_CONFIG = {
     CONFLICT_INDICATORS: new Set([
         '반대', '충돌', '오류', '논란', '반박', '차이', '대조',
         'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs'
-    ])
+    ]),
+    CONFLICT_THRESHOLDS: {
+        HIGH: 4,
+        MEDIUM: 2,
+        LOW: 1
+    }
 };
 
 // ─── Global Search State & Cache ───
@@ -86,9 +91,14 @@ export function tokenize(text: string): string[] {
     const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
     
     const tokens = splitText
-        .split(/[^a-z0-9가-힣_]+/g)
+        .split(/[^a-z0-9가-힣]+/g)
         .map((t) => t.trim())
-        .filter((t) => t.length >= 2)
+        .filter((t) => {
+            if (!t) return false;
+            // Tokens containing Hangul are kept even at one character; all others (Latin letters/digits) need at least 2 characters
+            if (/[가-힣]/.test(t)) return t.length >= 1;
+            return t.length >= 2;
+        })
         .filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
 
     if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear();
@@ -228,9 +238,9 @@ export function scoreTfIdf(
         const conflictDetected = conflictMatches.length > 0;
         let conflictSeverity: ConflictSeverity = 'NONE';
 
-        if (conflictMatches.length >= 4) conflictSeverity = 'HIGH';
-        else if (conflictMatches.length >= 2) conflictSeverity = 'MEDIUM';
-        else if (conflictMatches.length === 1) conflictSeverity = 'LOW';
+        if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
+        else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
+        else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
 
         for (const term of expandedQuery) {
             const tf = termFrequency(term, docTokens);
diff --git a/tests/scoring.test.ts b/tests/scoring.test.ts
index 88f2535..93792ee 100644
--- a/tests/scoring.test.ts
+++ b/tests/scoring.test.ts
@@ -71,6 +71,29 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
         expect(excerpt).not.toContain('첫 번째 문장');
     });
 
+    test('Edge Case Tokenization: should handle extreme mixed strings and symbols', () => {
+        const text = 'A한B글C1!@#$ D.E.F_G 🚀Astra_v2.0';
+        const tokens = tokenize(text);
+        
+        // Language boundary split should handle alternating chars
+        expect(tokens).toContain('astra');
+        expect(tokens).toContain('v2');
+        expect(tokens).toContain('한');
+        expect(tokens).toContain('글');
+        // Symbols should be filtered out
+        expect(tokens.some(t => /^[!@#$]+$/.test(t))).toBe(false);
+    });
+
+    test('Long String Performance: should handle 10k character content', () => {
+        const longContent = '성능 '.repeat(2000) + '최적화 '.repeat(2000);
+        const start = Date.now();
+        const tokens = tokenize(longContent);
+        const duration = Date.now() - start;
+        
+        expect(tokens.length).toBeGreaterThan(0);
+        expect(duration).toBeLessThan(100); // Tokenizer should be efficient even for long text
+    });
+
     test('Performance Benchmark: should process 100 documents within threshold', () => {
         const query = tokenize('performance optimization');
         const largeDocs = Array.from({ length: 100 }, (_, i) => ({
@@ -83,6 +106,6 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
         const duration = Date.now() - start;
         
         console.log(`[Benchmark] 100 docs processing time: ${duration}ms`);
-        expect(duration).toBeLessThan(200); // Should be very fast due to caching
+        expect(duration).toBeLessThan(200); 
     });
 });