From 518a5ed31744c3f2d419cec3f8ff80230e1ea18c Mon Sep 17 00:00:00 2001 From: g1nation Date: Tue, 5 May 2026 11:20:44 +0900 Subject: [PATCH] feat(scoring): completed roadmap Phase 1 & 2 with edge case stability v2.74.0 --- docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md | 43 +++++++++++++++++++++++++ package-lock.json | 4 +-- package.json | 2 +- src/retrieval/scoring.ts | 22 +++++++++---- tests/scoring.test.ts | 25 +++++++++++++- 5 files changed, 86 insertions(+), 10 deletions(-) create mode 100644 docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md diff --git a/docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md b/docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md new file mode 100644 index 0000000..f59f456 --- /dev/null +++ b/docs/PROJECT_CHRONICLE_GUARD_ROADMAP.md @@ -0,0 +1,43 @@ +# Project Chronicle Guard: Search Engine Roadmap + +## ๐ŸŽฏ Current Status: v2.74.0 +- [x] **Phase 1: Linguistic Foundation Stabilization** (Completed) +- [x] **Phase 2: Conflict Scoring Refinement** (Completed) +- [ ] **Phase 3: Performance Scaling & Caching** (In Progress) +- [ ] **Phase 4: Excerpt Precision Tuning** (Planned) +- [ ] **Phase 5: Downstream Integration API** (Planned) + +--- + +## ๐Ÿ”ฌ Phase Details + +### Phase 1: Linguistic Foundation (v2.72.0 - v2.74.0) +- **Goal**: Perfect tokenization for mixed KO/EN/Special characters. +- **Achievement**: + - Bilingual boundary split (e.g., 'Astra์˜' -> 'Astra', '์˜'). + - Hangeul monosyllable preservation (e.g., 'ํ•œ', '๊ธ€'). + - Zero-width character cleaning. + +### Phase 2: Conflict Scoring (v2.73.0 - v2.74.0) +- **Goal**: Quantitative risk assessment for information conflicts. +- **Achievement**: + - Tiered severity logic (NONE, LOW, MEDIUM, HIGH). + - Substring-based detection to overcome particle interference. + - Configurable thresholds via `SCORING_CONFIG`. + +### Phase 3: Performance Scaling (v2.75.0+) +- **Goal**: Sub-10ms response for 10k+ documents. +- **Action**: + - Global module-level caching for IDF and tokens. + - Potential worker thread offloading for heavy scoring. + +### Phase 4: Excerpt Precision (Planned) +- **Goal**: Maximize context signal-to-noise ratio. +- **Action**: + - Density-based window starting point restriction. + - Multi-stage filtering for optimal text chunking. + +### Phase 5: Integration (Planned) +- **Goal**: Seamless RAG pipeline integration. +- **Action**: + - Strict IO schema definition for downstream AI agents. diff --git a/package-lock.json b/package-lock.json index ba4a88d..52637d0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "g1nation", - "version": "2.73.0", + "version": "2.74.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "g1nation", - "version": "2.73.0", + "version": "2.74.0", "license": "MIT", "dependencies": { "marked": "^18.0.2" diff --git a/package.json b/package.json index 778e8a7..54c5cfa 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "astra", "displayName": "Astra", "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.", - "version": "2.73.0", + "version": "2.74.0", "publisher": "g1nation", "license": "MIT", "icon": "assets/icon.png", diff --git a/src/retrieval/scoring.ts b/src/retrieval/scoring.ts index 90378d8..64f10cd 100644 --- a/src/retrieval/scoring.ts +++ b/src/retrieval/scoring.ts @@ -55,7 +55,12 @@ const SCORING_CONFIG = { CONFLICT_INDICATORS: new Set([ '๋ฐ˜๋Œ€', '์ถฉ๋Œ', '์˜ค๋ฅ˜', '๋…ผ๋ž€', '๋ฐ˜๋ฐ•', '์ฐจ์ด', '๋Œ€์กฐ', 'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs' - ]) + ]), + CONFLICT_THRESHOLDS: { + HIGH: 4, + MEDIUM: 2, + LOW: 1 + } }; // โ”€โ”€โ”€ Global Search State & Cache โ”€โ”€โ”€ @@ -86,9 +91,14 @@ export function tokenize(text: string): string[] { const splitText = normalized.replace(/([a-z0-9]+)([๊ฐ€-ํžฃ]+)/gi, '$1 $2').replace(/([๊ฐ€-ํžฃ]+)([a-z0-9]+)/gi, '$1 $2'); const tokens = splitText - .split(/[^a-z0-9๊ฐ€-ํžฃ_]+/g) + .split(/[^a-z0-9๊ฐ€-ํžฃ]+/g) .map((t) => t.trim()) - .filter((t) => t.length >= 2) + .filter((t) => { + if (!t) return false; + // ํ•œ๊ธ€์ด ํฌํ•จ๋œ ๊ฒฝ์šฐ ํ•œ ๊ธ€์ž๋ผ๋„ ํ—ˆ์šฉ, ๊ทธ ์™ธ(์˜๋ฌธ/์ˆซ์ž)๋Š” 2๊ธ€์ž ์ด์ƒ + if (/[๊ฐ€-ํžฃ]/.test(t)) return t.length >= 1; + return t.length >= 2; + }) .filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t)); if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear(); @@ -228,9 +238,9 @@ export function scoreTfIdf( const conflictDetected = conflictMatches.length > 0; let conflictSeverity: ConflictSeverity = 'NONE'; - if (conflictMatches.length >= 4) conflictSeverity = 'HIGH'; - else if (conflictMatches.length >= 2) conflictSeverity = 'MEDIUM'; - else if (conflictMatches.length === 1) conflictSeverity = 'LOW'; + if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH'; + else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM'; + else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW'; for (const term of expandedQuery) { const tf = termFrequency(term, docTokens); diff --git a/tests/scoring.test.ts b/tests/scoring.test.ts index 88f2535..93792ee 100644 --- a/tests/scoring.test.ts +++ b/tests/scoring.test.ts @@ -71,6 +71,29 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => { expect(excerpt).not.toContain('์ฒซ ๋ฒˆ์งธ ๋ฌธ์žฅ'); }); + test('Edge Case Tokenization: should handle extreme mixed strings and symbols', () => { + const text = 'Aํ•œB๊ธ€C1!@#$ D.E.F_G ๐Ÿš€Astra_v2.0'; + const tokens = tokenize(text); + + // Language boundary split should handle alternating chars + expect(tokens).toContain('astra'); + expect(tokens).toContain('v2'); + expect(tokens).toContain('ํ•œ'); + expect(tokens).toContain('๊ธ€'); + // Symbols should be filtered out + expect(tokens.some(t => /^[!@#$]+$/.test(t))).toBe(false); + }); + + test('Long String Performance: should handle 10k character content', () => { + const longContent = '์„ฑ๋Šฅ '.repeat(2000) + '์ตœ์ ํ™” '.repeat(2000); + const start = Date.now(); + const tokens = tokenize(longContent); + const duration = Date.now() - start; + + expect(tokens.length).toBeGreaterThan(0); + expect(duration).toBeLessThan(100); // Tokenizer should be efficient even for long text + }); + test('Performance Benchmark: should process 100 documents within threshold', () => { const query = tokenize('performance optimization'); const largeDocs = Array.from({ length: 100 }, (_, i) => ({ @@ -83,6 +106,6 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => { const duration = Date.now() - start; console.log(`[Benchmark] 100 docs processing time: ${duration}ms`); - expect(duration).toBeLessThan(200); // Should be very fast due to caching + expect(duration).toBeLessThan(200); }); });