feat(scoring): completed roadmap Phase 1 & 2 with edge case stability v2.74.0

This commit is contained in:
g1nation
2026-05-05 11:20:44 +09:00
parent e6bc263872
commit 518a5ed317
5 changed files with 86 additions and 10 deletions
+43
View File
@@ -0,0 +1,43 @@
# Project Chronicle Guard: Search Engine Roadmap
## 🎯 Current Status: v2.74.0
- [x] **Phase 1: Linguistic Foundation Stabilization** (Completed)
- [x] **Phase 2: Conflict Scoring Refinement** (Completed)
- [ ] **Phase 3: Performance Scaling & Caching** (In Progress)
- [ ] **Phase 4: Excerpt Precision Tuning** (Planned)
- [ ] **Phase 5: Downstream Integration API** (Planned)
---
## 🔬 Phase Details
### Phase 1: Linguistic Foundation (v2.72.0 - v2.74.0)
- **Goal**: Perfect tokenization for mixed KO/EN/Special characters.
- **Achievement**:
- Bilingual boundary split (e.g., 'Astra의' -> 'Astra', '의').
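- Hangeul monosyllable preservation (e.g., '한', '글'): tokens containing Hangeul are kept even when only one character long, while English/numeric tokens still require at least two characters.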
- Zero-width character cleaning.
### Phase 2: Conflict Scoring (v2.73.0 - v2.74.0)
- **Goal**: Quantitative risk assessment for information conflicts.
- **Achievement**:
- Tiered severity logic (NONE, LOW, MEDIUM, HIGH).
- Substring-based detection, so that a conflict indicator is still matched when a Korean particle is attached to it (particle interference).
- Configurable thresholds via `SCORING_CONFIG.CONFLICT_THRESHOLDS` (currently HIGH: 4, MEDIUM: 2, LOW: 1 conflict matches).
### Phase 3: Performance Scaling (v2.75.0+)
- **Goal**: Sub-10ms response for 10k+ documents.
- **Action**:
- Global module-level caching for IDF and tokens.
- Potential worker thread offloading for heavy scoring.
### Phase 4: Excerpt Precision (Planned)
- **Goal**: Maximize context signal-to-noise ratio.
- **Action**:
- Density-based window starting point restriction.
- Multi-stage filtering for optimal text chunking.
### Phase 5: Integration (Planned)
- **Goal**: Seamless RAG pipeline integration.
- **Action**:
- Strict IO schema definition for downstream AI agents.
+2 -2
View File
@@ -1,12 +1,12 @@
{ {
"name": "g1nation", "name": "g1nation",
"version": "2.73.0", "version": "2.74.0",
"lockfileVersion": 3, "lockfileVersion": 3,
"requires": true, "requires": true,
"packages": { "packages": {
"": { "": {
"name": "g1nation", "name": "g1nation",
"version": "2.73.0", "version": "2.74.0",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"marked": "^18.0.2" "marked": "^18.0.2"
+1 -1
View File
@@ -2,7 +2,7 @@
"name": "astra", "name": "astra",
"displayName": "Astra", "displayName": "Astra",
"description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.", "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
"version": "2.73.0", "version": "2.74.0",
"publisher": "g1nation", "publisher": "g1nation",
"license": "MIT", "license": "MIT",
"icon": "assets/icon.png", "icon": "assets/icon.png",
+16 -6
View File
@@ -55,7 +55,12 @@ const SCORING_CONFIG = {
CONFLICT_INDICATORS: new Set([ CONFLICT_INDICATORS: new Set([
'반대', '충돌', '오류', '논란', '반박', '차이', '대조', '반대', '충돌', '오류', '논란', '반박', '차이', '대조',
'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs' 'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs'
]) ]),
CONFLICT_THRESHOLDS: {
HIGH: 4,
MEDIUM: 2,
LOW: 1
}
}; };
// ─── Global Search State & Cache ─── // ─── Global Search State & Cache ───
@@ -86,9 +91,14 @@ export function tokenize(text: string): string[] {
const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2'); const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
const tokens = splitText const tokens = splitText
.split(/[^a-z0-9가-힣_]+/g) .split(/[^a-z0-9가-힣]+/g)
.map((t) => t.trim()) .map((t) => t.trim())
.filter((t) => t.length >= 2) .filter((t) => {
if (!t) return false;
// 한글이 포함된 경우 한 글자라도 허용, 그 외(영문/숫자)는 2글자 이상
if (/[가-힣]/.test(t)) return t.length >= 1;
return t.length >= 2;
})
.filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t)); .filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear(); if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear();
@@ -228,9 +238,9 @@ export function scoreTfIdf(
const conflictDetected = conflictMatches.length > 0; const conflictDetected = conflictMatches.length > 0;
let conflictSeverity: ConflictSeverity = 'NONE'; let conflictSeverity: ConflictSeverity = 'NONE';
if (conflictMatches.length >= 4) conflictSeverity = 'HIGH'; if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
else if (conflictMatches.length >= 2) conflictSeverity = 'MEDIUM'; else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
else if (conflictMatches.length === 1) conflictSeverity = 'LOW'; else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
for (const term of expandedQuery) { for (const term of expandedQuery) {
const tf = termFrequency(term, docTokens); const tf = termFrequency(term, docTokens);
+24 -1
View File
@@ -71,6 +71,29 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
expect(excerpt).not.toContain('첫 번째 문장'); expect(excerpt).not.toContain('첫 번째 문장');
}); });
// Runs tokenize() on one string that interleaves Latin letters, single Hangul
// syllables, digits, punctuation, an emoji, and '.' / '_' separators, then checks
// which tokens are present in the result.
test('Edge Case Tokenization: should handle extreme mixed strings and symbols', () => {
const text = 'A한B글C1!@#$ D.E.F_G 🚀Astra_v2.0';
const tokens = tokenize(text);
// Language boundary split should handle alternating chars
// 'Astra_v2.0' is expected to yield the lowercase tokens 'astra' and 'v2',
// i.e. '_' and '.' both act as separators here.
expect(tokens).toContain('astra');
expect(tokens).toContain('v2');
// Single-syllable Hangul pieces from 'A한B글C1' must be present as their own tokens.
expect(tokens).toContain('한');
expect(tokens).toContain('글');
// Symbols should be filtered out
// (no token may consist solely of the characters ! @ # $)
expect(tokens.some(t => /^[!@#$]+$/.test(t))).toBe(false);
});
// Smoke-checks tokenize() on a long, highly repetitive Korean input and bounds
// the elapsed wall-clock time of a single call.
test('Long String Performance: should handle 10k character content', () => {
// 2000 x '성능 ' (3 chars) + 2000 x '최적화 ' (4 chars) = 14,000 characters,
// which is above the 10k mentioned in the test title.
const longContent = '성능 '.repeat(2000) + '최적화 '.repeat(2000);
const start = Date.now();
const tokens = tokenize(longContent);
const duration = Date.now() - start;
expect(tokens.length).toBeGreaterThan(0);
// NOTE(review): this is a wall-clock assertion measured with millisecond-resolution
// Date.now(); it may be sensitive to machine load on slow CI runners — confirm the
// 100 ms budget is intended as a hard limit.
expect(duration).toBeLessThan(100); // Tokenizer should be efficient even for long text
});
test('Performance Benchmark: should process 100 documents within threshold', () => { test('Performance Benchmark: should process 100 documents within threshold', () => {
const query = tokenize('performance optimization'); const query = tokenize('performance optimization');
const largeDocs = Array.from({ length: 100 }, (_, i) => ({ const largeDocs = Array.from({ length: 100 }, (_, i) => ({
@@ -83,6 +106,6 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
const duration = Date.now() - start; const duration = Date.now() - start;
console.log(`[Benchmark] 100 docs processing time: ${duration}ms`); console.log(`[Benchmark] 100 docs processing time: ${duration}ms`);
expect(duration).toBeLessThan(200); // Should be very fast due to caching expect(duration).toBeLessThan(200);
}); });
}); });