feat(scoring): completed roadmap Phase 1 & 2 with edge case stability v2.74.0
This commit is contained in:
@@ -0,0 +1,43 @@
|
||||
# Project Chronicle Guard: Search Engine Roadmap
|
||||
|
||||
## 🎯 Current Status: v2.74.0
|
||||
- [x] **Phase 1: Linguistic Foundation Stabilization** (Completed)
|
||||
- [x] **Phase 2: Conflict Scoring Refinement** (Completed)
|
||||
- [ ] **Phase 3: Performance Scaling & Caching** (In Progress)
|
||||
- [ ] **Phase 4: Excerpt Precision Tuning** (Planned)
|
||||
- [ ] **Phase 5: Downstream Integration API** (Planned)
|
||||
|
||||
---
|
||||
|
||||
## 🔬 Phase Details
|
||||
|
||||
### Phase 1: Linguistic Foundation (v2.72.0 - v2.74.0)
|
||||
- **Goal**: Accurate tokenization of text that mixes Korean, English, and special characters.
|
||||
- **Achievement**:
|
||||
- Bilingual boundary split (e.g., 'Astra의' -> 'Astra', '의').
|
||||
- Single-syllable Hangul tokens are preserved (e.g., '한', '글').
|
||||
- Zero-width character cleaning.
|
||||
|
||||
### Phase 2: Conflict Scoring (v2.73.0 - v2.74.0)
|
||||
- **Goal**: Quantitative risk assessment for information conflicts.
|
||||
- **Achievement**:
|
||||
- Tiered severity logic (NONE, LOW, MEDIUM, HIGH).
|
||||
- Substring-based detection, so that Korean grammatical particles attached to a conflict keyword do not prevent a match.
|
||||
- Configurable thresholds via `SCORING_CONFIG`.
|
||||
|
||||
### Phase 3: Performance Scaling (v2.75.0+)
|
||||
- **Goal**: Sub-10ms response for 10k+ documents.
|
||||
- **Action**:
|
||||
- Global module-level caching for IDF and tokens.
|
||||
- Potential worker thread offloading for heavy scoring.
|
||||
|
||||
### Phase 4: Excerpt Precision (Planned)
|
||||
- **Goal**: Maximize context signal-to-noise ratio.
|
||||
- **Action**:
|
||||
- Restrict where an excerpt window may start, based on match density.
|
||||
- Multi-stage filtering for optimal text chunking.
|
||||
|
||||
### Phase 5: Integration (Planned)
|
||||
- **Goal**: Seamless RAG pipeline integration.
|
||||
- **Action**:
|
||||
- Strict I/O schema definition for downstream AI agents.
|
||||
Generated
+2
-2
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "g1nation",
|
||||
"version": "2.73.0",
|
||||
"version": "2.74.0",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "g1nation",
|
||||
"version": "2.73.0",
|
||||
"version": "2.74.0",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"marked": "^18.0.2"
|
||||
|
||||
+1
-1
@@ -2,7 +2,7 @@
|
||||
"name": "astra",
|
||||
"displayName": "Astra",
|
||||
"description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
|
||||
"version": "2.73.0",
|
||||
"version": "2.74.0",
|
||||
"publisher": "g1nation",
|
||||
"license": "MIT",
|
||||
"icon": "assets/icon.png",
|
||||
|
||||
@@ -55,7 +55,12 @@ const SCORING_CONFIG = {
|
||||
CONFLICT_INDICATORS: new Set([
|
||||
'반대', '충돌', '오류', '논란', '반박', '차이', '대조',
|
||||
'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs'
|
||||
])
|
||||
]),
|
||||
CONFLICT_THRESHOLDS: {
|
||||
HIGH: 4,
|
||||
MEDIUM: 2,
|
||||
LOW: 1
|
||||
}
|
||||
};
|
||||
|
||||
// ─── Global Search State & Cache ───
|
||||
@@ -86,9 +91,14 @@ export function tokenize(text: string): string[] {
|
||||
const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
|
||||
|
||||
const tokens = splitText
|
||||
.split(/[^a-z0-9가-힣_]+/g)
|
||||
.split(/[^a-z0-9가-힣]+/g)
|
||||
.map((t) => t.trim())
|
||||
.filter((t) => t.length >= 2)
|
||||
.filter((t) => {
|
||||
if (!t) return false;
|
||||
// Tokens containing Hangul are kept even at one character; all others (Latin letters/digits) need at least 2 characters
|
||||
if (/[가-힣]/.test(t)) return t.length >= 1;
|
||||
return t.length >= 2;
|
||||
})
|
||||
.filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
|
||||
|
||||
if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear();
|
||||
@@ -228,9 +238,9 @@ export function scoreTfIdf(
|
||||
const conflictDetected = conflictMatches.length > 0;
|
||||
let conflictSeverity: ConflictSeverity = 'NONE';
|
||||
|
||||
if (conflictMatches.length >= 4) conflictSeverity = 'HIGH';
|
||||
else if (conflictMatches.length >= 2) conflictSeverity = 'MEDIUM';
|
||||
else if (conflictMatches.length === 1) conflictSeverity = 'LOW';
|
||||
if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
|
||||
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
|
||||
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
|
||||
|
||||
for (const term of expandedQuery) {
|
||||
const tf = termFrequency(term, docTokens);
|
||||
|
||||
+24
-1
@@ -71,6 +71,29 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
|
||||
expect(excerpt).not.toContain('첫 번째 문장');
|
||||
});
|
||||
|
||||
test('Edge Case Tokenization: should handle extreme mixed strings and symbols', () => {
|
||||
const text = 'A한B글C1!@#$ D.E.F_G 🚀Astra_v2.0';
|
||||
const tokens = tokenize(text);
|
||||
|
||||
// Language boundary split should handle alternating chars
|
||||
expect(tokens).toContain('astra');
|
||||
expect(tokens).toContain('v2');
|
||||
expect(tokens).toContain('한');
|
||||
expect(tokens).toContain('글');
|
||||
// Symbols should be filtered out
|
||||
expect(tokens.some(t => /^[!@#$]+$/.test(t))).toBe(false);
|
||||
});
|
||||
|
||||
test('Long String Performance: should handle 10k character content', () => {
|
||||
const longContent = '성능 '.repeat(2000) + '최적화 '.repeat(2000);
|
||||
const start = Date.now();
|
||||
const tokens = tokenize(longContent);
|
||||
const duration = Date.now() - start;
|
||||
|
||||
expect(tokens.length).toBeGreaterThan(0);
|
||||
expect(duration).toBeLessThan(100); // Tokenizer should be efficient even for long text
|
||||
});
|
||||
|
||||
test('Performance Benchmark: should process 100 documents within threshold', () => {
|
||||
const query = tokenize('performance optimization');
|
||||
const largeDocs = Array.from({ length: 100 }, (_, i) => ({
|
||||
@@ -83,6 +106,6 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
|
||||
const duration = Date.now() - start;
|
||||
|
||||
console.log(`[Benchmark] 100 docs processing time: ${duration}ms`);
|
||||
expect(duration).toBeLessThan(200); // Should be very fast due to caching
|
||||
expect(duration).toBeLessThan(200);
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user