feat(scoring): completed roadmap Phase 1 & 2 with edge case stability v2.74.0
This commit is contained in:
@@ -0,0 +1,43 @@
|
|||||||
|
# Project Chronicle Guard: Search Engine Roadmap
|
||||||
|
|
||||||
|
## 🎯 Current Status: v2.74.0
|
||||||
|
- [x] **Phase 1: Linguistic Foundation Stabilization** (Completed)
|
||||||
|
- [x] **Phase 2: Conflict Scoring Refinement** (Completed)
|
||||||
|
- [ ] **Phase 3: Performance Scaling & Caching** (In Progress)
|
||||||
|
- [ ] **Phase 4: Excerpt Precision Tuning** (Planned)
|
||||||
|
- [ ] **Phase 5: Downstream Integration API** (Planned)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 🔬 Phase Details
|
||||||
|
|
||||||
|
### Phase 1: Linguistic Foundation (v2.72.0 - v2.74.0)
|
||||||
|
- **Goal**: Accurate tokenization of text that mixes Korean, English, and special characters.
|
||||||
|
- **Achievement**:
|
||||||
|
- Bilingual boundary split (e.g., 'Astra의' -> 'Astra', '의').
|
||||||
|
- Hangeul monosyllable preservation (e.g., '한', '글').
|
||||||
|
- Zero-width character cleaning.
|
||||||
|
|
||||||
|
### Phase 2: Conflict Scoring (v2.73.0 - v2.74.0)
|
||||||
|
- **Goal**: Quantitative risk assessment for information conflicts.
|
||||||
|
- **Achievement**:
|
||||||
|
- Tiered severity logic (NONE, LOW, MEDIUM, HIGH).
|
||||||
|
- Substring-based detection to overcome interference from attached Korean particles (e.g., '충돌이' still matches the indicator '충돌').
|
||||||
|
- Configurable thresholds via `SCORING_CONFIG`.
|
||||||
|
|
||||||
|
### Phase 3: Performance Scaling (v2.75.0+)
|
||||||
|
- **Goal**: Sub-10ms response for 10k+ documents.
|
||||||
|
- **Action**:
|
||||||
|
- Global module-level caching for IDF and tokens.
|
||||||
|
- Potential worker thread offloading for heavy scoring.
|
||||||
|
|
||||||
|
### Phase 4: Excerpt Precision (Planned)
|
||||||
|
- **Goal**: Maximize context signal-to-noise ratio.
|
||||||
|
- **Action**:
|
||||||
|
- Density-based restriction of the excerpt window's starting point.
|
||||||
|
- Multi-stage filtering for optimal text chunking.
|
||||||
|
|
||||||
|
### Phase 5: Integration (Planned)
|
||||||
|
- **Goal**: Seamless RAG pipeline integration.
|
||||||
|
- **Action**:
|
||||||
|
- Strict IO schema definition for downstream AI agents.
|
||||||
Generated
+2
-2
@@ -1,12 +1,12 @@
|
|||||||
{
|
{
|
||||||
"name": "g1nation",
|
"name": "g1nation",
|
||||||
"version": "2.73.0",
|
"version": "2.74.0",
|
||||||
"lockfileVersion": 3,
|
"lockfileVersion": 3,
|
||||||
"requires": true,
|
"requires": true,
|
||||||
"packages": {
|
"packages": {
|
||||||
"": {
|
"": {
|
||||||
"name": "g1nation",
|
"name": "g1nation",
|
||||||
"version": "2.73.0",
|
"version": "2.74.0",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"marked": "^18.0.2"
|
"marked": "^18.0.2"
|
||||||
|
|||||||
+1
-1
@@ -2,7 +2,7 @@
|
|||||||
"name": "astra",
|
"name": "astra",
|
||||||
"displayName": "Astra",
|
"displayName": "Astra",
|
||||||
"description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
|
"description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
|
||||||
"version": "2.73.0",
|
"version": "2.74.0",
|
||||||
"publisher": "g1nation",
|
"publisher": "g1nation",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"icon": "assets/icon.png",
|
"icon": "assets/icon.png",
|
||||||
|
|||||||
@@ -55,7 +55,12 @@ const SCORING_CONFIG = {
|
|||||||
CONFLICT_INDICATORS: new Set([
|
CONFLICT_INDICATORS: new Set([
|
||||||
'반대', '충돌', '오류', '논란', '반박', '차이', '대조',
|
'반대', '충돌', '오류', '논란', '반박', '차이', '대조',
|
||||||
'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs'
|
'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs'
|
||||||
])
|
]),
|
||||||
|
CONFLICT_THRESHOLDS: {
|
||||||
|
HIGH: 4,
|
||||||
|
MEDIUM: 2,
|
||||||
|
LOW: 1
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// ─── Global Search State & Cache ───
|
// ─── Global Search State & Cache ───
|
||||||
@@ -86,9 +91,14 @@ export function tokenize(text: string): string[] {
|
|||||||
const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
|
const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
|
||||||
|
|
||||||
const tokens = splitText
|
const tokens = splitText
|
||||||
.split(/[^a-z0-9가-힣_]+/g)
|
.split(/[^a-z0-9가-힣]+/g)
|
||||||
.map((t) => t.trim())
|
.map((t) => t.trim())
|
||||||
.filter((t) => t.length >= 2)
|
.filter((t) => {
|
||||||
|
if (!t) return false;
|
||||||
|
// If the token contains Hangul, allow even a single character; otherwise (English letters/digits) require at least 2 characters
|
||||||
|
if (/[가-힣]/.test(t)) return t.length >= 1;
|
||||||
|
return t.length >= 2;
|
||||||
|
})
|
||||||
.filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
|
.filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
|
||||||
|
|
||||||
if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear();
|
if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear();
|
||||||
@@ -228,9 +238,9 @@ export function scoreTfIdf(
|
|||||||
const conflictDetected = conflictMatches.length > 0;
|
const conflictDetected = conflictMatches.length > 0;
|
||||||
let conflictSeverity: ConflictSeverity = 'NONE';
|
let conflictSeverity: ConflictSeverity = 'NONE';
|
||||||
|
|
||||||
if (conflictMatches.length >= 4) conflictSeverity = 'HIGH';
|
if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
|
||||||
else if (conflictMatches.length >= 2) conflictSeverity = 'MEDIUM';
|
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
|
||||||
else if (conflictMatches.length === 1) conflictSeverity = 'LOW';
|
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
|
||||||
|
|
||||||
for (const term of expandedQuery) {
|
for (const term of expandedQuery) {
|
||||||
const tf = termFrequency(term, docTokens);
|
const tf = termFrequency(term, docTokens);
|
||||||
|
|||||||
+24
-1
@@ -71,6 +71,29 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
|
|||||||
expect(excerpt).not.toContain('첫 번째 문장');
|
expect(excerpt).not.toContain('첫 번째 문장');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('Edge Case Tokenization: should handle extreme mixed strings and symbols', () => {
|
||||||
|
const text = 'A한B글C1!@#$ D.E.F_G 🚀Astra_v2.0';
|
||||||
|
const tokens = tokenize(text);
|
||||||
|
|
||||||
|
// Language boundary split should handle alternating chars
|
||||||
|
expect(tokens).toContain('astra');
|
||||||
|
expect(tokens).toContain('v2');
|
||||||
|
expect(tokens).toContain('한');
|
||||||
|
expect(tokens).toContain('글');
|
||||||
|
// Symbols should be filtered out
|
||||||
|
expect(tokens.some(t => /^[!@#$]+$/.test(t))).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Long String Performance: should handle 10k character content', () => {
|
||||||
|
const longContent = '성능 '.repeat(2000) + '최적화 '.repeat(2000);
|
||||||
|
const start = Date.now();
|
||||||
|
const tokens = tokenize(longContent);
|
||||||
|
const duration = Date.now() - start;
|
||||||
|
|
||||||
|
expect(tokens.length).toBeGreaterThan(0);
|
||||||
|
expect(duration).toBeLessThan(100); // Tokenizer should be efficient even for long text
|
||||||
|
});
|
||||||
|
|
||||||
test('Performance Benchmark: should process 100 documents within threshold', () => {
|
test('Performance Benchmark: should process 100 documents within threshold', () => {
|
||||||
const query = tokenize('performance optimization');
|
const query = tokenize('performance optimization');
|
||||||
const largeDocs = Array.from({ length: 100 }, (_, i) => ({
|
const largeDocs = Array.from({ length: 100 }, (_, i) => ({
|
||||||
@@ -83,6 +106,6 @@ describe('Scoring Engine Unit Tests (v2.72.0)', () => {
|
|||||||
const duration = Date.now() - start;
|
const duration = Date.now() - start;
|
||||||
|
|
||||||
console.log(`[Benchmark] 100 docs processing time: ${duration}ms`);
|
console.log(`[Benchmark] 100 docs processing time: ${duration}ms`);
|
||||||
expect(duration).toBeLessThan(200); // Should be very fast due to caching
|
expect(duration).toBeLessThan(200);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user