chore: version up to 2.80.34 and package

2026-05-12 22:54:21 +09:00
parent 148bfb070b
commit 065e598cca
26 changed files with 2023 additions and 139 deletions
@@ -160,6 +160,30 @@ function inverseDocumentFrequency(

 export type ConflictSeverity = 'NONE' | 'LOW' | 'MEDIUM' | 'HIGH';

+/**
+ * Counts how many entries of `SCORING_CONFIG.CONFLICT_INDICATORS` occur in `rawText`.
+ *
+ * Matching is a case-insensitive substring test: both the text and each indicator are
+ * lower-cased and compared with `String.prototype.includes`, so an indicator that is
+ * embedded in a longer word (e.g. followed by an attached particle or suffix) still matches.
+ * Each indicator entry contributes at most 1 to the result, no matter how many times it
+ * appears in the text.
+ *
+ * Exposed so the brain index can cache this per-file instead of re-scanning content every query.
+ * The resulting number is what `PreTokenizedDoc.conflictCount` carries and what
+ * `scoreTfIdfPreTokenized` compares against `SCORING_CONFIG.CONFLICT_THRESHOLDS`.
+ *
+ * @param rawText Text to scan; a falsy value (e.g. an empty string) is treated as `''`.
+ * @returns The number of indicator entries found in the text (0 when none match).
+ */
+export function countConflictIndicators(rawText: string): number {
+    const lower = (rawText || '').toLowerCase();
+    let n = 0;
+    for (const indicator of SCORING_CONFIG.CONFLICT_INDICATORS) {
+        if (lower.includes(indicator.toLowerCase())) n++;
+    }
+    return n;
+}
+
+/**
+ * A document whose tokens were already computed (e.g. from the persistent brain index).
+ *
+ * This is the input shape of `scoreTfIdfPreTokenized`; `scoreTfIdf` builds one of these per
+ * document by tokenizing the title and content on the fly.
+ *
+ * `lastModified` is optional and is copied unchanged from the source document by `scoreTfIdf`.
+ * NOTE(review): its unit (presumably an epoch timestamp) is not visible here — confirm
+ * against the caller before relying on it.
+ */
+export interface PreTokenizedDoc {
+    /**
+     * tokenize(`${title} ${content}`) — tokens of the combined title and content.
+     * The scorer uses this array as the document's token list for term-frequency lookups.
+     */
+    tokens: string[];
+    /** tokenize(title) — the scorer wraps these in a `Set` for its title handling. */
+    titleTokens: string[];
+    lastModified?: number;
+    /**
+     * result of countConflictIndicators(`${title} ${content}`); 0 if unknown.
+     * Any value greater than 0 marks the document as conflicting, and the value is compared
+     * against `SCORING_CONFIG.CONFLICT_THRESHOLDS` to pick the conflict severity.
+     */
+    conflictCount: number;
+}
+
 export interface ScoredDocument {
    index: number;
    score: number;
@@ -173,6 +197,8 @@ export interface ScoredDocument {

 /**
 * TF-IDF 기반으로 문서 집합을 스코어링합니다.
+ * 문서 내용을 받아 즉석에서 토크나이즈합니다 — 이미 토큰화된 집합이 있다면
+ * `scoreTfIdfPreTokenized` 를 직접 호출하면 토크나이즈를 건너뛸 수 있습니다.
 */
 export function scoreTfIdf(
    queryTokens: string[],
@@ -183,11 +209,28 @@ export function scoreTfIdf(
    }>
 ): ScoredDocument[] {
    if (documents.length === 0 || queryTokens.length === 0) return [];
+    return scoreTfIdfPreTokenized(queryTokens, documents.map((doc) => {
+        const combined = `${doc.title} ${doc.content}`;
+        return {
+            tokens: tokenize(combined),
+            titleTokens: tokenize(doc.title),
+            lastModified: doc.lastModified,
+            conflictCount: countConflictIndicators(combined),
+        };
+    }));
+}

-    // Pre-tokenize all documents
-    const docTokenArrays = documents.map((doc) =>
-        tokenize(`${doc.title} ${doc.content}`)
-    );
+/**
+ * TF-IDF 스코어링 — 이미 토큰화된 문서 집합 버전 (브레인 인덱스 등 캐시된 토큰을 그대로 사용).
+ * `scoreTfIdf` 와 동일한 알고리즘이며 출력 형태도 같습니다.
+ */
+export function scoreTfIdfPreTokenized(
+    queryTokens: string[],
+    documents: PreTokenizedDoc[]
+): ScoredDocument[] {
+    if (documents.length === 0 || queryTokens.length === 0) return [];
+
+    const docTokenArrays = documents.map((doc) => doc.tokens);
    const docTokenSets = docTokenArrays.map((tokens) => new Set(tokens));

    // Expand query with synonyms
@@ -205,22 +248,18 @@ export function scoreTfIdf(

    return documents.map((doc, index) => {
        const docTokens = docTokenArrays[index];
-        const titleTokens = new Set(tokenize(doc.title));
+        const titleTokens = new Set(doc.titleTokens);
        let score = 0;
        const matchedTerms: string[] = [];

-        // Conflict Detection & Severity Analysis (Substring based for better recall with particles)
-        const rawText = `${doc.title} ${doc.content}`.toLowerCase();
-        const conflictMatches = [...SCORING_CONFIG.CONFLICT_INDICATORS].filter(indicator => 
-            rawText.includes(indicator.toLowerCase())
-        );
-        
-        const conflictDetected = conflictMatches.length > 0;
+        // Conflict Detection & Severity Analysis (pre-counted by caller / index)
+        const conflictCount = doc.conflictCount || 0;
+        const conflictDetected = conflictCount > 0;
        let conflictSeverity: ConflictSeverity = 'NONE';

-        if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
-        else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
-        else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
+        if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
+        else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
+        else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';

        for (const term of expandedQuery) {
            const tf = termFrequency(term, docTokens);