chore: version up to 2.80.34 and package

This commit is contained in:
g1nation
2026-05-12 22:54:21 +09:00
parent 148bfb070b
commit 065e598cca
26 changed files with 2023 additions and 139 deletions
+54 -15
View File
@@ -160,6 +160,30 @@ function inverseDocumentFrequency(
/**
 * Severity bucket derived from a document's conflict-indicator count.
 * The scorer starts at 'NONE' and raises it to 'LOW', 'MEDIUM' or 'HIGH' when the
 * count reaches the corresponding `SCORING_CONFIG.CONFLICT_THRESHOLDS` value.
 */
export type ConflictSeverity = 'NONE' | 'LOW' | 'MEDIUM' | 'HIGH';
/**
 * Tallies how many entries of `SCORING_CONFIG.CONFLICT_INDICATORS` occur in `rawText`.
 *
 * Matching is a case-insensitive substring test: the text and every indicator are
 * lowercased before comparison. Each configured indicator adds at most one to the
 * total, no matter how many times it appears in the text.
 *
 * Exported so the brain index can cache the count per file instead of re-scanning
 * the content on every query.
 *
 * @param rawText Text to scan; a falsy value is treated as the empty string.
 * @returns The number of configured indicators found in the text.
 */
export function countConflictIndicators(rawText: string): number {
  const haystack = (rawText || '').toLowerCase();
  return [...SCORING_CONFIG.CONFLICT_INDICATORS].filter((word) =>
    haystack.includes(word.toLowerCase())
  ).length;
}
/**
 * A document whose tokens were already computed (e.g. from the persistent brain index),
 * so `scoreTfIdfPreTokenized` can score it without tokenizing it again.
 */
export interface PreTokenizedDoc {
/** Tokens of the combined title and content, i.e. tokenize(`${title} ${content}`). */
tokens: string[];
/** Tokens of the title alone, i.e. tokenize(title). */
titleTokens: string[];
/** Optional; presumably the document's last-modified timestamp. NOTE(review): unit/epoch is not visible here — confirm against the caller. */
lastModified?: number;
/** Result of countConflictIndicators(`${title} ${content}`); use 0 when the count is unknown. */
conflictCount: number;
}
export interface ScoredDocument {
index: number;
score: number;
@@ -173,6 +197,8 @@ export interface ScoredDocument {
/**
* TF-IDF 기반으로 문서 집합을 스코어링합니다.
* 문서 내용을 받아 즉석에서 토크나이즈합니다 — 이미 토큰화된 집합이 있다면
* `scoreTfIdfPreTokenized` 를 직접 호출하면 토크나이즈를 건너뛸 수 있습니다.
*/
export function scoreTfIdf(
queryTokens: string[],
@@ -183,11 +209,28 @@ export function scoreTfIdf(
}>
): ScoredDocument[] {
if (documents.length === 0 || queryTokens.length === 0) return [];
return scoreTfIdfPreTokenized(queryTokens, documents.map((doc) => {
const combined = `${doc.title} ${doc.content}`;
return {
tokens: tokenize(combined),
titleTokens: tokenize(doc.title),
lastModified: doc.lastModified,
conflictCount: countConflictIndicators(combined),
};
}));
}
// Pre-tokenize all documents
const docTokenArrays = documents.map((doc) =>
tokenize(`${doc.title} ${doc.content}`)
);
/**
* TF-IDF 스코어링 — 이미 토큰화된 문서 집합 버전 (브레인 인덱스 등 캐시된 토큰을 그대로 사용).
* `scoreTfIdf` 와 동일한 알고리즘이며 출력 형태도 같습니다.
*/
export function scoreTfIdfPreTokenized(
queryTokens: string[],
documents: PreTokenizedDoc[]
): ScoredDocument[] {
if (documents.length === 0 || queryTokens.length === 0) return [];
const docTokenArrays = documents.map((doc) => doc.tokens);
const docTokenSets = docTokenArrays.map((tokens) => new Set(tokens));
// Expand query with synonyms
@@ -205,22 +248,18 @@ export function scoreTfIdf(
return documents.map((doc, index) => {
const docTokens = docTokenArrays[index];
const titleTokens = new Set(tokenize(doc.title));
const titleTokens = new Set(doc.titleTokens);
let score = 0;
const matchedTerms: string[] = [];
// Conflict Detection & Severity Analysis (Substring based for better recall with particles)
const rawText = `${doc.title} ${doc.content}`.toLowerCase();
const conflictMatches = [...SCORING_CONFIG.CONFLICT_INDICATORS].filter(indicator =>
rawText.includes(indicator.toLowerCase())
);
const conflictDetected = conflictMatches.length > 0;
// Conflict Detection & Severity Analysis (pre-counted by caller / index)
const conflictCount = doc.conflictCount || 0;
const conflictDetected = conflictCount > 0;
let conflictSeverity: ConflictSeverity = 'NONE';
if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
for (const term of expandedQuery) {
const tf = termFrequency(term, docTokens);