chore: version up to 2.80.34 and package
This commit is contained in:
+54
-15
@@ -160,6 +160,30 @@ function inverseDocumentFrequency(
|
||||
|
||||
/**
 * Severity level assigned to a document during conflict detection.
 * Scoring starts at 'NONE' and raises the level to 'LOW', 'MEDIUM' or 'HIGH'
 * when the document's conflict-indicator count reaches the corresponding
 * threshold in `SCORING_CONFIG.CONFLICT_THRESHOLDS`.
 */
export type ConflictSeverity =
  | 'NONE'
  | 'LOW'
  | 'MEDIUM'
  | 'HIGH';
|
||||
|
||||
/**
 * Returns how many entries of `SCORING_CONFIG.CONFLICT_INDICATORS` occur somewhere
 * inside `rawText`. Matching is by substring and ignores letter case; an indicator
 * adds at most 1 to the result no matter how many times it appears.
 *
 * Exported so the brain index can compute this once per file and cache it instead of
 * re-scanning the content on every query.
 *
 * @param rawText - Text to scan; a falsy value is treated as the empty string.
 * @returns The number of indicators found in the text.
 */
export function countConflictIndicators(rawText: string): number {
  const haystack = (rawText || '').toLowerCase();
  const hits = [...SCORING_CONFIG.CONFLICT_INDICATORS].filter((word) =>
    haystack.includes(word.toLowerCase())
  );
  return hits.length;
}
|
||||
|
||||
/**
 * Document representation whose token lists were computed ahead of time
 * (for example, loaded from the persistent brain index).
 */
export interface PreTokenizedDoc {
  /** Tokens of title and content together, i.e. tokenize(`${title} ${content}`). */
  tokens: string[];

  /** Tokens of the title alone, i.e. tokenize(title). */
  titleTokens: string[];

  /** Optional last-modified value carried over from the source document. */
  lastModified?: number;

  /** Value of countConflictIndicators(`${title} ${content}`); use 0 when it is not known. */
  conflictCount: number;
}
|
||||
|
||||
export interface ScoredDocument {
|
||||
index: number;
|
||||
score: number;
|
||||
@@ -173,6 +197,8 @@ export interface ScoredDocument {
|
||||
|
||||
/**
|
||||
* TF-IDF 기반으로 문서 집합을 스코어링합니다.
|
||||
* 문서 내용을 받아 즉석에서 토크나이즈합니다 — 이미 토큰화된 집합이 있다면
|
||||
* `scoreTfIdfPreTokenized` 를 직접 호출하면 토크나이즈를 건너뛸 수 있습니다.
|
||||
*/
|
||||
export function scoreTfIdf(
|
||||
queryTokens: string[],
|
||||
@@ -183,11 +209,28 @@ export function scoreTfIdf(
|
||||
}>
|
||||
): ScoredDocument[] {
|
||||
if (documents.length === 0 || queryTokens.length === 0) return [];
|
||||
return scoreTfIdfPreTokenized(queryTokens, documents.map((doc) => {
|
||||
const combined = `${doc.title} ${doc.content}`;
|
||||
return {
|
||||
tokens: tokenize(combined),
|
||||
titleTokens: tokenize(doc.title),
|
||||
lastModified: doc.lastModified,
|
||||
conflictCount: countConflictIndicators(combined),
|
||||
};
|
||||
}));
|
||||
}
|
||||
|
||||
// Pre-tokenize all documents
|
||||
const docTokenArrays = documents.map((doc) =>
|
||||
tokenize(`${doc.title} ${doc.content}`)
|
||||
);
|
||||
/**
|
||||
* TF-IDF 스코어링 — 이미 토큰화된 문서 집합 버전 (브레인 인덱스 등 캐시된 토큰을 그대로 사용).
|
||||
* `scoreTfIdf` 와 동일한 알고리즘이며 출력 형태도 같습니다.
|
||||
*/
|
||||
export function scoreTfIdfPreTokenized(
|
||||
queryTokens: string[],
|
||||
documents: PreTokenizedDoc[]
|
||||
): ScoredDocument[] {
|
||||
if (documents.length === 0 || queryTokens.length === 0) return [];
|
||||
|
||||
const docTokenArrays = documents.map((doc) => doc.tokens);
|
||||
const docTokenSets = docTokenArrays.map((tokens) => new Set(tokens));
|
||||
|
||||
// Expand query with synonyms
|
||||
@@ -205,22 +248,18 @@ export function scoreTfIdf(
|
||||
|
||||
return documents.map((doc, index) => {
|
||||
const docTokens = docTokenArrays[index];
|
||||
const titleTokens = new Set(tokenize(doc.title));
|
||||
const titleTokens = new Set(doc.titleTokens);
|
||||
let score = 0;
|
||||
const matchedTerms: string[] = [];
|
||||
|
||||
// Conflict Detection & Severity Analysis (Substring based for better recall with particles)
|
||||
const rawText = `${doc.title} ${doc.content}`.toLowerCase();
|
||||
const conflictMatches = [...SCORING_CONFIG.CONFLICT_INDICATORS].filter(indicator =>
|
||||
rawText.includes(indicator.toLowerCase())
|
||||
);
|
||||
|
||||
const conflictDetected = conflictMatches.length > 0;
|
||||
// Conflict Detection & Severity Analysis (pre-counted by caller / index)
|
||||
const conflictCount = doc.conflictCount || 0;
|
||||
const conflictDetected = conflictCount > 0;
|
||||
let conflictSeverity: ConflictSeverity = 'NONE';
|
||||
|
||||
if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
|
||||
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
|
||||
else if (conflictMatches.length >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
|
||||
if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.HIGH) conflictSeverity = 'HIGH';
|
||||
else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.MEDIUM) conflictSeverity = 'MEDIUM';
|
||||
else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
|
||||
|
||||
for (const term of expandedQuery) {
|
||||
const tf = termFrequency(term, docTokens);
|
||||
|
||||
Reference in New Issue
Block a user