Update project files

This commit is contained in:
2026-05-22 15:00:14 +09:00
parent 132d130ff1
commit 8016ef18fa
29 changed files with 1353 additions and 804 deletions
+24 -6
View File
@@ -129,11 +129,24 @@ export function expandQuery(tokens: string[]): string[] {
/**
* TF (Term Frequency): 문서 내 용어 빈도
*
* Takes a precomputed term-count `Map` (built once per document by
* `buildTermCounts`) instead of re-scanning the token array per term — the
* value is numerically identical to `count / documentTokens.length`.
*/
function termFrequency(term: string, documentTokens: string[]): number {
if (documentTokens.length === 0) return 0;
const count = documentTokens.filter((t) => t === term).length;
return count / documentTokens.length;
function termFrequency(term: string, termCounts: Map<string, number>, totalTokens: number): number {
  // A document with no tokens has no term frequency; return early so we never divide by zero.
  if (totalTokens === 0) {
    return 0;
  }

  // A term missing from the map simply occurs zero times.
  const stored = termCounts.get(term);
  const occurrences = stored ? stored : 0;

  return occurrences / totalTokens;
}
/**
 * Tally how many times each token occurs in a single document's token array.
 *
 * The map is intended to be built once per document and then looked up for
 * each query term, rather than re-scanning the token array per term.
 *
 * @param documentTokens - The document's tokens, in order.
 * @returns A map from token to its occurrence count; keys appear in first-seen order.
 */
function buildTermCounts(documentTokens: string[]): Map<string, number> {
  const tally = new Map<string, number>();

  for (const token of documentTokens) {
    const seen = tally.get(token);
    // First sighting starts the count at 1; otherwise bump the running count.
    tally.set(token, seen ? seen + 1 : 1);
  }

  return tally;
}
/**
@@ -231,7 +244,11 @@ export function scoreTfIdfPreTokenized(
if (documents.length === 0 || queryTokens.length === 0) return [];
const docTokenArrays = documents.map((doc) => doc.tokens);
const docTokenSets = docTokenArrays.map((tokens) => new Set(tokens));
// Precompute, once per document: a term -> count map (used for TF) and the
// token Set derived from that map's keys (used for IDF). TF counts were
// previously recomputed by re-scanning the token array for every query term;
// counting once and reusing the map is numerically identical.
const docTermCounts = docTokenArrays.map((tokens) => buildTermCounts(tokens));
const docTokenSets = docTermCounts.map((counts) => new Set(counts.keys()));
// Expand query with synonyms
const expandedQuery = expandQuery(queryTokens);
@@ -248,6 +265,7 @@ export function scoreTfIdfPreTokenized(
return documents.map((doc, index) => {
const docTokens = docTokenArrays[index];
const termCounts = docTermCounts[index];
const titleTokens = new Set(doc.titleTokens);
let score = 0;
const matchedTerms: string[] = [];
@@ -262,7 +280,7 @@ export function scoreTfIdfPreTokenized(
else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';
for (const term of expandedQuery) {
const tf = termFrequency(term, docTokens);
const tf = termFrequency(term, termCounts, docTokens.length);
const idf = idfCache.get(term) || 1;
const tfidf = tf * idf;