Update project files

2026-05-22 15:00:14 +09:00
parent 132d130ff1
commit 8016ef18fa
29 changed files with 1353 additions and 804 deletions
@@ -129,11 +129,24 @@ export function expandQuery(tokens: string[]): string[] {

 /**
 * TF (Term Frequency): 문서 내 용어 빈도
+ *
+ * Takes a precomputed term-count `Map` (built once per document by
+ * `buildTermCounts`) and `totalTokens`, the length of that document's token array,
+ * instead of re-scanning the array per term — same value as the old `count / documentTokens.length`.
 */
-function termFrequency(term: string, documentTokens: string[]): number {
-    if (documentTokens.length === 0) return 0;
-    const count = documentTokens.filter((t) => t === term).length;
-    return count / documentTokens.length;
+function termFrequency(term: string, termCounts: Map<string, number>, totalTokens: number): number {
+    if (totalTokens === 0) return 0;
+    const count = termCounts.get(term) || 0;
+    return count / totalTokens;
+}
+
+/** Build a term -> occurrence-count map for one document's token array (computed once, reused per query term). */
+function buildTermCounts(documentTokens: string[]): Map<string, number> {
+    const counts = new Map<string, number>();
+    for (const t of documentTokens) {
+        counts.set(t, (counts.get(t) || 0) + 1);
+    }
+    return counts;
 }

 /**
@@ -231,7 +244,11 @@ export function scoreTfIdfPreTokenized(
    if (documents.length === 0 || queryTokens.length === 0) return [];

    const docTokenArrays = documents.map((doc) => doc.tokens);
-    const docTokenSets = docTokenArrays.map((tokens) => new Set(tokens));
+    // Precompute, once per document: a term -> count map (used for TF) and the
+    // token Set derived from its keys (used for IDF). Only the TF count was previously
+    // recomputed per query term (by re-scanning the token array); results are numerically identical.
+    const docTermCounts = docTokenArrays.map((tokens) => buildTermCounts(tokens));
+    const docTokenSets = docTermCounts.map((counts) => new Set(counts.keys()));

    // Expand query with synonyms
    const expandedQuery = expandQuery(queryTokens);
@@ -248,6 +265,7 @@ export function scoreTfIdfPreTokenized(

    return documents.map((doc, index) => {
        const docTokens = docTokenArrays[index];
+        const termCounts = docTermCounts[index];
        const titleTokens = new Set(doc.titleTokens);
        let score = 0;
        const matchedTerms: string[] = [];
@@ -262,7 +280,7 @@ export function scoreTfIdfPreTokenized(
        else if (conflictCount >= SCORING_CONFIG.CONFLICT_THRESHOLDS.LOW) conflictSeverity = 'LOW';

        for (const term of expandedQuery) {
-            const tf = termFrequency(term, docTokens);
+            const tf = termFrequency(term, termCounts, docTokens.length);
            const idf = idfCache.get(term) || 1;
            const tfidf = tf * idf;