feat(scoring): add comprehensive unit tests and refine bilingual tokenization (v2.72.0)

2026-05-05 11:10:31 +09:00
parent ca9fbf125a
commit 563e499324
4 changed files with 98 additions and 7 deletions
@@ -80,11 +80,13 @@ export function tokenize(text: string): string[] {
    const normalized = text
        .toLowerCase()
        .replace(/[\u200B-\u200D\uFEFF]/g, '')
-        .replace(/[^\w\s가-힣_.-]/g, ' ')
-        .trim();
+        .replace(/[^\w\s가-힣_.-]/g, ' ');

-    const tokens = normalized
-        .split(/[^a-z0-9가-힣_.-]+/g)
+    // [Refinement] 영문/숫자와 한글 경계에서 분리하도록 개선
+    const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
+    
+    const tokens = splitText
+        .split(/[^a-z0-9가-힣_]+/g)
        .map((t) => t.trim())
        .filter((t) => t.length >= 2)
        .filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
@@ -297,6 +299,9 @@ export function extractBestExcerpt(
    let bestLen = 0;

    for (let i = 0; i < scored.length; i++) {
+        // [Refinement] 정보 밀도가 낮은 문장은 윈도우의 시작점이 될 수 없음
+        if (scored[i].density < SCORING_CONFIG.DENSITY_THRESHOLD) continue;
+
        let windowText = '';
        let windowScore = 0;
        let j = i;