feat(scoring): add comprehensive unit tests and refine bilingual tokenization (v2.72.0)

This commit is contained in:
g1nation
2026-05-05 11:10:31 +09:00
parent ca9fbf125a
commit 563e499324
4 changed files with 98 additions and 7 deletions
+9 -4
View File
@@ -80,11 +80,13 @@ export function tokenize(text: string): string[] {
const normalized = text
.toLowerCase()
.replace(/[\u200B-\u200D\uFEFF]/g, '')
.replace(/[^\w\s가-힣_.-]/g, ' ')
.trim();
.replace(/[^\w\s가-힣_.-]/g, ' ');
const tokens = normalized
.split(/[^a-z0-9가-힣_.-]+/g)
// [Refinement] 영문/숫자와 한글 경계에서 분리하도록 개선
const splitText = normalized.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2').replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');
const tokens = splitText
.split(/[^a-z0-9가-힣_]+/g)
.map((t) => t.trim())
.filter((t) => t.length >= 2)
.filter((t) => !SCORING_CONFIG.STOP_WORDS_EN.has(t) && !SCORING_CONFIG.STOP_WORDS_KO.has(t));
@@ -297,6 +299,9 @@ export function extractBestExcerpt(
let bestLen = 0;
for (let i = 0; i < scored.length; i++) {
// [Refinement] 정보 밀도가 낮은 문장은 윈도우의 시작점이 될 수 없음
if (scored[i].density < SCORING_CONFIG.DENSITY_THRESHOLD) continue;
let windowText = '';
let windowScore = 0;
let j = i;