/**
 * ============================================================
 * Scoring Engine — TF-IDF + Bilingual Tokenizer
 *
 * Goes beyond naive `includes()` keyword matching by providing
 * TF-IDF weighted document scoring.
 * Includes a bilingual (Korean/English) tokenizer.
 * ============================================================
 */

// ─── Bilingual Tokenizer ───

// ─── Scoring Engine Configuration ───

/**
 * Central, read-only configuration for the scoring engine: stop-word lists,
 * the synonym table, excerpt/title tuning knobs, the tokenizer cache limit,
 * and conflict-detection vocabulary and thresholds.
 */
const SCORING_CONFIG = {
  /** English stop words removed by the tokenizer (all lowercase). */
  STOP_WORDS_EN: new Set([
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
    'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
    'it', 'its', 'not', 'no', 'what', 'how', 'when', 'where', 'which',
    'who', 'whom', 'why', 'if', 'then', 'than', 'so', 'as', 'just',
    'about', 'also', 'more', 'some', 'very', 'all', 'each', 'every',
    'such', 'please', 'write', 'use', 'using', 'used'
  ]),
  /** Korean stop words removed by the tokenizer. */
  STOP_WORDS_KO: new Set([
    '그리고', '그런데', '그래서', '하지만', '또한', '또는', '해서', '하는',
    '있어', '없어', '아래', '위에', '어떻게', '이것', '저것', '그것',
    '이런', '저런', '그런', '여기', '거기', '필요', '사용', '관련',
    '대한', '대해', '통해', '따라', '위해', '대로', '만큼'
  ]),
  /**
   * Synonym / related-term table as `[term, relatedTerms]` pairs.
   * Entries are directional: a term expands only to the list stored under
   * its own key, so both directions are listed explicitly.
   */
  SYNONYM_DATA: [
    ['성능', ['performance', 'optimization', '최적화', 'speed']],
    ['performance', ['성능', '최적화', 'optimization', 'speed']],
    ['아키텍처', ['architecture', '구조', 'structure', 'design']],
    ['architecture', ['아키텍처', '구조', 'structure', 'design']],
    ['메모리', ['memory', '기억', 'cache', 'storage']],
    ['memory', ['메모리', '기억', 'cache', 'storage']],
    ['버그', ['bug', 'error', '오류', 'issue', 'defect']],
    ['bug', ['버그', 'error', '오류', 'issue']],
    ['설계', ['design', '아키텍처', 'architecture', 'pattern']],
    ['design', ['설계', '아키텍처', 'architecture', 'pattern']],
    ['배포', ['deploy', 'deployment', 'release', 'ci', 'cd']],
    ['deploy', ['배포', 'deployment', 'release']],
    ['테스트', ['test', 'testing', 'spec', 'jest', 'mocha']],
    ['test', ['테스트', 'testing', 'spec']],
    ['프로젝트', ['project', '프로그램', 'repo', 'repository']],
    ['project', ['프로젝트', '프로그램', 'repo']],
    ['방향', ['direction', '전략', 'strategy', '목표', 'goal']],
    ['direction', ['방향', '전략', 'strategy', '목표']]
  ] as [string, string[]][],
  /** Minimum keyword density a sentence needs during excerpt extraction. */
  DENSITY_THRESHOLD: 0.15,
  /** Weight applied to a term's TF-IDF when the term also appears in the title. */
  TITLE_MULTIPLIER: 3.0,
  /** Maximum number of entries kept in the tokenizer cache before it is reset. */
  GLOBAL_CACHE_LIMIT: 2000,
  /** Words whose presence in a document is counted as a sign of conflicting content. */
  CONFLICT_INDICATORS: new Set([
    '반대', '충돌', '오류', '논란', '반박', '차이', '대조',
    'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs'
  ]),
  /** Minimum distinct-indicator counts for each conflict severity level. */
  CONFLICT_THRESHOLDS: {
    HIGH: 4,
    MEDIUM: 2,
    LOW: 1
  }
};

// ─── Global Search State & Cache ───

/** Module-level memo of tokenizer results, keyed by the exact input text. */
const TOKEN_CACHE = new Map<string, string[]>();

/**
 * Explicitly empties the tokenizer cache.
 * Intended for use when the document set has changed substantially.
 */
export function clearScoringCache(): void {
  TOKEN_CACHE.clear();
}

/**
 * Normalizes mixed Korean/English text and splits it into search tokens.
 *
 * Steps: lowercase, drop zero-width characters, blank out everything except
 * word characters, Hangul syllables and `+ # . -`, insert a break wherever
 * Latin/digit runs touch Hangul runs, split, strip trailing dots, then drop
 * symbol-only tokens, single-character non-Hangul tokens and stop words.
 *
 * Results are memoized in `TOKEN_CACHE` keyed by the exact input string; the
 * returned array is the cached instance, so callers must not mutate it.
 *
 * @param text Raw text to tokenize; empty input yields an empty array.
 * @returns Tokens in document order (duplicates preserved).
 */
export function tokenize(text: string): string[] {
  if (!text) return [];

  const cached = TOKEN_CACHE.get(text);
  if (cached !== undefined) return cached;

  const normalized = text
    .toLowerCase()
    .replace(/[\u200B-\u200D\uFEFF]/g, '')
    .replace(/[^\w\s가-힣_+#.-]/g, ' ');

  // Separate Latin/digit runs from adjacent Hangul runs (e.g. "api설계" -> "api 설계").
  const separated = normalized
    .replace(/([a-z0-9]+)([가-힣]+)/g, '$1 $2')
    .replace(/([가-힣]+)([a-z0-9]+)/g, '$1 $2');

  const tokens = separated
    // Keep + # . - inside tokens so "c++", "c#" and ".net" survive.
    .split(/[^a-z0-9가-힣+#.-]+/)
    // Strip the whole run of sentence-final dots ("wait..." -> "wait"); the
    // previous single-character strip left "wait..", which never matched.
    .map((token) => token.replace(/\.+$/, ''))
    .filter((token) => {
      if (!token) return false;
      // Tokens made only of punctuation carry no meaning.
      if (/^[+#.-]+$/.test(token)) return false;
      // Hangul tokens are kept even at one character; others need two or more.
      if (!/[가-힣]/.test(token) && token.length < 2) return false;
      return !SCORING_CONFIG.STOP_WORDS_EN.has(token) && !SCORING_CONFIG.STOP_WORDS_KO.has(token);
    });

  // Coarse eviction: once the limit is hit the whole cache is dropped.
  if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear();
  TOKEN_CACHE.set(text, tokens);
  return tokens;
}

/** Lookup table built once from `SCORING_CONFIG.SYNONYM_DATA`: term -> related terms. */
const synonymMap = new Map<string, string[]>(SCORING_CONFIG.SYNONYM_DATA);

/**
 * Expands query tokens with their synonyms / related terms from the central
 * `SCORING_CONFIG` table.
 *
 * Expansion is a single pass: synonyms of synonyms are not followed.
 *
 * @param tokens Already-tokenized query terms.
 * @returns The original tokens followed by any newly added synonyms, without duplicates.
 */
export function expandQuery(tokens: string[]): string[] {
  const expanded = new Set(tokens);
  for (const token of tokens) {
    for (const synonym of synonymMap.get(token) ?? []) {
      expanded.add(synonym);
    }
  }
  return [...expanded];
}

// ─── TF-IDF Scoring ───

/**
 * TF (term frequency): the share of a document's tokens that equal `term`.
 *
 * Reads from a precomputed term-count map (see `buildTermCounts`) rather than
 * re-scanning the token array for every query term.
 *
 * @param term Term to look up.
 * @param termCounts Occurrence count per term for one document.
 * @param totalTokens Total number of tokens in that document.
 * @returns `count / totalTokens`, or 0 when the document has no tokens.
 */
function termFrequency(term: string, termCounts: Map<string, number>, totalTokens: number): number {
  if (totalTokens <= 0) return 0;
  return (termCounts.get(term) ?? 0) / totalTokens;
}

/**
 * Builds a term -> occurrence-count map for one document's token array.
 * Computed once per document and reused for every query term.
 *
 * @param documentTokens Tokens of a single document, duplicates included.
 * @returns Map from each distinct token to how many times it occurs.
 */
function buildTermCounts(documentTokens: string[]): Map<string, number> {
  const counts = new Map<string, number>();
  for (const token of documentTokens) {
    counts.set(token, (counts.get(token) ?? 0) + 1);
  }
  return counts;
}

/**
 * IDF (inverse document frequency): how rare a term is across the corpus.
 *
 * Uses the smoothed form `log((N + 1) / (containing + 1)) + 1`, which stays
 * finite when no document contains the term. For corpora of fewer than five
 * documents, N is inflated by 5 before the formula is applied.
 *
 * NOTE(review): the inflation applies for `N < 5` only, so a 4-document
 * corpus is treated as 9 while a 5-document corpus stays at 5. The original
 * comment said "5 or fewer"; confirm which boundary is intended.
 *
 * @param term Term to evaluate.
 * @param allDocumentTokenSets One set of distinct tokens per document.
 * @returns The smoothed IDF; 1.0 for an empty corpus.
 */
function inverseDocumentFrequency(
  term: string,
  allDocumentTokenSets: Array<Set<string>>
): number {
  const totalDocs = allDocumentTokenSets.length;
  if (totalDocs === 0) return 1.0;

  let containing = 0;
  for (const tokenSet of allDocumentTokenSets) {
    if (tokenSet.has(term)) containing++;
  }

  // Small-corpus correction: pad N so tiny corpora do not produce skewed weights.
  const smoothedTotal = totalDocs < 5 ? totalDocs + 5 : totalDocs;

  return Math.log((smoothedTotal + 1) / (containing + 1)) + 1;
}

/** Severity bucket assigned to a document from its conflict-indicator count. */
export type ConflictSeverity = 'NONE' | 'LOW' | 'MEDIUM' | 'HIGH';

/**
 * Reports whether a Latin-alphabet indicator occurs in `haystack` as a word
 * (or word prefix), rather than buried inside an unrelated word.
 *
 * The match must not be preceded by a letter or digit. Indicators of three
 * characters or fewer must also not be followed by one, i.e. they match as
 * whole words only.
 */
function containsLatinIndicator(haystack: string, needle: string): boolean {
  const wholeWordOnly = needle.length <= 3;
  let at = haystack.indexOf(needle);
  while (at !== -1) {
    // charAt returns '' outside the string, which correctly counts as a boundary.
    const leftOk = !/[a-z0-9]/.test(haystack.charAt(at - 1));
    const rightOk = !wholeWordOnly || !/[a-z0-9]/.test(haystack.charAt(at + needle.length));
    if (leftOk && rightOk) return true;
    at = haystack.indexOf(needle, at + 1);
  }
  return false;
}

/**
 * Counts how many distinct conflict-indicator words are present in `rawText`.
 * Exposed so the brain index can cache this per file instead of re-scanning
 * content on every query.
 *
 * Matching is case-insensitive. Hangul indicators use plain substring
 * matching. Latin indicators must start at a word boundary, and short ones
 * such as `vs` must be whole words — otherwise `vs` fires inside "vscode" or
 * "canvas" and `error` inside "terror", penalising unrelated documents.
 * Inflected forms ("conflicts", "errors", "contradiction") still match.
 *
 * @param rawText Text to scan; empty or missing text counts as 0.
 * @returns Number of distinct indicators found (each counted at most once).
 */
export function countConflictIndicators(rawText: string): number {
  const lower = (rawText || '').toLowerCase();
  if (!lower) return 0;

  let found = 0;
  for (const indicator of SCORING_CONFIG.CONFLICT_INDICATORS) {
    const needle = indicator.toLowerCase();
    const present = /^[a-z0-9]+$/.test(needle)
      ? containsLatinIndicator(lower, needle)
      : lower.includes(needle);
    if (present) found++;
  }
  return found;
}

/** A document whose tokens were already computed (e.g. from the persistent brain index). */
export interface PreTokenizedDoc {
  /** tokenize(`${title} ${content}`) */
  tokens: string[];
  /** tokenize(title) */
  titleTokens: string[];
  /** Last-modified time in epoch milliseconds (compared against `Date.now()`); omit if unknown. */
  lastModified?: number;
  /** Result of countConflictIndicators(`${title} ${content}`); 0 if unknown. */
  conflictCount: number;
}

/** Scoring result for one document, as produced by the TF-IDF scorers in this module. */
export interface ScoredDocument {
  /** Position of the document in the input array. */
  index: number;
  /** Final score: (TF-IDF sum + recency boost + title boost) scaled by the conflict multiplier. */
  score: number;
  /** Flat bonus applied when an original (non-expanded) query token appears in the title. */
  titleBoost: number;
  /** Flat bonus based on how recently the document was modified; 0 when unknown. */
  recencyBoost: number;
  /** Expanded query terms that occur in the document. */
  matchedTerms: string[];
  /** True when at least one conflict indicator was counted for the document. */
  conflictDetected: boolean;
  /** Severity bucket derived from the conflict-indicator count. */
  conflictSeverity: ConflictSeverity;
  /** Query coverage: matched expanded terms divided by total expanded terms. */
  informationDensity: number;
}

/**
 * Scores a set of documents against a query using TF-IDF.
 *
 * Tokenizes each document on the fly. If tokens are already available, call
 * `scoreTfIdfPreTokenized` directly to skip tokenization.
 *
 * @param queryTokens Already-tokenized query terms.
 * @param documents Documents to score; `lastModified` is epoch milliseconds.
 * @returns One result per document in input order, or an empty array when
 *          either the query or the document list is empty.
 */
export function scoreTfIdf(
  queryTokens: string[],
  documents: Array<{
    title: string;
    content: string;
    lastModified?: number;
  }>
): ScoredDocument[] {
  if (documents.length === 0 || queryTokens.length === 0) return [];

  const preTokenized: PreTokenizedDoc[] = documents.map((doc) => {
    // Title and content are scored together; the title is also tokenized alone
    // so title matches can be weighted separately.
    const combined = `${doc.title} ${doc.content}`;
    return {
      tokens: tokenize(combined),
      titleTokens: tokenize(doc.title),
      lastModified: doc.lastModified,
      conflictCount: countConflictIndicators(combined),
    };
  });

  return scoreTfIdfPreTokenized(queryTokens, preTokenized);
}

/**
 * TF-IDF scoring for documents that are already tokenized (e.g. cached tokens
 * from the brain index). Same algorithm and output shape as `scoreTfIdf`.
 *
 * Per document the score is
 * `(Σ tf·idf·titleMultiplier + recencyBoost + titleBoost) × conflictMultiplier`,
 * summed over the synonym-expanded query. The recency and title boosts are
 * added whether or not any term matched, so a document with no matched terms
 * can still receive a non-zero score.
 *
 * @param queryTokens Already-tokenized query terms (expanded with synonyms internally).
 * @param documents Pre-tokenized documents to score.
 * @returns One result per document in input order, or an empty array when
 *          either the query or the document list is empty.
 */
export function scoreTfIdfPreTokenized(
  queryTokens: string[],
  documents: PreTokenizedDoc[]
): ScoredDocument[] {
  if (documents.length === 0 || queryTokens.length === 0) return [];

  // Built once per document: term counts drive TF, their key sets drive IDF.
  const docTermCounts = documents.map((doc) => buildTermCounts(doc.tokens));
  const docTokenSets = docTermCounts.map((counts) => new Set(counts.keys()));

  // expandQuery returns distinct terms, so each term is scored exactly once.
  const expandedQuery = expandQuery(queryTokens);

  // IDF depends only on the corpus, so compute it once per term.
  const idfByTerm = new Map<string, number>();
  for (const term of expandedQuery) {
    if (!idfByTerm.has(term)) {
      idfByTerm.set(term, inverseDocumentFrequency(term, docTokenSets));
    }
  }

  const now = Date.now();
  const msPerDay = 1000 * 60 * 60 * 24;
  const thresholds = SCORING_CONFIG.CONFLICT_THRESHOLDS;

  return documents.map((doc, index) => {
    const termCounts = docTermCounts[index] ?? new Map<string, number>();
    const totalTokens = doc.tokens.length;
    const titleTokens = new Set(doc.titleTokens);

    // Conflict severity from the indicator count supplied by the caller / index.
    const conflictCount = doc.conflictCount || 0;
    const conflictDetected = conflictCount > 0;
    let conflictSeverity: ConflictSeverity = 'NONE';
    if (conflictCount >= thresholds.HIGH) conflictSeverity = 'HIGH';
    else if (conflictCount >= thresholds.MEDIUM) conflictSeverity = 'MEDIUM';
    else if (conflictCount >= thresholds.LOW) conflictSeverity = 'LOW';

    let score = 0;
    const matchedTerms: string[] = [];
    for (const term of expandedQuery) {
      const tfidf = termFrequency(term, termCounts, totalTokens) * (idfByTerm.get(term) ?? 1);
      if (tfidf > 0) matchedTerms.push(term);

      // Terms that also appear in the title are weighted more heavily.
      const titleMultiplier = titleTokens.has(term) ? SCORING_CONFIG.TITLE_MULTIPLIER : 1.0;
      score += tfidf * titleMultiplier;
    }

    // Recency boost: stepped bonus for documents modified within 1 / 7 / 30 days.
    let recencyBoost = 0;
    if (doc.lastModified) {
      const daysAgo = (now - doc.lastModified) / msPerDay;
      if (daysAgo < 1) recencyBoost = 0.3;
      else if (daysAgo < 7) recencyBoost = 0.2;
      else if (daysAgo < 30) recencyBoost = 0.1;
    }

    // Flat bonus when an original (non-expanded) query token is in the title.
    const titleBoost = queryTokens.some((t) => titleTokens.has(t)) ? 0.2 : 0;

    // Conflict penalty is multiplicative, so the score can never go negative.
    const conflictMultiplier = conflictSeverity === 'HIGH' ? 0.1
      : conflictSeverity === 'MEDIUM' ? 0.5
      : conflictSeverity === 'LOW' ? 0.8
      : 1.0;

    const finalScore = (score + recencyBoost + titleBoost) * conflictMultiplier;

    // Reported as "information density": share of expanded query terms found.
    const queryCoverage = expandedQuery.length > 0
      ? matchedTerms.length / expandedQuery.length
      : 0;

    return {
      index,
      score: finalScore,
      titleBoost,
      recencyBoost,
      matchedTerms,
      conflictDetected,
      conflictSeverity,
      informationDensity: queryCoverage
    };
  });
}

/** One heading-delimited slice of a markdown document. */
export interface MarkdownSection {
  /** The heading line itself (markers included, trimmed); '' for text with no heading. */
  heading: string;
  /** Trimmed text between this heading and the next heading of any level. */
  body: string;
}

/**
 * Splits markdown content into a flat list of sections at every ATX heading
 * (`#` through `######`).
 *
 * - Each section runs from its heading to the next heading of any level;
 *   sections are not nested.
 * - A leading byte-order mark is ignored, and a leading front-matter block
 *   (`---` … `---`, each on its own line) is dropped because it is not
 *   query-relevant. An empty front-matter block is handled as well.
 * - Lines inside fenced code blocks (``` or ~~~) are never treated as
 *   headings, so shell comments such as `# install deps` stay in the body.
 * - Non-empty text above the first heading becomes a section with an empty
 *   `heading`.
 * - A document with no headings returns one synthetic section
 *   `{ heading: '', body: content }` so callers can treat the result uniformly.
 *
 * Why this exists: retrieval was returning whole files (excerpts capped at
 * 400 chars). On long notes, that excerpt was often the file's intro/setup,
 * not the section that actually matched the query. Section-level retrieval
 * lets us pick the relevant heading directly and drop everything else.
 *
 * @param content Raw markdown text.
 * @returns Sections in document order; an empty array for empty input.
 */
export function splitMarkdownSections(content: string): MarkdownSection[] {
  if (!content) return [];

  let lines = content.replace(/^\uFEFF/, '').split('\n');

  // Strip front matter: an opening `---` line up to the first later `---` line.
  if (/^---\s*$/.test(lines[0] ?? '')) {
    const closing = lines.findIndex((line, i) => i > 0 && /^---\s*$/.test(line));
    if (closing > 0) {
      let firstContent = closing + 1;
      while (firstContent < lines.length && (lines[firstContent] ?? '').trim() === '') {
        firstContent++;
      }
      lines = lines.slice(firstContent);
    }
  }

  // Locate heading lines, skipping anything inside a fenced code block.
  const headingLines: number[] = [];
  let openFence: string | null = null;
  for (const [i, line] of lines.entries()) {
    const fence = /^ {0,3}(`{3,}|~{3,})/.exec(line);
    if (fence) {
      const marker = fence[1] ?? '';
      if (openFence === null) {
        openFence = marker;
        continue;
      }
      // A closing fence uses the same character, is at least as long as the
      // opener, and has nothing after it.
      const trailing = line.slice(fence[0].length).trim();
      if (marker.charAt(0) === openFence.charAt(0) && marker.length >= openFence.length && trailing === '') {
        openFence = null;
        continue;
      }
    }
    if (openFence === null && /^#{1,6}\s+\S/.test(line)) headingLines.push(i);
  }

  if (headingLines.length === 0) {
    return [{ heading: '', body: lines.join('\n').trim() }];
  }

  const sections: MarkdownSection[] = [];

  // Anything above the first heading becomes a heading-less preamble section.
  const preamble = lines.slice(0, headingLines[0] ?? 0).join('\n').trim();
  if (preamble) sections.push({ heading: '', body: preamble });

  headingLines.forEach((start, k) => {
    const end = headingLines[k + 1] ?? lines.length;
    sections.push({
      heading: (lines[start] ?? '').trim(),
      body: lines.slice(start + 1, end).join('\n').trim(),
    });
  });

  return sections;
}

/**
 * Picks the best heading-bounded section of a markdown document for a query,
 * then narrows it with keyword-window extraction if the section is still too
 * long.
 *
 * Strategy:
 *   1. Split into sections by heading (`splitMarkdownSections`).
 *   2. Score each section by how many of its tokens are in the
 *      synonym-expanded query; heading hits count 3× so "## Foo" beats a body
 *      mention of "foo". Ties go to the earliest section.
 *   3. If the top section's body fits, return heading + body as-is.
 *   4. Otherwise run `extractBestExcerpt` inside the body and prepend the
 *      heading.
 *
 * A document with no headings (a single heading-less section) goes straight
 * to `extractBestExcerpt`.
 *
 * Caps:
 *   - Output is always ≤ `maxLength` (the final string is sliced as a safety net).
 *   - Sections whose heading + body total fewer than 24 characters are
 *     skipped — they are usually empty placeholder headings.
 *
 * @param content Raw markdown text.
 * @param queryTokens Already-tokenized query terms.
 * @param maxLength Maximum length of the returned excerpt.
 * @returns The selected excerpt; '' for empty content.
 */
export function extractBestSection(
  content: string,
  queryTokens: string[],
  maxLength = 600
): string {
  const sections = splitMarkdownSections(content);
  // splitMarkdownSections returns no sections only for empty content.
  if (sections.length === 0) return '';

  const [only] = sections;
  if (sections.length === 1 && only && !only.heading) {
    return extractBestExcerpt(only.body || content, queryTokens, maxLength);
  }

  const expandedSet = new Set(expandQuery(queryTokens));
  const countHits = (text: string): number => {
    if (!text) return 0;
    let hits = 0;
    for (const token of tokenize(text)) {
      if (expandedSet.has(token)) hits++;
    }
    return hits;
  };

  let bestIdx = -1;
  let bestScore = -1;
  for (const [i, section] of sections.entries()) {
    if (section.heading.length + section.body.length < 24) continue;
    const score = countHits(section.heading) * 3 + countHits(section.body);
    if (score > bestScore) {
      bestIdx = i;
      bestScore = score;
    }
  }

  const picked = sections[bestIdx];
  if (!picked) {
    // Every section was below the 24-character minimum — fall back to a
    // whole-document excerpt. (Sections with zero hits are still eligible,
    // so this branch is not reached merely because nothing matched.)
    return extractBestExcerpt(content, queryTokens, maxLength);
  }

  const headingLine = picked.heading ? `${picked.heading}\n` : '';
  // Leave the body at least 64 characters even under a very long heading;
  // the final slice still enforces maxLength.
  const room = Math.max(64, maxLength - headingLine.length);
  const bodyText = picked.body.length <= room
    ? picked.body
    : extractBestExcerpt(picked.body, queryTokens, room);

  return (headingLine + bodyText).slice(0, maxLength).trim();
}

/**
 * Extracts the most query-relevant excerpt from a text.
 * Rather than picking a paragraph, it searches for the run of sentences with
 * the highest keyword density.
 *
 * Steps:
 *   1. Split into sentences: after `.`, `!`, `?` followed by whitespace,
 *      after the CJK marks `。！？`, and at line breaks. Requiring whitespace
 *      after ASCII punctuation keeps "node.js" and "v1.2.3" intact.
 *      Fragments of 5 characters or fewer are dropped.
 *   2. Score each sentence by matched expanded-query tokens and density.
 *   3. From every sentence that meets the density threshold, grow a window
 *      until it reaches `maxLength`, and keep the best-scoring window.
 *   4. Add one sentence of context on each side. Leading context is added
 *      only if the window's first sentence still fits within `maxLength`.
 *
 * If no sentence meets the density threshold, the two highest-scoring
 * sentences are returned instead so the caller never gets an empty context.
 *
 * @param content Text to excerpt.
 * @param queryTokens Already-tokenized query terms (expanded with synonyms internally).
 * @param maxLength Maximum length of the result; longer output ends in "...".
 * @returns The excerpt, at most `maxLength` characters.
 */
export function extractBestExcerpt(
  content: string,
  queryTokens: string[],
  maxLength = 500
): string {
  const expandedSet = new Set(expandQuery(queryTokens));

  // Truncate to the budget, marking the cut with an ellipsis when there is room.
  const clip = (text: string): string => {
    if (text.length <= maxLength) return text;
    return maxLength > 3
      ? text.slice(0, maxLength - 3) + '...'
      : text.slice(0, Math.max(0, maxLength));
  };

  // 1. Sentence splitting & initial filtering
  const sentences = content
    .split(/(?<=[.!?])\s+|(?<=[\u3002\uFF01\uFF1F])\s*|\n+/)
    .map((s) => s.trim())
    .filter((s) => s.length > 5);

  if (sentences.length === 0) return content.slice(0, maxLength);

  // 2. Per-sentence scoring; sentences below the density threshold are down-weighted.
  const threshold = SCORING_CONFIG.DENSITY_THRESHOLD;
  const scored = sentences.map((sentence, idx) => {
    const tokens = tokenize(sentence);
    const matchCount = tokens.filter((t) => expandedSet.has(t)).length;
    const density = tokens.length > 0 ? matchCount / tokens.length : 0;
    const densityMultiplier = density >= threshold ? 1.5 : 0.5;
    return { sentence, idx, matchCount, density, score: (matchCount + density * 2) * densityMultiplier };
  });

  // 3. Optimal window search
  let bestStart = 0;
  let bestScore = -1;
  let bestLen = 0;

  for (let i = 0; i < scored.length; i++) {
    // A low-density sentence may not start a window.
    if (scored[i].density < threshold) continue;

    let windowChars = 0;
    let windowScore = 0;
    let j = i;
    while (j < scored.length && windowChars < maxLength) {
      windowChars += scored[j].sentence.length + 1; // +1 for the joining space
      windowScore += scored[j].score;
      j++;
    }

    if (windowScore > bestScore) {
      bestScore = windowScore;
      bestStart = i;
      bestLen = j - i;
    }
  }

  // No window qualified: return the two highest-scoring sentences instead.
  if (bestScore <= 0) {
    const fallback = [...scored] // copy so the sort does not reorder `scored`
      .sort((a, b) => b.score - a.score)
      .slice(0, 2)
      .map((s) => s.sentence)
      .join(' ');
    return clip(fallback);
  }

  // 4. Result construction with one sentence of context on each side.
  let finalStart = bestStart;
  let finalEnd = bestStart + bestLen;

  // Truncation cuts from the end, so leading context is only worth adding if
  // it does not push the matched sentence out of the budget.
  if (finalStart > 0) {
    const lead = scored[finalStart - 1].sentence;
    const anchor = scored[finalStart].sentence;
    if (lead.length + 1 + anchor.length <= maxLength) finalStart--;
  }
  if (finalEnd < scored.length) finalEnd++;

  const result = scored
    .slice(finalStart, finalEnd)
    .map((s) => s.sentence)
    .join(' ');

  return clip(result);
}