Files
connectai/src/retrieval/scoring.ts
T
2026-05-22 15:00:14 +09:00

537 lines
21 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* ============================================================
* Scoring Engine — TF-IDF + Bilingual Tokenizer
*
* 단순 includes() 키워드 매칭을 넘어서,
* TF-IDF 가중치 기반의 문서 스코어링을 제공합니다.
* 한국어/영어 양국어 토크나이저를 포함합니다.
* ============================================================
*/
// ─── Bilingual Tokenizer ───
// ─── Scoring Engine Configuration ───
/**
 * Central configuration for the scoring engine: stop-word sets, the synonym
 * table, and the numeric thresholds read by the functions in this file.
 */
const SCORING_CONFIG = {
// English stop words; `tokenize` drops any token found in this set.
STOP_WORDS_EN: new Set([
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
'it', 'its', 'not', 'no', 'what', 'how', 'when', 'where', 'which',
'who', 'whom', 'why', 'if', 'then', 'than', 'so', 'as', 'just',
'about', 'also', 'more', 'some', 'very', 'all', 'each', 'every',
'such', 'please', 'write', 'use', 'using', 'used'
]),
// Korean stop words; `tokenize` drops any token found in this set.
STOP_WORDS_KO: new Set([
'그리고', '그런데', '그래서', '하지만', '또한', '또는', '해서', '하는',
'있어', '없어', '아래', '위에', '어떻게', '이것', '저것', '그것',
'이런', '저런', '그런', '여기', '거기', '필요', '사용', '관련',
'대한', '대해', '통해', '따라', '위해', '대로', '만큼'
]),
// Synonym / related-term table as [term, related terms] rows; loaded into
// `synonymMap` and consulted by `expandQuery`. A row is looked up by its first
// element only, so each direction of a pair needs its own row.
SYNONYM_DATA: [
['성능', ['performance', 'optimization', '최적화', 'speed']],
['performance', ['성능', '최적화', 'optimization', 'speed']],
['아키텍처', ['architecture', '구조', 'structure', 'design']],
['architecture', ['아키텍처', '구조', 'structure', 'design']],
['메모리', ['memory', '기억', 'cache', 'storage']],
['memory', ['메모리', '기억', 'cache', 'storage']],
['버그', ['bug', 'error', '오류', 'issue', 'defect']],
['bug', ['버그', 'error', '오류', 'issue']],
['설계', ['design', '아키텍처', 'architecture', 'pattern']],
['design', ['설계', '아키텍처', 'architecture', 'pattern']],
['배포', ['deploy', 'deployment', 'release', 'ci', 'cd']],
['deploy', ['배포', 'deployment', 'release']],
['테스트', ['test', 'testing', 'spec', 'jest', 'mocha']],
['test', ['테스트', 'testing', 'spec']],
['프로젝트', ['project', '프로그램', 'repo', 'repository']],
['project', ['프로젝트', '프로그램', 'repo']],
['방향', ['direction', '전략', 'strategy', '목표', 'goal']],
['direction', ['방향', '전략', 'strategy', '목표']]
] as [string, string[]][],
DENSITY_THRESHOLD: 0.15, // minimum keyword density a sentence needs during excerpt extraction
TITLE_MULTIPLIER: 3.0, // weight applied to a term's TF-IDF when the term also appears in the title
GLOBAL_CACHE_LIMIT: 2000, // `tokenize` clears its cache once it holds this many entries
// Words counted by `countConflictIndicators` (Korean and English).
CONFLICT_INDICATORS: new Set([
'반대', '충돌', '오류', '논란', '반박', '차이', '대조',
'conflict', 'contradict', 'dispute', 'controversy', 'error', 'mismatch', 'vs'
]),
// Minimum conflict-indicator count required for each severity level.
CONFLICT_THRESHOLDS: {
HIGH: 4,
MEDIUM: 2,
LOW: 1
}
};
// ─── Global Search State & Cache ───
/** Memoized `tokenize` results, keyed by the exact input text. */
const TOKEN_CACHE: Map<string, string[]> = new Map();

/**
 * Explicitly empties the tokenizer cache.
 * Intended for use after the document set has changed substantially.
 */
export function clearScoringCache(): void {
  TOKEN_CACHE.clear();
}
/**
 * Normalizes mixed Korean/English text and splits it into search tokens.
 *
 * Pipeline: lowercase, remove zero-width characters, blank out every
 * character other than word characters, whitespace, Hangul syllables and
 * `+ # . -`, insert a space wherever a Latin/digit run touches a Hangul run,
 * split, strip one trailing `.` or `,` from each piece, then discard
 * symbol-only tokens, single-character tokens that contain no Hangul, and
 * stop words.
 *
 * Results are memoized by input text. The returned array is the cached
 * instance itself, so callers should treat it as read-only.
 *
 * @param text Raw text to tokenize.
 * @returns Tokens in order of appearance (duplicates kept); `[]` for empty input.
 */
export function tokenize(text: string): string[] {
  if (!text) return [];

  const cached = TOKEN_CACHE.get(text);
  if (cached !== undefined) return cached;

  const cleaned = text
    .toLowerCase()
    .replace(/[\u200B-\u200D\uFEFF]/g, '')
    .replace(/[^\w\s가-힣_+#.-]/g, ' ');

  // Separate Latin/digit runs from adjacent Hangul runs, in both directions.
  const latinThenHangul = cleaned.replace(/([a-z0-9]+)([가-힣]+)/gi, '$1 $2');
  const separated = latinThenHangul.replace(/([가-힣]+)([a-z0-9]+)/gi, '$1 $2');

  const tokens: string[] = [];
  // The split keeps `+ # . -` inside tokens so that c++, c# and .net survive.
  for (const piece of separated.split(/[^a-z0-9가-힣+#.-]+/g)) {
    // Remove a sentence-final period/comma.
    const token = piece.trim().replace(/[.,]$/g, '');
    if (!token) continue;
    // Nothing but symbols left (a lone "+", ".", "--", ...).
    if (/^[+#.-]+$/.test(token)) continue;
    // Tokens containing Hangul may be one character long; all others need two.
    if (!/[가-힣]/.test(token) && token.length < 2) continue;
    if (SCORING_CONFIG.STOP_WORDS_EN.has(token) || SCORING_CONFIG.STOP_WORDS_KO.has(token)) continue;
    tokens.push(token);
  }

  // Crude bound on memory: wipe the whole cache once the limit is reached.
  if (TOKEN_CACHE.size >= SCORING_CONFIG.GLOBAL_CACHE_LIMIT) TOKEN_CACHE.clear();
  TOKEN_CACHE.set(text, tokens);
  return tokens;
}
/** Term -> related terms lookup, built once from `SCORING_CONFIG.SYNONYM_DATA`. */
const synonymMap = new Map<string, string[]>(SCORING_CONFIG.SYNONYM_DATA);

/**
 * Expands query tokens with the synonyms / related terms listed in the
 * central table of `SCORING_CONFIG`.
 *
 * Expansion is a single pass over the input tokens: terms that were added as
 * synonyms are not expanded again.
 *
 * @param tokens Query tokens to expand.
 * @returns The distinct input tokens followed by any newly added related
 *          terms, without duplicates.
 */
export function expandQuery(tokens: string[]): string[] {
  const collected = new Set<string>(tokens);
  tokens.forEach((token) => {
    const related = synonymMap.get(token);
    if (!Array.isArray(related)) return;
    related.forEach((word) => collected.add(word));
  });
  return [...collected];
}
// ─── TF-IDF Scoring ───
/**
 * TF (term frequency): the share of a document's tokens that equal `term`.
 *
 * Reads the occurrence count from a precomputed term-count map (see
 * `buildTermCounts`) rather than scanning the token array for every term.
 *
 * @param term Term to look up.
 * @param termCounts Term -> occurrence count for one document.
 * @param totalTokens Total number of tokens in that document.
 * @returns `count / totalTokens`, or 0 when the document has no tokens.
 */
function termFrequency(term: string, termCounts: Map<string, number>, totalTokens: number): number {
  return totalTokens === 0 ? 0 : (termCounts.get(term) || 0) / totalTokens;
}
/**
 * Tallies how often each token occurs in one document's token array.
 * Computed once per document and reused for every query term.
 *
 * @param documentTokens Tokens of a single document.
 * @returns Map from token to its number of occurrences, in first-seen order.
 */
function buildTermCounts(documentTokens: string[]): Map<string, number> {
  return documentTokens.reduce((tally, token) => {
    const seenSoFar = tally.get(token) || 0;
    return tally.set(token, seenSoFar + 1);
  }, new Map<string, number>());
}
/**
 * IDF (inverse document frequency): how rare a term is across the document set.
 *
 * Uses the smooth form `log((N + 1) / (containing + 1)) + 1`, which stays
 * finite when no document contains the term.
 *
 * @param term Term to measure.
 * @param allDocumentTokenSets One set of distinct tokens per document.
 * @returns The smoothed IDF; 1.0 when there are no documents.
 */
function inverseDocumentFrequency(
  term: string,
  allDocumentTokenSets: Array<Set<string>>
): number {
  const totalDocs = allDocumentTokenSets.length;
  if (totalDocs === 0) return 1.0;

  let docsWithTerm = 0;
  for (const tokenSet of allDocumentTokenSets) {
    if (tokenSet.has(term)) docsWithTerm += 1;
  }

  // For very small collections (fewer than five documents) the document count
  // is padded by five to dampen score skew.
  const effectiveTotal = totalDocs < 5 ? totalDocs + 5 : totalDocs;
  return Math.log((effectiveTotal + 1) / (docsWithTerm + 1)) + 1;
}
/**
 * Conflict level assigned to a scored document from its conflict-indicator
 * count; `'NONE'` when the count is below the lowest threshold.
 */
export type ConflictSeverity = 'NONE' | 'LOW' | 'MEDIUM' | 'HIGH';
/**
 * Counts how many distinct conflict-indicator words from
 * `SCORING_CONFIG.CONFLICT_INDICATORS` occur in `rawText`.
 *
 * Matching is a case-insensitive substring test, so an indicator also counts
 * when it appears inside a longer word. Each indicator contributes at most 1
 * regardless of how often it occurs.
 *
 * Exposed so the brain index can cache this per file instead of re-scanning
 * content on every query.
 *
 * @param rawText Text to scan; a falsy value is treated as the empty string.
 * @returns Number of distinct indicators found.
 */
export function countConflictIndicators(rawText: string): number {
  const haystack = (rawText || '').toLowerCase();
  return [...SCORING_CONFIG.CONFLICT_INDICATORS]
    .filter((indicator) => haystack.includes(indicator.toLowerCase()))
    .length;
}
/**
 * A document whose tokens were already computed (e.g. from the persistent brain index).
 * This is the input shape of `scoreTfIdfPreTokenized`; `scoreTfIdf` builds it on the fly.
 */
export interface PreTokenizedDoc {
/** tokenize(`${title} ${content}`) — used for term frequency and document frequency. */
tokens: string[];
/** tokenize(title) — used for the title multiplier and the title boost. */
titleTokens: string[];
/** Last-modified time in epoch milliseconds (compared against `Date.now()`); no recency boost when missing or 0. */
lastModified?: number;
/** result of countConflictIndicators(`${title} ${content}`); 0 if unknown */
conflictCount: number;
}
/** Per-document result produced by `scoreTfIdfPreTokenized` (and therefore by `scoreTfIdf`). */
export interface ScoredDocument {
/** Position of the document in the input array. */
index: number;
/** Final score: (title-weighted TF-IDF sum + recencyBoost + titleBoost) multiplied by the conflict multiplier. */
score: number;
/** 0.2 when any original (unexpanded) query token appears among the title tokens, otherwise 0. */
titleBoost: number;
/** 0.3 / 0.2 / 0.1 for documents modified less than 1 / 7 / 30 days ago; 0 otherwise. */
recencyBoost: number;
/** Expanded query terms whose TF-IDF in this document is greater than zero. */
matchedTerms: string[];
/** True when the document's conflict-indicator count is greater than zero. */
conflictDetected: boolean;
/** Severity derived from the conflict-indicator count via `SCORING_CONFIG.CONFLICT_THRESHOLDS`. */
conflictSeverity: ConflictSeverity;
/** Query coverage: number of distinct matched terms divided by the number of expanded query terms. */
informationDensity: number;
}
/**
 * Scores a set of documents against a query using TF-IDF.
 *
 * Tokenizes each document's title and content on the fly and then delegates
 * to `scoreTfIdfPreTokenized`. Call that function directly when tokens are
 * already available to skip the tokenization step.
 *
 * @param queryTokens Tokenized query.
 * @param documents Raw documents (title, content, optional last-modified time).
 * @returns One `ScoredDocument` per input document, in input order; `[]` when
 *          either argument is empty.
 */
export function scoreTfIdf(
  queryTokens: string[],
  documents: Array<{
    title: string;
    content: string;
    lastModified?: number;
  }>
): ScoredDocument[] {
  if (queryTokens.length === 0 || documents.length === 0) return [];

  const prepared: PreTokenizedDoc[] = [];
  for (const { title, content, lastModified } of documents) {
    const fullText = `${title} ${content}`;
    prepared.push({
      tokens: tokenize(fullText),
      titleTokens: tokenize(title),
      lastModified,
      conflictCount: countConflictIndicators(fullText),
    });
  }
  return scoreTfIdfPreTokenized(queryTokens, prepared);
}
/**
 * TF-IDF scoring for documents whose tokens were computed ahead of time
 * (for example, tokens cached in the brain index). Same algorithm and output
 * shape as `scoreTfIdf`, minus the tokenization step.
 *
 * For every document:
 *   1. sum `tf * idf` over the synonym-expanded query, multiplying a term's
 *      contribution by `TITLE_MULTIPLIER` when the term is also a title token;
 *   2. add a recency boost (0.3 / 0.2 / 0.1 for < 1 / 7 / 30 days old) and a
 *      0.2 title boost when an original query token appears in the title;
 *   3. multiply the total by a penalty derived from the conflict severity
 *      (HIGH 0.1, MEDIUM 0.5, LOW 0.8, NONE 1.0).
 *
 * Every component is non-negative, so the final score is never negative.
 *
 * NOTE(review): the recency boost is added even when no query term matched,
 * so a recently modified document with an empty `matchedTerms` still gets a
 * non-zero score. Callers that need relevance-only results should check
 * `matchedTerms`.
 *
 * @param queryTokens Tokenized query; expanded with synonyms internally.
 * @param documents Pre-tokenized documents.
 * @returns One `ScoredDocument` per input document, in input order (not
 *          sorted by score); `[]` when either argument is empty.
 */
export function scoreTfIdfPreTokenized(
  queryTokens: string[],
  documents: PreTokenizedDoc[]
): ScoredDocument[] {
  if (documents.length === 0 || queryTokens.length === 0) return [];

  // Built once per document: term -> count (for TF) and the distinct-token
  // set derived from it (for IDF).
  const docTermCounts = documents.map((doc) => buildTermCounts(doc.tokens));
  const docTokenSets = docTermCounts.map((counts) => new Set(counts.keys()));

  // `expandQuery` returns de-duplicated terms, so each term below is visited
  // exactly once and `matchedTerms` needs no further de-duplication.
  const expandedQuery = expandQuery(queryTokens);

  // IDF depends only on the term and the document set: compute once per term.
  const idfByTerm = new Map<string, number>();
  for (const term of expandedQuery) {
    idfByTerm.set(term, inverseDocumentFrequency(term, docTokenSets));
  }

  const thresholds = SCORING_CONFIG.CONFLICT_THRESHOLDS;
  const conflictMultipliers: Record<ConflictSeverity, number> = {
    HIGH: 0.1,
    MEDIUM: 0.5,
    LOW: 0.8,
    NONE: 1.0,
  };
  const msPerDay = 1000 * 60 * 60 * 24;
  const now = Date.now();

  return documents.map((doc, index) => {
    const termCounts = docTermCounts[index];
    const totalTokens = doc.tokens.length;
    const titleTokens = new Set(doc.titleTokens);

    // Conflict severity from the indicator count supplied by the caller / index.
    const conflictCount = doc.conflictCount || 0;
    let conflictSeverity: ConflictSeverity = 'NONE';
    if (conflictCount >= thresholds.HIGH) conflictSeverity = 'HIGH';
    else if (conflictCount >= thresholds.MEDIUM) conflictSeverity = 'MEDIUM';
    else if (conflictCount >= thresholds.LOW) conflictSeverity = 'LOW';

    // Title-weighted TF-IDF sum over the expanded query.
    let score = 0;
    const matchedTerms: string[] = [];
    for (const term of expandedQuery) {
      const tf = termFrequency(term, termCounts, totalTokens);
      const idf = idfByTerm.get(term) || 1;
      const tfidf = tf * idf;
      if (tfidf > 0) matchedTerms.push(term);
      const titleMultiplier = titleTokens.has(term) ? SCORING_CONFIG.TITLE_MULTIPLIER : 1.0;
      score += tfidf * titleMultiplier;
    }

    // Recency boost, stepped by document age in days.
    let recencyBoost = 0;
    if (doc.lastModified) {
      const daysAgo = (now - doc.lastModified) / msPerDay;
      if (daysAgo < 1) recencyBoost = 0.3;
      else if (daysAgo < 7) recencyBoost = 0.2;
      else if (daysAgo < 30) recencyBoost = 0.1;
    }

    // Flat bonus when an original (unexpanded) query token is a title token.
    const titleBoost = queryTokens.some((t) => titleTokens.has(t)) ? 0.2 : 0;

    const finalScore = (score + recencyBoost + titleBoost) * conflictMultipliers[conflictSeverity];

    // Reported as `informationDensity`: the share of expanded query terms
    // that matched this document.
    const queryCoverage = expandedQuery.length > 0
      ? matchedTerms.length / expandedQuery.length
      : 0;

    return {
      index,
      score: finalScore,
      titleBoost,
      recencyBoost,
      matchedTerms,
      conflictDetected: conflictCount > 0,
      conflictSeverity,
      informationDensity: queryCoverage,
    };
  });
}
/** One heading-delimited slice of a markdown document. */
export interface MarkdownSection {
  /** The heading line itself, trimmed, with its leading `#`s kept; `''` for text that has no heading. */
  heading: string;
  /** Trimmed text between this heading and the next heading of any level. */
  body: string;
}

/**
 * Splits markdown content into sections at ATX headings (`#` through `######`).
 *
 * The split is flat: every heading starts a new section, whatever its level,
 * and a section's body runs up to the next heading. Text above the first
 * heading becomes a preamble section with an empty `heading`. Lines inside
 * fenced code blocks (``` or ~~~) are never treated as headings, so shell or
 * Python comments in code samples do not cut a section in half.
 *
 * A leading byte-order mark and a leading front-matter block (`---` … `---`)
 * are dropped first because they are not query-relevant. An unterminated
 * front-matter block is left in place.
 *
 * A document with no headings returns one synthetic section
 * `{ heading: '', body: content }` so callers can treat the result uniformly.
 *
 * Why this exists: retrieval was returning whole files (excerpts capped at
 * 400 chars). On long notes that excerpt was often the file's intro/setup,
 * not the section that actually matched the query. Section-level retrieval
 * lets callers pick the relevant heading directly and drop everything else.
 *
 * @param content Markdown source.
 * @returns Sections in document order; `[]` for empty input.
 */
export function splitMarkdownSections(content: string): MarkdownSection[] {
  if (!content) return [];

  // Drop a leading BOM. Written as an escape so the character is visible in
  // source instead of hiding inside a pattern.
  let text = content.replace(/^\uFEFF/, '');

  // Strip front-matter.
  const opening = /^---\s*\n/.exec(text);
  if (opening) {
    // Start at the opening line's own newline so that an empty block
    // ("---\n---") is closed by the fence directly after it rather than by
    // some later "---" rule deep in the document.
    const closing = text.indexOf('\n---', opening[0].length - 1);
    if (closing >= 0) text = text.slice(closing + 4).replace(/^\s*\n/, '');
  }

  const lines = text.split('\n');
  const fencePattern = /^ {0,3}(`{3,}|~{3,})([\s\S]*)$/;
  const headingPattern = /^#{1,6}\s+\S/;

  // Line indices of headings that sit outside fenced code blocks.
  const headingLines: number[] = [];
  let openFence: string | null = null;
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i];
    const fence = fencePattern.exec(line);
    if (openFence !== null) {
      // Inside a code block: only a bare fence of the same character and at
      // least the same length closes it.
      if (
        fence &&
        fence[1][0] === openFence[0] &&
        fence[1].length >= openFence.length &&
        fence[2].trim() === ''
      ) {
        openFence = null;
      }
      continue;
    }
    // A backtick fence may not have backticks in its info string; such a line
    // is inline code, not a fence opener.
    if (fence && !(fence[1][0] === '`' && fence[2].includes('`'))) {
      openFence = fence[1];
      continue;
    }
    if (headingPattern.test(line)) headingLines.push(i);
  }

  if (headingLines.length === 0) {
    return [{ heading: '', body: text.trim() }];
  }

  const sections: MarkdownSection[] = [];

  // Content above the first heading becomes a heading-less preamble section.
  const preamble = lines.slice(0, headingLines[0]).join('\n').trim();
  if (preamble) sections.push({ heading: '', body: preamble });

  headingLines.forEach((start, i) => {
    const end = i + 1 < headingLines.length ? headingLines[i + 1] : lines.length;
    sections.push({
      heading: lines[start].trim(),
      body: lines.slice(start + 1, end).join('\n').trim(),
    });
  });
  return sections;
}
/**
 * Picks the heading-bounded section of a markdown document that best matches
 * a query and returns it, shortened to fit `maxLength` when necessary.
 *
 * Strategy:
 *   1. Split into sections by heading (`splitMarkdownSections`).
 *   2. Score each section by the number of its tokens found in the
 *      synonym-expanded query; heading hits count 3x so "## Foo" beats a body
 *      mention of "foo". Sections whose heading + body is shorter than
 *      24 characters are skipped (usually placeholder headings).
 *   3. The first section with the highest score wins. Because scores start
 *      at zero, the first eligible section is chosen even when nothing
 *      matches the query.
 *   4. If the winner fits, return heading + body; otherwise run
 *      `extractBestExcerpt` on its body and prepend the heading.
 *
 * Documents without headings (a single heading-less section) and documents
 * in which every section was skipped go straight to `extractBestExcerpt`.
 *
 * @param content Markdown source.
 * @param queryTokens Tokenized query.
 * @param maxLength Upper bound for the returned text (default 600). Results
 *        that include a heading are sliced to this length as a safety net.
 * @returns The selected excerpt.
 */
export function extractBestSection(
  content: string,
  queryTokens: string[],
  maxLength = 600
): string {
  const sections = splitMarkdownSections(content);
  if (sections.length === 0) return content.slice(0, maxLength);

  const [only] = sections;
  if (sections.length === 1 && !only.heading) {
    return extractBestExcerpt(only.body || content, queryTokens, maxLength);
  }

  const wanted = new Set(expandQuery(queryTokens));
  const countHits = (text: string): number =>
    text ? tokenize(text).filter((tok) => wanted.has(tok)).length : 0;

  let bestIndex = -1;
  let bestScore = -1;
  sections.forEach((section, i) => {
    if (section.heading.length + section.body.length < 24) return;
    const sectionScore = countHits(section.heading) * 3 + countHits(section.body);
    if (sectionScore > bestScore) {
      bestIndex = i;
      bestScore = sectionScore;
    }
  });

  if (bestIndex < 0) {
    // Every section was below the 24-character minimum: use a whole-document excerpt.
    return extractBestExcerpt(content, queryTokens, maxLength);
  }

  const winner = sections[bestIndex];
  const headingLine = winner.heading ? `${winner.heading}\n` : '';
  // Leave the body at least 64 characters even under a very long heading.
  const room = Math.max(64, maxLength - headingLine.length);

  const bodyText = winner.body.length <= room
    ? winner.body
    : extractBestExcerpt(winner.body, queryTokens, room);
  return (headingLine + bodyText).slice(0, maxLength).trim();
}
/**
 * Extracts the most query-relevant passage (excerpt) from a text.
 *
 * Rather than picking a paragraph, this scores individual sentences and
 * searches for the run of consecutive sentences with the highest total score.
 *
 *   1. Split into sentences. A `.`, `!` or `?` ends a sentence only when it
 *      is followed by whitespace, so dotted tokens that the tokenizer keeps
 *      whole (`node.js`, `3.14`, `.net`) are not broken apart. The CJK
 *      terminators U+3002, U+FF01 and U+FF1F end a sentence on their own,
 *      and a newline always does. Fragments of 5 characters or fewer are
 *      dropped.
 *   2. Score each sentence from its number of query-term hits and its keyword
 *      density; sentences below `DENSITY_THRESHOLD` are weighted 0.5x, the
 *      rest 1.5x.
 *   3. From every sentence that meets the threshold, grow a window of
 *      following sentences until it reaches `maxLength`; keep the
 *      best-scoring window.
 *   4. If no sentence meets the threshold, fall back to the two
 *      highest-scoring sentences, returned in document order.
 *   5. Otherwise add one sentence of context before and after the window,
 *      but only when the result still fits in `maxLength`, so context never
 *      pushes the matched sentences out of the excerpt.
 *
 * @param content Text to excerpt.
 * @param queryTokens Tokenized query; expanded with synonyms internally.
 * @param maxLength Maximum length of the result (default 500); longer text is
 *        cut and terminated with `...`.
 * @returns The excerpt, or the first `maxLength` characters of `content` when
 *          no usable sentence is found.
 */
export function extractBestExcerpt(
  content: string,
  queryTokens: string[],
  maxLength = 500
): string {
  const expandedSet = new Set(expandQuery(queryTokens));
  const densityThreshold = SCORING_CONFIG.DENSITY_THRESHOLD;
  const truncate = (text: string): string =>
    text.length > maxLength ? text.slice(0, maxLength - 3) + '...' : text;

  // 1. Sentence splitting and initial filtering.
  const sentences = content
    .split(/(?<=[.!?])\s+|(?<=[\u3002\uFF01\uFF1F])\s*|\n\s*/)
    .map((s) => s.trim())
    .filter((s) => s.length > 5);
  if (sentences.length === 0) return content.slice(0, maxLength);

  // 2. Per-sentence score; low-density sentences are heavily discounted.
  const scored = sentences.map((sentence, idx) => {
    const tokens = tokenize(sentence);
    const matchCount = tokens.filter((t) => expandedSet.has(t)).length;
    const density = tokens.length > 0 ? matchCount / tokens.length : 0;
    const densityMultiplier = density >= densityThreshold ? 1.5 : 0.5;
    return { sentence, idx, density, score: (matchCount + density * 2) * densityMultiplier };
  });

  // 3. Best window of consecutive sentences.
  let bestStart = 0;
  let bestScore = -1;
  let bestLen = 0;
  for (let i = 0; i < scored.length; i++) {
    // A low-density sentence may sit inside a window but cannot start one.
    if (scored[i].density < densityThreshold) continue;
    let windowChars = 0;
    let windowScore = 0;
    let j = i;
    while (j < scored.length && windowChars < maxLength) {
      windowChars += scored[j].sentence.length + 1; // +1 for the joining space
      windowScore += scored[j].score;
      j++;
    }
    if (windowScore > bestScore) {
      bestScore = windowScore;
      bestStart = i;
      bestLen = j - i;
    }
  }

  // 4. No qualifying window: return the two best sentences so the caller
  //    never receives an empty context. Sorting works on a copy, and the
  //    picks are put back into document order so the excerpt reads naturally.
  if (bestScore <= 0) {
    const fallback = [...scored]
      .sort((a, b) => b.score - a.score)
      .slice(0, 2)
      .sort((a, b) => a.idx - b.idx)
      .map((s) => s.sentence)
      .join(' ');
    return truncate(fallback);
  }

  // 5. Context padding, budget permitting.
  const joinRange = (from: number, to: number): string =>
    scored.slice(from, to).map((s) => s.sentence).join(' ');

  let start = bestStart;
  let end = bestStart + bestLen;
  if (start > 0 && joinRange(start - 1, end).length <= maxLength) start--;
  if (end < scored.length && joinRange(start, end + 1).length <= maxLength) end++;

  return truncate(joinRange(start, end));
}