feat: integrate unified RAG pipeline and bump version to 2.60.0

This commit is contained in:
g1nation
2026-05-04 11:00:01 +09:00
parent 0515dd625d
commit 445d530b63
16 changed files with 2178 additions and 112 deletions
+241
View File
@@ -0,0 +1,241 @@
/**
* ============================================================
* Scoring Engine — TF-IDF + Bilingual Tokenizer
*
* 단순 includes() 키워드 매칭을 넘어서,
* TF-IDF 가중치 기반의 문서 스코어링을 제공합니다.
* 한국어/영어 양국어 토크나이저를 포함합니다.
* ============================================================
*/
// ─── Bilingual Tokenizer ───
/** English function words and prompt filler that carry no retrieval signal. */
const STOP_WORDS_EN = new Set([
  'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
  'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
  'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
  'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
  'it', 'its', 'not', 'no', 'what', 'how', 'when', 'where', 'which',
  'who', 'whom', 'why', 'if', 'then', 'than', 'so', 'as', 'just',
  'about', 'also', 'more', 'some', 'very', 'all', 'each', 'every',
  'such', 'please', 'write', 'use', 'using', 'used'
]);

/** Korean connectives, demonstratives and filler words that are dropped from token streams. */
const STOP_WORDS_KO = new Set([
  '그리고', '그런데', '그래서', '하지만', '또한', '또는', '해서', '하는',
  '있어', '없어', '아래', '위에', '어떻게', '이것', '저것', '그것',
  '이런', '저런', '그런', '여기', '거기', '필요', '사용', '관련',
  '대한', '대해', '통해', '따라', '위해', '대로', '만큼'
]);

/** Any run of characters outside a-z, 0-9, Hangul syllables, `_`, `.` and `-` separates tokens. */
const TOKEN_SEPARATOR = /[^a-z0-9가-힣_.-]+/;

/** Leading/trailing `.`, `_` and `-`: kept inside tokens ("node.js", "tf-idf") but noise at the edges. */
const TOKEN_EDGE_PUNCTUATION = /^[._-]+|[._-]+$/g;

/**
 * Splits mixed Korean/English text into lowercase search tokens.
 *
 * Tokens keep interior `.`, `-` and `_` so identifiers such as "node.js" or
 * "tf-idf" stay whole, but the same characters are stripped from token edges.
 * Without that step a sentence-final word ("bug.") would never equal its plain
 * form ("bug"), and punctuation-only runs ("---", "...") would count as terms.
 *
 * @param text Raw text to tokenize.
 * @returns Tokens of at least two characters that are not English or Korean
 *          stop words, in their original order (duplicates preserved).
 */
export function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    .split(TOKEN_SEPARATOR)
    .map((token) => token.replace(TOKEN_EDGE_PUNCTUATION, ''))
    .filter(
      (token) =>
        token.length >= 2 && !STOP_WORDS_EN.has(token) && !STOP_WORDS_KO.has(token)
    );
}
/**
 * Bilingual synonym / related-term table used for query expansion.
 *
 * Stored as a Map rather than an object literal: lookups are keyed by
 * arbitrary user tokens, and a plain object would resolve tokens such as
 * "constructor" or "__proto__" to inherited Object.prototype members.
 */
const QUERY_SYNONYMS: ReadonlyMap<string, readonly string[]> = new Map<string, readonly string[]>([
  ['성능', ['performance', 'optimization', '최적화', 'speed']],
  ['performance', ['성능', '최적화', 'optimization', 'speed']],
  ['아키텍처', ['architecture', '구조', 'structure', 'design']],
  ['architecture', ['아키텍처', '구조', 'structure', 'design']],
  ['메모리', ['memory', '기억', 'cache', 'storage']],
  ['memory', ['메모리', '기억', 'cache', 'storage']],
  ['버그', ['bug', 'error', '오류', 'issue', 'defect']],
  ['bug', ['버그', 'error', '오류', 'issue']],
  ['설계', ['design', '아키텍처', 'architecture', 'pattern']],
  ['design', ['설계', '아키텍처', 'architecture', 'pattern']],
  ['배포', ['deploy', 'deployment', 'release', 'ci', 'cd']],
  ['deploy', ['배포', 'deployment', 'release']],
  ['테스트', ['test', 'testing', 'spec', 'jest', 'mocha']],
  ['test', ['테스트', 'testing', 'spec']],
  ['프로젝트', ['project', '프로그램', 'repo', 'repository']],
  ['project', ['프로젝트', '프로그램', 'repo']],
  ['방향', ['direction', '전략', 'strategy', '목표', 'goal']],
  ['direction', ['방향', '전략', 'strategy', '목표']]
]);

/**
 * Expands query tokens with their synonyms and related terms.
 *
 * Expansion is a single pass: only the tokens passed in are looked up, so
 * synonyms of synonyms are not added.
 *
 * @param tokens Query tokens; lookups are exact-match against the synonym table.
 * @returns The de-duplicated input tokens in first-seen order, followed by
 *          each newly added synonym in the order it was encountered.
 */
export function expandQuery(tokens: string[]): string[] {
  const expanded = new Set(tokens);
  for (const token of tokens) {
    // Map.get returns undefined for unknown tokens, including names that exist
    // on Object.prototype, so there is nothing non-iterable to trip over.
    for (const synonym of QUERY_SYNONYMS.get(token) ?? []) {
      expanded.add(synonym);
    }
  }
  return Array.from(expanded);
}
// ─── TF-IDF Scoring ───
/**
 * Term frequency: the share of a document's tokens that equal `term`.
 *
 * @param term Term to count (exact string equality).
 * @param documentTokens All tokens of one document, duplicates included.
 * @returns Occurrences divided by the token count, or 0 for an empty document.
 */
function termFrequency(term: string, documentTokens: string[]): number {
  const totalTokens = documentTokens.length;
  if (totalTokens === 0) {
    return 0;
  }

  let occurrences = 0;
  for (const token of documentTokens) {
    if (token === term) {
      occurrences += 1;
    }
  }
  return occurrences / totalTokens;
}
/**
 * Inverse document frequency: how rare `term` is across the document set.
 *
 * Computed as ln((N + 1) / (containing + 1)) + 1, where N is the number of
 * documents and `containing` is how many of them include the term. The +1
 * terms keep the result finite for unseen terms and for an empty set.
 *
 * @param term Term to look up.
 * @param allDocumentTokenSets One set of distinct tokens per document.
 * @returns The smoothed IDF weight.
 */
function inverseDocumentFrequency(
  term: string,
  allDocumentTokenSets: Array<Set<string>>
): number {
  let documentsWithTerm = 0;
  for (const tokenSet of allDocumentTokenSets) {
    if (tokenSet.has(term)) {
      documentsWithTerm += 1;
    }
  }

  const documentCount = allDocumentTokenSets.length;
  return Math.log((documentCount + 1) / (documentsWithTerm + 1)) + 1;
}
/** Scoring result for a single input document, as returned by scoreTfIdf. */
export interface ScoredDocument {
/** Position of the document in the `documents` array that was scored. */
index: number;
/** Final score: the title-weighted TF-IDF sum plus `recencyBoost` and `titleBoost`. */
score: number;
/** 0.2 when any original (unexpanded) query token appears in the title, otherwise 0. */
titleBoost: number;
/** 0.3 / 0.2 / 0.1 for documents modified less than 1 / 7 / 30 days ago, otherwise 0. */
recencyBoost: number;
/** Expanded query terms that contributed a positive TF-IDF weight for this document. */
matchedTerms: string[];
}
/**
 * Scores every document against the query with TF-IDF plus additive boosts.
 *
 * The query is expanded with synonyms first. For each document the score is
 * the sum of tf * idf over the expanded terms, tripled for terms that also
 * occur in the title. Two flat bonuses are then added: a recency boost based
 * on `lastModified`, and a title boost when any original query token appears
 * in the title.
 *
 * @param queryTokens Query tokens; they are compared as-is against tokenized
 *                    document text.
 * @param documents Documents to score; `lastModified` is an epoch timestamp
 *                  in milliseconds.
 * @returns One entry per document, in input order (not sorted by score), or
 *          an empty array when either input is empty.
 */
export function scoreTfIdf(
  queryTokens: string[],
  documents: Array<{
    title: string;
    content: string;
    lastModified?: number;
  }>
): ScoredDocument[] {
  if (queryTokens.length === 0 || documents.length === 0) {
    return [];
  }

  const MS_PER_DAY = 1000 * 60 * 60 * 24;

  // Tokenize title + content once per document; the sets feed the IDF lookup.
  const tokenLists = documents.map(({ title, content }) => tokenize(`${title} ${content}`));
  const vocabularies = tokenLists.map((list) => new Set(list));

  const searchTerms = expandQuery(queryTokens);

  // IDF depends only on the term and the corpus, so compute it once per term.
  const idfByTerm = new Map<string, number>();
  for (const term of searchTerms) {
    if (idfByTerm.has(term)) continue;
    idfByTerm.set(term, inverseDocumentFrequency(term, vocabularies));
  }

  const referenceTime = Date.now();

  // Flat bonus tiers by document age; a missing or zero timestamp earns nothing.
  const recencyBoostFor = (lastModified: number | undefined): number => {
    if (!lastModified) return 0;
    const ageInDays = (referenceTime - lastModified) / MS_PER_DAY;
    if (ageInDays < 1) return 0.3;
    if (ageInDays < 7) return 0.2;
    if (ageInDays < 30) return 0.1;
    return 0;
  };

  return documents.map((doc, index) => {
    const bodyTokens = tokenLists[index];
    const titleVocabulary = new Set(tokenize(doc.title));

    const hits: string[] = [];
    let relevance = 0;
    for (const term of searchTerms) {
      const weight = termFrequency(term, bodyTokens) * (idfByTerm.get(term) || 1);
      if (weight > 0) {
        hits.push(term);
      }
      // Terms that also appear in the title count three times as much.
      relevance += titleVocabulary.has(term) ? weight * 3.0 : weight;
    }

    const recencyBoost = recencyBoostFor(doc.lastModified);

    // Separate flat bonus when an original (unexpanded) query token is in the title.
    const titleBoost = queryTokens.some((token) => titleVocabulary.has(token)) ? 0.2 : 0;

    const scored: ScoredDocument = {
      index,
      score: relevance + recencyBoost + titleBoost,
      titleBoost,
      recencyBoost,
      matchedTerms: Array.from(new Set(hits))
    };
    return scored;
  });
}
/**
 * Extracts the most query-relevant excerpt from `content`.
 *
 * The text is split into sentences, each sentence is scored by how many of
 * its tokens hit the synonym-expanded query (match count plus twice the match
 * density), and the run of consecutive sentences with the highest total score
 * that fills roughly `maxLength` characters is returned.
 *
 * @param content Text to excerpt.
 * @param queryTokens Query tokens; expanded with synonyms before matching.
 * @param maxLength Maximum length of the returned string (default 500).
 * @returns The best window of sentences joined by single spaces, truncated
 *          with a trailing "..." when it exceeds `maxLength`. When no sentence
 *          is longer than 10 characters, the leading `maxLength` characters of
 *          `content` are returned instead.
 */
export function extractBestExcerpt(
  content: string,
  queryTokens: string[],
  maxLength = 500
): string {
  const expandedSet = new Set(expandQuery(queryTokens));
  const limit = Math.max(0, maxLength);

  // Sentence boundaries (Korean + English):
  //  - ASCII . ! ? end a sentence only when followed by whitespace, so
  //    "2.60.0" or "config.json" are not cut into fragments;
  //  - full-width terminators (。!?) and newlines end a sentence even
  //    with no whitespace after them.
  const sentences = content
    .split(/(?<=[.!?])\s+|(?<=[。!?\n])\s*/)
    .map((s) => s.trim())
    .filter((s) => s.length > 10);

  if (sentences.length === 0) return content.slice(0, limit);

  // Per-sentence relevance: raw match count plus twice the match density.
  const sentenceScores = sentences.map((sentence) => {
    const tokens = tokenize(sentence);
    const matchCount = tokens.filter((t) => expandedSet.has(t)).length;
    const density = tokens.length > 0 ? matchCount / tokens.length : 0;
    return matchCount + density * 2;
  });

  // Slide over every start sentence and grow the window until it reaches
  // maxLength characters; keep the earliest window with the highest score.
  let bestStart = 0;
  let bestScore = -1;
  let bestLen = 0;
  for (let i = 0; i < sentences.length; i++) {
    let windowLength = 0;
    let windowScore = 0;
    let j = i;
    while (j < sentences.length && windowLength < maxLength) {
      windowLength += sentences[j].length + 1; // +1 for the joining space
      windowScore += sentenceScores[j];
      j++;
    }
    if (windowScore > bestScore) {
      bestScore = windowScore;
      bestStart = i;
      bestLen = j - i;
    }
  }

  const result = sentences.slice(bestStart, bestStart + bestLen).join(' ');
  if (result.length <= maxLength) return result;
  // No room for an ellipsis: a negative slice end would count from the end of
  // the string and return more than maxLength characters.
  if (limit <= 3) return result.slice(0, limit);
  return result.slice(0, limit - 3) + '...';
}