feat: integrate unified RAG pipeline and bump version to 2.60.0

2026-05-04 11:00:01 +09:00
parent 0515dd625d
commit 445d530b63
16 changed files with 2178 additions and 112 deletions
@@ -0,0 +1,241 @@
+/**
+ * ============================================================
+ * Scoring Engine — TF-IDF + Bilingual Tokenizer
+ * 
+ * 단순 includes() 키워드 매칭을 넘어서,
+ * TF-IDF 가중치 기반의 문서 스코어링을 제공합니다.
+ * 한국어/영어 양국어 토크나이저를 포함합니다.
+ * ============================================================
+ */
+
+// ─── Bilingual Tokenizer ───
+
+/** English function words (plus prompt fillers such as "please") that are dropped from token lists. */
+const STOP_WORDS_EN = new Set([
+    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+    'of', 'with', 'by', 'from', 'is', 'are', 'was', 'were', 'be', 'been',
+    'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
+    'should', 'may', 'might', 'can', 'this', 'that', 'these', 'those',
+    'it', 'its', 'not', 'no', 'what', 'how', 'when', 'where', 'which',
+    'who', 'whom', 'why', 'if', 'then', 'than', 'so', 'as', 'just',
+    'about', 'also', 'more', 'some', 'very', 'all', 'each', 'every',
+    'such', 'please', 'write', 'use', 'using', 'used'
+]);
+
+/** Korean connectives, demonstratives and filler words that are dropped from token lists. */
+const STOP_WORDS_KO = new Set([
+    '그리고', '그런데', '그래서', '하지만', '또한', '또는', '해서', '하는',
+    '있어', '없어', '아래', '위에', '어떻게', '이것', '저것', '그것',
+    '이런', '저런', '그런', '여기', '거기', '필요', '사용', '관련',
+    '대한', '대해', '통해', '따라', '위해', '대로', '만큼'
+]);
+
+/** Any run of characters that cannot be part of a token (applied after lowercasing). */
+const TOKEN_SEPARATOR = /[^a-z0-9가-힣_.-]+/;
+
+/** Runs of `.` or `-` hanging off either end of a raw token. */
+const EDGE_PUNCTUATION = /^[.-]+|[.-]+$/g;
+
+/**
+ * Splits mixed Korean/English text into lowercase search tokens.
+ *
+ * A token is a run of ASCII letters, digits, Hangul syllables, `_`, `.` and `-`,
+ * so identifiers such as `node.js`, `v2.60.0` or `ci-cd` stay in one piece.
+ * Dots and hyphens at the edges of a token are stripped, so a sentence-final
+ * word ("bug.") yields the same token as the bare word ("bug") and pure
+ * punctuation runs ("...", "--") produce no token at all.
+ *
+ * @param text - Arbitrary input text.
+ * @returns Tokens in order of appearance (duplicates kept), excluding tokens
+ *          shorter than two characters and English/Korean stop words.
+ */
+export function tokenize(text: string): string[] {
+    return text
+        .toLowerCase()
+        .split(TOKEN_SEPARATOR)
+        // Strip edge punctuation before the length and stop-word checks so that
+        // "the." is still recognised as a stop word and "a." is still too short.
+        .map((raw) => raw.replace(EDGE_PUNCTUATION, ''))
+        .filter((token) => token.length >= 2)
+        .filter((token) => !STOP_WORDS_EN.has(token) && !STOP_WORDS_KO.has(token));
+}
+
+/** Hand-maintained Korean/English synonym and related-term table, keyed by lowercase token. */
+const SYNONYM_TABLE: Record<string, string[]> = {
+    '성능': ['performance', 'optimization', '최적화', 'speed'],
+    'performance': ['성능', '최적화', 'optimization', 'speed'],
+    '아키텍처': ['architecture', '구조', 'structure', 'design'],
+    'architecture': ['아키텍처', '구조', 'structure', 'design'],
+    '메모리': ['memory', '기억', 'cache', 'storage'],
+    'memory': ['메모리', '기억', 'cache', 'storage'],
+    '버그': ['bug', 'error', '오류', 'issue', 'defect'],
+    'bug': ['버그', 'error', '오류', 'issue'],
+    '설계': ['design', '아키텍처', 'architecture', 'pattern'],
+    'design': ['설계', '아키텍처', 'architecture', 'pattern'],
+    '배포': ['deploy', 'deployment', 'release', 'ci', 'cd'],
+    'deploy': ['배포', 'deployment', 'release'],
+    '테스트': ['test', 'testing', 'spec', 'jest', 'mocha'],
+    'test': ['테스트', 'testing', 'spec'],
+    '프로젝트': ['project', '프로그램', 'repo', 'repository'],
+    'project': ['프로젝트', '프로그램', 'repo'],
+    '방향': ['direction', '전략', 'strategy', '목표', 'goal'],
+    'direction': ['방향', '전략', 'strategy', '목표']
+};
+
+/**
+ * Lookup view of the synonym table. A Map only answers for the table's own
+ * keys, so tokens such as "constructor" or "__proto__" cannot resolve to
+ * members inherited from Object.prototype.
+ */
+const SYNONYM_LOOKUP: ReadonlyMap<string, readonly string[]> = new Map(
+    Object.entries(SYNONYM_TABLE)
+);
+
+/**
+ * Widens a token list with synonyms and related terms from the built-in table.
+ *
+ * Expansion is a single hop: synonyms that were added are not expanded again.
+ * Lookup is an exact match, so tokens must already be in the table's lowercase
+ * form to be expanded.
+ *
+ * @param tokens - Query tokens.
+ * @returns The input tokens followed by any newly added synonyms, without
+ *          duplicates, in first-seen order.
+ */
+export function expandQuery(tokens: string[]): string[] {
+    const expanded = new Set(tokens);
+    for (const token of tokens) {
+        for (const synonym of SYNONYM_LOOKUP.get(token) ?? []) {
+            expanded.add(synonym);
+        }
+    }
+    return Array.from(expanded);
+}
+
+// ─── TF-IDF Scoring ───
+
+/**
+ * Term frequency: the share of a document's tokens that equal `term`.
+ *
+ * @param term - Token to count (exact match).
+ * @param documentTokens - All tokens of one document, duplicates included.
+ * @returns Occurrences divided by the token count, or 0 for an empty document.
+ */
+function termFrequency(term: string, documentTokens: string[]): number {
+    const total = documentTokens.length;
+    if (total === 0) {
+        return 0;
+    }
+
+    let occurrences = 0;
+    for (const token of documentTokens) {
+        if (token === term) {
+            occurrences += 1;
+        }
+    }
+    return occurrences / total;
+}
+
+/**
+ * Inverse document frequency: how rare `term` is across the corpus.
+ *
+ * Computed as `ln((N + 1) / (df + 1)) + 1`, where N is the number of documents
+ * and df the number of documents containing the term. The add-one terms avoid
+ * division by zero, and the trailing `+ 1` keeps the weight at 1 or above even
+ * for a term that occurs in every document.
+ *
+ * @param term - Token to weigh.
+ * @param allDocumentTokenSets - One set of distinct tokens per document.
+ * @returns The smoothed IDF weight.
+ */
+function inverseDocumentFrequency(
+    term: string,
+    allDocumentTokenSets: Array<Set<string>>
+): number {
+    let documentsWithTerm = 0;
+    for (const tokenSet of allDocumentTokenSets) {
+        if (tokenSet.has(term)) {
+            documentsWithTerm += 1;
+        }
+    }
+
+    const smoothedRatio = (allDocumentTokenSets.length + 1) / (documentsWithTerm + 1);
+    return 1 + Math.log(smoothedRatio);
+}
+
+/** Relevance result for a single document, as produced by `scoreTfIdf`. */
+export interface ScoredDocument {
+    /** Position of the document in the array that was passed to `scoreTfIdf`. */
+    index: number;
+    /** Final relevance score: the TF-IDF sum plus `recencyBoost` and `titleBoost`. */
+    score: number;
+    /** Additive bonus for a title containing one of the original query tokens (already included in `score`). */
+    titleBoost: number;
+    /** Additive bonus for recently modified documents (already included in `score`). */
+    recencyBoost: number;
+    /** Expanded query terms that contributed a non-zero TF-IDF weight for this document. */
+    matchedTerms: string[];
+}
+
+/**
+ * Scores every document against the query using TF-IDF plus small additive boosts.
+ *
+ * The query is first widened with `expandQuery`. Each document is tokenized from
+ * its title and content together. For every expanded term the TF-IDF weight is
+ * added to the score, tripled when the term also occurs in the title. On top of
+ * that sum:
+ *  - `titleBoost` adds 0.2 when the title contains one of the original
+ *    (unexpanded) query tokens;
+ *  - `recencyBoost` adds 0.3 / 0.2 / 0.1 for documents modified less than
+ *    1 / 7 / 30 days ago. It is applied only to documents that matched at least
+ *    one term, so recency re-ranks relevant documents but never gives an
+ *    unrelated document a non-zero score.
+ *
+ * @param queryTokens - Query terms, compared as-is against the lowercase tokens
+ *                      produced by `tokenize`.
+ * @param documents - Corpus to score. `lastModified` is compared against
+ *                    `Date.now()`, i.e. it is read as epoch milliseconds.
+ * @returns One entry per document in input order (not sorted by score), or an
+ *          empty array when either input is empty.
+ */
+export function scoreTfIdf(
+    queryTokens: string[],
+    documents: Array<{
+        title: string;
+        content: string;
+        lastModified?: number;
+    }>
+): ScoredDocument[] {
+    if (documents.length === 0 || queryTokens.length === 0) return [];
+
+    // Tokenize each document once and keep everything the scoring loop needs
+    // alongside it, so no index lookups are required later.
+    const prepared = documents.map((doc) => {
+        const tokens = tokenize(`${doc.title} ${doc.content}`);
+        return {
+            doc,
+            tokens,
+            tokenSet: new Set(tokens),
+            titleTokens: new Set(tokenize(doc.title))
+        };
+    });
+    const docTokenSets = prepared.map((entry) => entry.tokenSet);
+
+    // expandQuery returns each term at most once, so every term below is unique.
+    const expandedQuery = expandQuery(queryTokens);
+
+    // IDF depends only on the corpus, so compute it once per term.
+    const idfByTerm = new Map<string, number>();
+    for (const term of expandedQuery) {
+        idfByTerm.set(term, inverseDocumentFrequency(term, docTokenSets));
+    }
+
+    const MS_PER_DAY = 1000 * 60 * 60 * 24;
+    const now = Date.now();
+
+    return prepared.map(({ doc, tokens, tokenSet, titleTokens }, index) => {
+        let score = 0;
+        const matchedTerms: string[] = [];
+
+        for (const term of expandedQuery) {
+            // A term missing from the document has TF 0 and contributes nothing.
+            if (!tokenSet.has(term)) continue;
+
+            const tfidf = termFrequency(term, tokens) * (idfByTerm.get(term) ?? 1);
+            if (tfidf > 0) {
+                matchedTerms.push(term);
+            }
+
+            // Terms that also appear in the title count three times as much.
+            const titleMultiplier = titleTokens.has(term) ? 3.0 : 1.0;
+            score += tfidf * titleMultiplier;
+        }
+
+        // Recency boost, restricted to documents that matched something.
+        let recencyBoost = 0;
+        if (matchedTerms.length > 0 && doc.lastModified) {
+            const daysAgo = (now - doc.lastModified) / MS_PER_DAY;
+            if (daysAgo < 1) recencyBoost = 0.3;
+            else if (daysAgo < 7) recencyBoost = 0.2;
+            else if (daysAgo < 30) recencyBoost = 0.1;
+        }
+
+        // Flat bonus when the title contains an original (unexpanded) query token.
+        const titleBoost = queryTokens.some((token) => titleTokens.has(token)) ? 0.2 : 0;
+
+        return {
+            index,
+            score: score + recencyBoost + titleBoost,
+            titleBoost,
+            recencyBoost,
+            matchedTerms
+        };
+    });
+}
+
+/**
+ * Extracts the most query-relevant excerpt from a text.
+ *
+ * Rather than picking a single paragraph, the text is split into sentences and
+ * the run of consecutive sentences with the highest keyword score is returned.
+ * A sentence scores its number of matching tokens plus twice its match density
+ * (matching tokens divided by all tokens), with the query widened by
+ * `expandQuery` first.
+ *
+ * Sentences of 10 characters or fewer are discarded before scoring, so the
+ * excerpt may skip over short fragments of the original text.
+ *
+ * @param content - Text to excerpt.
+ * @param queryTokens - Query terms used to score sentences.
+ * @param maxLength - Upper bound on the length of the returned string; negative
+ *                    values are treated as 0.
+ * @returns The best window, cut to `maxLength` (ending in "..." when cut and
+ *          `maxLength` is at least 3). When no usable sentence is found, the
+ *          first `maxLength` characters of `content`.
+ */
+export function extractBestExcerpt(
+    content: string,
+    queryTokens: string[],
+    maxLength = 500
+): string {
+    // A negative limit would make slice() count from the end of the string.
+    const limit = Math.max(0, maxLength);
+    const expandedSet = new Set(expandQuery(queryTokens));
+
+    // Split after sentence-ending punctuation (Latin and CJK) or a newline.
+    const sentences = content
+        .split(/(?<=[.!?。！？\n])\s*/)
+        .map((s) => s.trim())
+        .filter((s) => s.length > 10);
+
+    if (sentences.length === 0) return content.slice(0, limit);
+
+    // Score each sentence: match count plus twice the match density.
+    const scored = sentences.map((sentence) => {
+        const tokens = tokenize(sentence);
+        const matchCount = tokens.filter((t) => expandedSet.has(t)).length;
+        const density = tokens.length > 0 ? matchCount / tokens.length : 0;
+        return { sentence, weight: matchCount + density * 2 };
+    });
+
+    // Try a window starting at every sentence and keep the first one with the
+    // highest total score.
+    let bestStart = 0;
+    let bestScore = -1;
+    let bestLen = 0;
+
+    for (let start = 0; start < scored.length; start++) {
+        let windowChars = 0;
+        let windowScore = 0;
+        let end = start;
+
+        // Grow the window until it reaches the limit. The sentence that crosses
+        // the limit is still included and trimmed at the end.
+        while (end < scored.length && windowChars < limit) {
+            windowChars += scored[end].sentence.length + 1; // +1 for the joining space
+            windowScore += scored[end].weight;
+            end++;
+        }
+
+        if (windowScore > bestScore) {
+            bestScore = windowScore;
+            bestStart = start;
+            bestLen = end - start;
+        }
+    }
+
+    const result = scored
+        .slice(bestStart, bestStart + bestLen)
+        .map((entry) => entry.sentence)
+        .join(' ');
+
+    if (result.length <= limit) return result;
+
+    // There is room for the ellipsis only from 3 characters up; below that a
+    // plain cut keeps the result within the limit.
+    return limit >= 3 ? result.slice(0, limit - 3) + '...' : result.slice(0, limit);
+}