/**
 * Conflict Surface — builds the [CONFLICT WARNINGS] system-prompt block.
 *
 * scoring.ts already computes a per-document conflictSeverity
 * (NONE/LOW/MEDIUM/HIGH) by counting indicator words (e.g. "반대", "논란", "vs"),
 * but the LLM never sees that result. The v4 policy text in
 * buildAstraModeSystemPrompt already mentions a "[CONFLICT WARNING] flag", yet
 * which documents actually conflict is never passed to the LLM — the policy is
 * stated but has no data behind it.
 *
 * This module fills that gap with two signals:
 *   1. Self-flag — chunk.metadata.conflictSeverity at or above a threshold.
 *   2. Cross-document divergence — two chunks on the same topic whose content
 *      Jaccard similarity is below a threshold.
 *
 * Both are merged into a single markdown block. When there is nothing to
 * report the result is an empty string, so callers can always join it
 * unconditionally.
 */

import { RetrievalChunk, ConflictSeverity } from './types';
import { tokenize } from './scoring';

/** User setting: 'low' = LOW and above, 'medium' = MEDIUM and above, 'high' = HIGH only. */
export type ConflictThresholdSetting = 'low' | 'medium' | 'high';

export interface ConflictBlockOptions {
  /** Minimum severity for a chunk to be surfaced in the self-flag section. Default 'medium'. */
  selfFlagThreshold: ConflictThresholdSetting;
  /** Enables cross-document divergence detection. Default true. */
  crossDivergenceEnabled: boolean;
  /** Maximum number of entries shown per section (self-flag / divergence). Default 5. */
  maxPerSection: number;
  /** Chunk excerpt length in characters (UTF-16 code units). Default 220. */
  excerptLength: number;
}

const DEFAULT_OPTIONS: ConflictBlockOptions = {
  selfFlagThreshold: 'medium',
  crossDivergenceEnabled: true,
  maxPerSection: 5,
  excerptLength: 220,
};

/** Placeholder rendered when a chunk has no usable title. */
const TITLE_FALLBACK = '(제목 없음)';
/** Number of leading title tokens that form a chunk's topic key. */
const TOPIC_TOKEN_LIMIT = 8;
/** Minimum number of shared topic tokens for two chunks to count as "same topic". */
const MIN_SHARED_TOPIC_TOKENS = 2;

/** Maps a severity to a comparable rank; missing/unknown severities rank 0. */
function severityRank(s: ConflictSeverity | undefined): number {
  switch (s) {
    case 'HIGH':
      return 3;
    case 'MEDIUM':
      return 2;
    case 'LOW':
      return 1;
    default:
      return 0;
  }
}

/** Maps a threshold setting to the minimum severity rank it admits. */
function thresholdRank(t: ConflictThresholdSetting): number {
  switch (t) {
    case 'high':
      return 3;
    case 'medium':
      return 2;
    case 'low':
      return 1;
    default:
      // Unreachable for well-typed callers. An unexpected runtime value (e.g.
      // from untyped settings) would otherwise yield `undefined` and silently
      // disable the self-flag section; fall back to the default 'medium' rank.
      return 2;
  }
}

/** Emoji marker rendered next to a severity label. */
function severityEmoji(s: ConflictSeverity | undefined): string {
  switch (s) {
    case 'HIGH':
      return '🔴';
    case 'MEDIUM':
      return '🟡';
    case 'LOW':
      return '🟠';
    default:
      return '⚪';
  }
}

/**
 * Collapses all whitespace runs to single spaces and truncates to at most `n`
 * UTF-16 code units, appending '…' when something was cut.
 *
 * @param text Raw text; falsy input yields ''.
 * @param n Maximum excerpt length; negative or non-finite values are treated as 0.
 */
function shortExcerpt(text: string, n: number): string {
  if (!text) return '';
  const cleaned = text.replace(/\s+/g, ' ').trim();
  if (cleaned.length <= n) return cleaned;

  let end = Number.isFinite(n) ? Math.max(0, Math.floor(n)) : 0;
  if (end > 0) {
    // Do not cut between the two halves of a surrogate pair (emoji etc.):
    // a dangling high surrogate would render as a broken character.
    const lastUnit = cleaned.charCodeAt(end - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  }
  return cleaned.slice(0, end) + '…';
}

/**
 * Renders a value on a single line: whitespace runs (including newlines)
 * become one space. Titles and sources come from retrieved documents, and an
 * embedded newline would break the one-entry-per-line layout of the block.
 */
function inlineText(value: unknown): string {
  return String(value).replace(/\s+/g, ' ').trim();
}

/** Jaccard similarity of two token sets; 0 when either set is empty. */
function jaccard(a: Set<string>, b: Set<string>): number {
  if (a.size === 0 || b.size === 0) return 0;
  let intersect = 0;
  for (const t of a) if (b.has(t)) intersect++;
  const union = a.size + b.size - intersect;
  return union === 0 ? 0 : intersect / union;
}

/** A pair of chunks that share a topic but diverge in content. */
interface DivergencePair {
  a: RetrievalChunk;
  b: RetrievalChunk;
  /** Title tokens present in both chunks' topic keys. */
  sharedTopicTokens: string[];
  /** Jaccard similarity of the two chunks' content token sets. */
  contentJaccard: number;
  /** Ranking score: sharedTopicTokens.length × (1 − contentJaccard). */
  score: number;
}

/**
 * Finds candidate cross-document divergence pairs.
 *
 * Heuristic:
 *   1. Each chunk's topic key is its first 8 title tokens of length ≥ 2.
 *   2. Two chunks are "same topic" when their topic keys share ≥ 2 tokens.
 *   3. A same-topic pair diverges when the Jaccard similarity of their content
 *      tokens is below `topicJaccardMax`.
 *   4. score = (shared topic tokens) × (1 − Jaccard) — larger divergence first.
 *
 * A chunk may appear in several pairs. All qualifying pairs are returned,
 * sorted by score descending; the caller truncates to the number it wants.
 *
 * @param chunks Retrieved chunks to compare pairwise.
 * @param topicJaccardMax Content-similarity ceiling (exclusive) for a pair to
 *   count as divergent. Default 0.30.
 */
function findCrossDivergence(chunks: RetrievalChunk[], topicJaccardMax: number = 0.30): DivergencePair[] {
  if (chunks.length < 2) return [];

  // Tokenize once per chunk, before the O(n^2) pairwise comparison.
  const prepared = chunks.map((chunk) => {
    const titleTokens = tokenize(chunk.title || '').filter((t) => t.length >= 2);
    return {
      chunk,
      topic: new Set<string>(titleTokens.slice(0, TOPIC_TOKEN_LIMIT)),
      body: new Set<string>(tokenize(chunk.content || '')),
    };
  });

  const pairs: DivergencePair[] = [];
  prepared.forEach((left, i) => {
    for (const right of prepared.slice(i + 1)) {
      // 1. Same topic — enough shared title tokens.
      const shared: string[] = [];
      for (const t of left.topic) if (right.topic.has(t)) shared.push(t);
      if (shared.length < MIN_SHARED_TOPIC_TOKENS) continue;

      // 2. Divergent content — similarity below the ceiling.
      const cj = jaccard(left.body, right.body);
      if (cj >= topicJaccardMax) continue;

      pairs.push({
        a: left.chunk,
        b: right.chunk,
        sharedTopicTokens: shared,
        contentJaccard: cj,
        score: shared.length * (1 - cj),
      });
    }
  });

  pairs.sort((p, q) => q.score - p.score);
  return pairs;
}

/**
 * Builds the [CONFLICT WARNINGS] block for the system prompt.
 *
 * Returns an empty string when no conflict is detected, so callers can join
 * the result unconditionally without introducing stray blank lines.
 *
 * @param chunks Retrieved chunks; null/undefined/empty yields ''.
 * @param options Overrides for the defaults. Fields that are omitted or
 *   explicitly undefined/null keep their default value.
 * @returns The markdown block, or '' when there is nothing to report.
 */
export function buildConflictWarningsBlock(
  chunks: RetrievalChunk[],
  options: Partial<ConflictBlockOptions> = {},
): string {
  if (!chunks || chunks.length === 0) return '';

  // Resolve per field so an explicit `undefined` cannot clobber a default
  // (a plain object spread would let it through).
  const given: Partial<ConflictBlockOptions> = options ?? {};
  const opts: ConflictBlockOptions = {
    selfFlagThreshold: given.selfFlagThreshold ?? DEFAULT_OPTIONS.selfFlagThreshold,
    crossDivergenceEnabled: given.crossDivergenceEnabled ?? DEFAULT_OPTIONS.crossDivergenceEnabled,
    maxPerSection: given.maxPerSection ?? DEFAULT_OPTIONS.maxPerSection,
    excerptLength: given.excerptLength ?? DEFAULT_OPTIONS.excerptLength,
  };
  // A negative cap would make slice(0, -k) drop items from the end instead of limiting.
  const limit = Math.max(0, opts.maxPerSection);

  // ─── Section 1: self-flag ───
  const threshold = thresholdRank(opts.selfFlagThreshold);
  const selfFlagged = chunks
    .filter((c) => severityRank(c.metadata?.conflictSeverity) >= threshold)
    .sort((a, b) => severityRank(b.metadata?.conflictSeverity) - severityRank(a.metadata?.conflictSeverity))
    .slice(0, limit);

  // ─── Section 2: cross-document divergence ───
  const divergence = opts.crossDivergenceEnabled ? findCrossDivergence(chunks).slice(0, limit) : [];

  if (selfFlagged.length === 0 && divergence.length === 0) return '';

  // NOTE(review): the leading-space widths of the nested lines below are kept
  // exactly as found in the source — confirm they match the intended nesting.
  const lines: string[] = [];
  lines.push('[CONFLICT WARNINGS]');
  lines.push('다음 검색된 출처에서 충돌 신호 감지. 단일 결론을 강요하지 말고, 상충되는 관점을 명시하고 사용자 판단에 위임할 것.');
  lines.push('');

  if (selfFlagged.length > 0) {
    lines.push('## 자기-신호 (출처 내부에서 충돌/논란 키워드 감지)');
    for (const c of selfFlagged) {
      const sev = c.metadata?.conflictSeverity || 'NONE';
      const emoji = severityEmoji(sev);
      const src = inlineText(c.source);
      const title = inlineText(c.title || '') || TITLE_FALLBACK;
      lines.push(`- ${emoji} **[${sev}]** \`${src}\` · ${title}`);
      lines.push(` > ${shortExcerpt(c.content, opts.excerptLength)}`);
    }
    lines.push('');
  }

  if (divergence.length > 0) {
    lines.push('## 교차-문서 발산 (같은 주제·다른 내용 ─ 잠재적 모순)');
    for (const p of divergence) {
      const topics = p.sharedTopicTokens.slice(0, 5).join(' · ');
      const cjPct = (p.contentJaccard * 100).toFixed(0);
      lines.push(`- 🔀 **공유 주제**: ${topics} _(본문 중복 ${cjPct}%)_`);
      lines.push(` - A: \`${inlineText(p.a.source)}\` · ${inlineText(p.a.title || '') || TITLE_FALLBACK}`);
      lines.push(` > ${shortExcerpt(p.a.content, opts.excerptLength)}`);
      lines.push(` - B: \`${inlineText(p.b.source)}\` · ${inlineText(p.b.title || '') || TITLE_FALLBACK}`);
      lines.push(` > ${shortExcerpt(p.b.content, opts.excerptLength)}`);
    }
    lines.push('');
  }

  lines.push('[지침]');
  lines.push('1. 답변에 위 출처 중 하나라도 사용한다면, 충돌 가능성을 명시 (예: "출처 A 는 X 라 하나 출처 B 는 Y").');
  lines.push('2. 어느 쪽이 옳다고 단정하지 말고, 사용자가 판단할 수 있도록 근거를 분리해 제시.');
  lines.push('3. 충돌이 답변과 무관하면 무시 가능 — 다만 무관 판단 자체도 한 줄로 기록.');
  lines.push('[/CONFLICT WARNINGS]');

  return lines.join('\n');
}