// connectai/src/retrieval/conflictBlock.ts

/**
 * Conflict Surface — [CONFLICT WARNINGS] 시스템 프롬프트 블록 생성.
 *
 * 기존 scoring.ts 가 문서당 conflictSeverity(NONE/LOW/MEDIUM/HIGH) 를 *이미*
 * 계산하지만(반대/논란/vs 등 indicator 단어 카운트), LLM 은 그 사실을 모름.
 * buildAstraModeSystemPrompt 의 v4 정책 텍스트는 이미 "[CONFLICT WARNING] 플래그"
 * 를 *언급*하나, 실제 어떤 문서가 충돌인지 LLM 에게 *전달되지 않음* — 정책이
 * 명시되어 있지만 데이터가 없어 무용한 상태.
 *
 * 이 모듈이 그 갭을 메움:
 *   1. 자기-신호(self-flag)  — chunk.metadata.conflictSeverity ≥ threshold
 *   2. 교차-문서 발산(cross-divergence) — 같은 주제 2 chunks, Jaccard < 임계
 *
 * 둘을 합쳐 마크다운 블록 한 개로. 결과가 비면 빈 문자열 반환 — 호출자가
 * 안전하게 무조건 join 가능.
 */

import { RetrievalChunk, ConflictSeverity } from './types';
import { tokenize } from './scoring';

/**
 * User-configurable minimum severity for surfacing self-flagged chunks:
 * 'low' surfaces LOW and above, 'medium' surfaces MEDIUM and above,
 * 'high' surfaces HIGH only.
 */
export type ConflictThresholdSetting = 'low' | 'medium' | 'high';

/** Options controlling which conflicts are surfaced and how they are rendered. */
export interface ConflictBlockOptions {
    /** Minimum severity a chunk must carry to be surfaced as a self-flag. Default: 'medium'. */
    selfFlagThreshold: ConflictThresholdSetting;
    /** Whether cross-document divergence detection runs. Default: true. */
    crossDivergenceEnabled: boolean;
    /** Maximum number of entries shown in each section (self-flag and cross-divergence separately). Default: 5. */
    maxPerSection: number;
    /** Maximum length of a chunk excerpt, in characters, before it is cut and suffixed with an ellipsis. Default: 220. */
    excerptLength: number;
}

/** Baseline option values; buildConflictWarningsBlock layers caller-supplied options on top of these. */
const DEFAULT_OPTIONS: ConflictBlockOptions = {
    selfFlagThreshold: 'medium',
    crossDivergenceEnabled: true,
    maxPerSection: 5,
    excerptLength: 220,
};

/**
 * Converts a conflict severity into a comparable integer.
 *
 * @param s Severity attached to a chunk, or undefined when none was computed.
 * @returns 1 for 'LOW', 2 for 'MEDIUM', 3 for 'HIGH', and 0 for anything else
 *          (including 'NONE' and undefined).
 */
function severityRank(s: ConflictSeverity | undefined): number {
    // Listed from least to most severe: a severity's rank is its position + 1.
    // indexOf yields -1 for unlisted values, which lands on rank 0.
    const ascending: ReadonlyArray<ConflictSeverity | undefined> = ['LOW', 'MEDIUM', 'HIGH'];
    return ascending.indexOf(s) + 1;
}

/**
 * Converts a user-facing threshold setting into the same integer scale that
 * severityRank uses, so the two can be compared with `>=`.
 *
 * @param t Threshold setting chosen by the user.
 * @returns 1 for 'low', 2 for 'medium', 3 for 'high'. Any other runtime value
 *          (e.g. a malformed or missing setting) falls back to the 'medium' rank.
 */
function thresholdRank(t: ConflictThresholdSetting): number {
    switch (t) {
        case 'high': return 3;
        case 'medium': return 2;
        case 'low': return 1;
        default:
            // Without this branch an unrecognized value returned undefined, which
            // makes every `rank >= threshold` comparison false and silently hides
            // all self-flagged chunks. Fall back to the default setting, 'medium'.
            return 2;
    }
}

/**
 * Picks the marker emoji shown next to a self-flagged chunk.
 *
 * @param s Severity attached to a chunk, or undefined when none was computed.
 * @returns A coloured circle for 'HIGH' / 'MEDIUM' / 'LOW', and a white circle
 *          for every other value.
 */
function severityEmoji(s: ConflictSeverity | undefined): string {
    // NOTE(review): LOW is orange and MEDIUM is yellow; confirm this colour
    // ordering is intentional before relying on it as a visual severity scale.
    if (s === 'HIGH') return '🔴';
    if (s === 'MEDIUM') return '🟡';
    if (s === 'LOW') return '🟠';
    return '⚪';
}

/**
 * Produces a single-line preview of `text`.
 *
 * Runs of whitespace (including newlines) are collapsed to one space and the
 * result is trimmed. If the cleaned text is longer than `n` UTF-16 code units
 * it is cut and suffixed with '…'.
 *
 * @param text Raw chunk text; empty/nullish input yields ''.
 * @param n Maximum number of code units kept before the ellipsis. Negative
 *          values are treated as 0. A missing or NaN limit disables truncation.
 * @returns The cleaned text, truncated if necessary.
 */
function shortExcerpt(text: string, n: number): string {
    if (!text) return '';
    const cleaned = text.replace(/\s+/g, ' ').trim();

    // No usable limit, or already short enough: return the cleaned text untouched.
    if (typeof n !== 'number' || Number.isNaN(n) || cleaned.length <= n) return cleaned;

    // A negative end index would make slice() count from the end of the string,
    // so clamp to zero instead.
    let cut = Math.max(0, Math.trunc(n));

    // Do not cut between the two halves of a surrogate pair (e.g. an emoji):
    // that would leave a lone high surrogate in the output.
    if (cut > 0) {
        const before = cleaned.charCodeAt(cut - 1);
        const after = cleaned.charCodeAt(cut);
        const splitsPair =
            before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
        if (splitsPair) cut -= 1;
    }

    return cleaned.slice(0, cut) + '…';
}

/**
 * Jaccard similarity of two token sets: |a ∩ b| / |a ∪ b|.
 *
 * @returns A value in [0, 1]; 0 when either set is empty.
 */
function jaccard(a: Set<string>, b: Set<string>): number {
    const bothPopulated = a.size > 0 && b.size > 0;
    if (!bothPopulated) return 0;

    let common = 0;
    a.forEach((token) => {
        if (b.has(token)) common += 1;
    });

    // |a ∪ b| = |a| + |b| − |a ∩ b|; it is at least 1 here because both sets are non-empty.
    return common / (a.size + b.size - common);
}

/**
 * A candidate cross-document divergence: two chunks whose titles share topic
 * tokens but whose bodies have little token overlap. Produced by
 * findCrossDivergence.
 *
 * Heuristic applied by findCrossDivergence:
 *  1. Build a "topic key" per chunk from its title tokens (tokens of length >= 2,
 *     first 8 kept).
 *  2. A pair is a candidate when the two topic keys share at least 2 tokens.
 *  3. The pair counts as divergent when the Jaccard similarity of the body
 *     tokens is below the threshold (0.30 by default) — same topic, different content.
 *  4. score = (number of shared topic tokens) × (1 − Jaccard), so larger
 *     divergence ranks first.
 *
 * One chunk may appear in several pairs. findCrossDivergence returns every pair
 * sorted by score; the caller keeps only the top N.
 */
interface DivergencePair {
    /** The chunk of the pair that appears earlier in the input list. */
    a: RetrievalChunk;
    /** The chunk of the pair that appears later in the input list. */
    b: RetrievalChunk;
    /** Title tokens present in both chunks' topic keys. */
    sharedTopicTokens: string[];
    /** Jaccard similarity of the two chunks' body token sets (0..1). */
    contentJaccard: number;
    /** Ranking score: sharedTopicTokens.length × (1 − contentJaccard). */
    score: number;
}

/**
 * Finds pairs of chunks that appear to cover the same topic (overlapping title
 * tokens) while their bodies share few tokens — a hint of contradictory content.
 *
 * Heuristic:
 *  1. Topic key per chunk: its title tokens of length >= 2, first 8 kept.
 *  2. A pair is a candidate when the two topic keys share at least 2 tokens.
 *  3. The pair is divergent when the Jaccard similarity of the body token sets
 *     is strictly below `topicJaccardMax`.
 *  4. score = (shared topic tokens) × (1 − Jaccard).
 *
 * Chunks whose body yields no tokens are ignored: jaccard() reports 0 for an
 * empty set, which would otherwise rank an empty chunk as the most divergent
 * match for every same-titled chunk.
 *
 * NOTE(review): pairs are not filtered by `source`, so two chunks of the same
 * document can be reported as a "cross-document" divergence — confirm whether
 * `source` identifies a document and, if so, whether such pairs should be skipped.
 *
 * @param chunks Retrieved chunks to compare pairwise (O(n²) comparisons).
 * @param topicJaccardMax Exclusive upper bound on the *body* Jaccard similarity
 *        for a pair to count as divergent (despite the name, it is not applied
 *        to the title tokens). Default 0.30.
 * @returns Every divergent pair, sorted by descending score. Callers truncate.
 */
function findCrossDivergence(chunks: RetrievalChunk[], topicJaccardMax: number = 0.30): DivergencePair[] {
    if (chunks.length < 2) return [];

    // Tokenize each chunk once, before the pairwise comparison.
    const profiles = chunks.map((chunk) => ({
        chunk,
        topic: new Set<string>(
            tokenize(chunk.title || '').filter((t) => t.length >= 2).slice(0, 8),
        ),
        body: new Set<string>(tokenize(chunk.content || '')),
    }));

    const pairs: DivergencePair[] = [];
    for (let i = 0; i < profiles.length; i++) {
        const left = profiles[i];
        // An empty body cannot contradict anything; skip it entirely.
        if (left.body.size === 0) continue;

        for (let j = i + 1; j < profiles.length; j++) {
            const right = profiles[j];
            if (right.body.size === 0) continue;

            // 1. Same topic: at least two shared title tokens.
            const shared = [...left.topic].filter((t) => right.topic.has(t));
            if (shared.length < 2) continue;

            // 2. Divergent bodies: token overlap below the threshold.
            const cj = jaccard(left.body, right.body);
            if (cj >= topicJaccardMax) continue;

            pairs.push({
                a: left.chunk,
                b: right.chunk,
                sharedTopicTokens: shared,
                contentJaccard: cj,
                score: shared.length * (1 - cj),
            });
        }
    }

    // Highest score (most shared topic tokens, least body overlap) first.
    pairs.sort((p, q) => q.score - p.score);
    return pairs;
}

/**
 * Builds the [CONFLICT WARNINGS] block for the system prompt.
 *
 * Two sections are rendered when they have entries:
 *  - self-flag: chunks whose `metadata.conflictSeverity` meets the configured
 *    threshold, most severe first;
 *  - cross-document divergence: pairs reported by findCrossDivergence.
 * Each section is capped at `maxPerSection` entries.
 *
 * @param chunks Retrieved chunks; null/empty input yields ''.
 * @param options Overrides for the defaults. Options that are omitted, or
 *        explicitly set to undefined/null, fall back to DEFAULT_OPTIONS.
 * @returns The block as a newline-joined string without a trailing newline,
 *          or '' when there is nothing to report.
 */
export function buildConflictWarningsBlock(
    chunks: RetrievalChunk[],
    options: Partial<ConflictBlockOptions> = {},
): string {
    if (!chunks || chunks.length === 0) return '';

    // Resolve each option individually. A plain object spread would let an
    // explicit `undefined` (common when forwarding optional settings) overwrite
    // the default, producing an unlimited slice or a NaN threshold comparison.
    const given: Partial<ConflictBlockOptions> = options ?? {};
    const opts: ConflictBlockOptions = {
        selfFlagThreshold: given.selfFlagThreshold ?? DEFAULT_OPTIONS.selfFlagThreshold,
        crossDivergenceEnabled: given.crossDivergenceEnabled ?? DEFAULT_OPTIONS.crossDivergenceEnabled,
        maxPerSection: given.maxPerSection ?? DEFAULT_OPTIONS.maxPerSection,
        excerptLength: given.excerptLength ?? DEFAULT_OPTIONS.excerptLength,
    };

    // A negative slice end would drop items from the tail instead of limiting
    // the count, so clamp the cap to a non-negative integer.
    const sectionLimit = Math.max(0, Math.floor(opts.maxPerSection));

    // ─── Section 1: self-flag ───
    const threshold = thresholdRank(opts.selfFlagThreshold);
    const selfFlagged = chunks
        .filter((c) => severityRank(c.metadata?.conflictSeverity) >= threshold)
        .sort((a, b) => severityRank(b.metadata?.conflictSeverity) - severityRank(a.metadata?.conflictSeverity))
        .slice(0, sectionLimit);

    // ─── Section 2: cross-document divergence ───
    const divergence = opts.crossDivergenceEnabled
        ? findCrossDivergence(chunks).slice(0, sectionLimit)
        : [];

    if (selfFlagged.length === 0 && divergence.length === 0) return '';

    const lines: string[] = [];
    lines.push('[CONFLICT WARNINGS]');
    lines.push('다음 검색된 출처에서 충돌 신호 감지. 단일 결론을 강요하지 말고, 상충되는 관점을 명시하고 사용자 판단에 위임할 것.');
    lines.push('');

    if (selfFlagged.length > 0) {
        lines.push('## 자기-신호 (출처 내부에서 충돌/논란 키워드 감지)');
        for (const c of selfFlagged) {
            const sev = c.metadata?.conflictSeverity || 'NONE';
            const emoji = severityEmoji(sev);
            const src = c.source;
            const title = c.title || '(제목 없음)';
            lines.push(`- ${emoji} **[${sev}]** \`${src}\` · ${title}`);
            lines.push(`  > ${shortExcerpt(c.content, opts.excerptLength)}`);
        }
        lines.push('');
    }

    if (divergence.length > 0) {
        lines.push('## 교차-문서 발산 (같은 주제·다른 내용 ─ 잠재적 모순)');
        for (const p of divergence) {
            // Only the first five shared tokens are shown to keep the line short.
            const topics = p.sharedTopicTokens.slice(0, 5).join(' · ');
            const cjPct = (p.contentJaccard * 100).toFixed(0);
            lines.push(`- 🔀 **공유 주제**: ${topics} _(본문 중복 ${cjPct}%)_`);
            lines.push(`  - A: \`${p.a.source}\` · ${p.a.title || '(제목 없음)'}`);
            lines.push(`    > ${shortExcerpt(p.a.content, opts.excerptLength)}`);
            lines.push(`  - B: \`${p.b.source}\` · ${p.b.title || '(제목 없음)'}`);
            lines.push(`    > ${shortExcerpt(p.b.content, opts.excerptLength)}`);
        }
        lines.push('');
    }

    lines.push('[지침]');
    lines.push('1. 답변에 위 출처 중 하나라도 사용한다면, 충돌 가능성을 명시 (예: "출처 A 는 X 라 하나 출처 B 는 Y").');
    lines.push('2. 어느 쪽이 옳다고 단정하지 말고, 사용자가 판단할 수 있도록 근거를 분리해 제시.');
    lines.push('3. 충돌이 답변과 무관하면 무시 가능 — 다만 무관 판단 자체도 한 줄로 기록.');
    lines.push('[/CONFLICT WARNINGS]');

    return lines.join('\n');
}