// connectai/src/retrieval/lessonHelpers.ts

/**
 * ============================================================
 * Lesson / Experience Memory — pure helpers (no vscode dependency)
 *
 * "Lesson" = a markdown file in the active brain that captures a past mistake/risk and how to avoid
 * repeating it. Identified by a `lessons/` / `playbooks/` / `qa-findings/` path segment, or by
 * frontmatter `type: lesson|playbook|qa-finding`. These are retrieved like any other brain file but
 * boosted and injected as a prominent "verify before finalizing" checklist (see EXPERIENCE_MEMORY_PLAN.md).
 * ============================================================
 */

import { tokenize } from './scoring';

/**
 * Matches a relative path that contains a lesson-like path segment: `lesson(s)`, `playbook(s)` or
 * `qa-finding(s)` (the separator after `qa` may be `-`, `_` or absent). Case-insensitive, and both
 * `/` and `\` are accepted as path separators. Capture group 2 holds the matched segment name.
 */
export const LESSON_DIR_RE = /(^|[\\/])(lessons?|playbooks?|qa[-_]?findings?)([\\/]|$)/i;

/** The kinds of lesson-like brain file this module distinguishes. */
export type LessonKind = 'lesson' | 'playbook' | 'qa-finding';

/**
 * Classify a brain file as a lesson card and report which kind it is. Only the relative path and
 * the frontmatter at the top of `content` (if any) are consulted.
 *
 * @param relativePath path of the file relative to the brain root; may be empty.
 * @param content file text; only its leading frontmatter block is inspected.
 * @returns the lesson kind, or '' when the file is an ordinary note.
 */
export function detectLessonKind(relativePath: string, content: string): LessonKind | '' {
    // A recognised frontmatter `type:` takes precedence over anything the path suggests.
    const knownKinds: readonly LessonKind[] = ['lesson', 'playbook', 'qa-finding'];
    const declaredType = parseFrontmatterType(content);
    const declaredKind = knownKinds.find((kind) => kind === declaredType);
    if (declaredKind) return declaredKind;

    // No usable frontmatter type: fall back to the directory name found in the path.
    const hit = (relativePath || '').match(LESSON_DIR_RE);
    if (hit === null) return '';
    const segment = hit[2].toLowerCase();
    return segment.startsWith('playbook')
        ? 'playbook'
        : segment.startsWith('qa')
            ? 'qa-finding'
            : 'lesson';
}

/**
 * Read the `type:` value from a leading `--- ... ---` frontmatter block.
 *
 * Only the first 800 characters of `content` are inspected, so the closing `---` fence must fall
 * inside that window. A byte-order mark (U+FEFF) before the opening fence is tolerated.
 *
 * @param content full text of the file (may be empty).
 * @returns the lower-cased type value, or '' when there is no frontmatter or no `type:` key.
 */
function parseFrontmatterType(content: string): string {
    if (!content) return '';
    const head = content.slice(0, 800);
    // `\uFEFF?` allows an optional BOM before the fence. A quantifier placed directly after `^`
    // is not a valid pattern, so the BOM has to be spelled out explicitly.
    if (!/^\uFEFF?---\s*\n/.test(head)) return '';
    // Start at 4 so the opening fence itself is never taken for the closing one.
    const end = head.indexOf('\n---', 4);
    if (end < 0) return '';
    const block = head.slice(0, end);
    const m = block.match(/^\s*type\s*:\s*["']?([a-zA-Z-]+)["']?\s*$/m);
    return m ? m[1].toLowerCase() : '';
}

/**
 * Return the body of one markdown section: the text between the heading matched by `headingRe`
 * and the next heading of any level (or the end of the document).
 *
 * @param content markdown text of the lesson card.
 * @param headingRe pattern that matches the whole heading line of the wanted section.
 * @returns the trimmed section body, or '' when the heading does not occur.
 */
function extractSection(content: string, headingRe: RegExp): string {
    const heading = content.match(headingRe);
    if (heading === null || heading.index === undefined) return '';
    const tail = content.slice(heading.index + heading[0].length);
    // The section ends where the next `#`-style heading line begins.
    const nextHeadingAt = tail.search(/\n#{1,6}\s/);
    const body = nextHeadingAt < 0 ? tail : tail.slice(0, nextHeadingAt);
    return body.trim();
}

/**
 * Reduce a lesson card to the sections that matter as guardrails — Mistake / Risk, Root Cause,
 * Fix and Prevention Checklist — and drop everything else (Situation, Applies To, narrative).
 * Each kept section is emitted under a canonical `##` heading so the structure stays visible.
 *
 * Sections whose whole body is a single `<...>` template placeholder are skipped. When no
 * recognised section survives, the original content is returned instead, so cards that do not
 * follow the current template still produce something. Either result is clipped to `maxLen`
 * characters, with a trailing `…` line marking the cut.
 *
 * Purpose: keep the retrieval context small without losing the actionable part of a card.
 *
 * @param content markdown text of the lesson card.
 * @param maxLen maximum length of the returned text before the ellipsis marker is appended.
 * @returns the condensed card, or '' for empty input.
 */
export function extractLessonEssence(content: string, maxLen = 1200): string {
    if (!content) return '';

    const clip = (text: string): string => text.slice(0, maxLen).trim() + '\n…';

    // Canonical output heading paired with the pattern that finds the section in the card.
    const wanted: ReadonlyArray<readonly [string, RegExp]> = [
        ['## Mistake / Risk', /^#{1,6}\s*(?:mistake\s*\/?\s*risk|mistake|risk|실수|문제)\s*$/im],
        ['## Root Cause', /^#{1,6}\s*(?:root\s*cause|근본\s*원인|원인)\s*$/im],
        ['## Fix', /^#{1,6}\s*(?:fix|해결|수정)\s*$/im],
        ['## Prevention Checklist', /^#{1,6}\s*(?:prevention\s*checklist|prevention|체크리스트|예방\s*체크리스트)\s*$/im],
    ];

    const kept: string[] = [];
    for (const [heading, pattern] of wanted) {
        const body = extractSection(content, pattern);
        const isPlaceholder = /^<[^>]+>$/.test(body);
        if (body !== '' && !isPlaceholder) kept.push(`${heading}\n${body}`);
    }

    if (kept.length === 0) {
        // Card does not follow the template: fall back to the raw text.
        return content.length > maxLen ? clip(content) : content.trim();
    }

    const assembled = kept.join('\n\n');
    return assembled.length > maxLen ? clip(assembled) : assembled;
}

/**
 * Extract the bullet items of a card's "Prevention Checklist" section.
 *
 * The section runs from its heading to the next heading of any level. Only lines that start with
 * a `-` or `*` bullet are returned; the bullet marker is removed. A task-list checkbox right after
 * the bullet (`[ ]`, `[x]`, `[X]`) is removed as well, so callers that re-render the items as
 * `- [ ] item` do not end up with a doubled checkbox.
 *
 * @param content markdown text of the lesson card.
 * @returns the checklist items in document order; empty when the section is missing or has no bullets.
 */
export function extractPreventionChecklist(content: string): string[] {
    if (!content) return [];
    const m = content.match(/^#{1,6}\s*(?:prevention\s*checklist|prevention|체크리스트|예방\s*체크리스트)\s*$/im);
    if (!m || m.index === undefined) return [];
    const after = content.slice(m.index + m[0].length);
    // Stop at the next heading.
    const stop = after.search(/\n#{1,6}\s/);
    const section = stop >= 0 ? after.slice(0, stop) : after;
    // Bullet marker, then an optional task-list checkbox that is followed by whitespace or end of line.
    const bulletPrefix = /^[-*]\s+(?:\[[ xX]\](?:\s+|$))?/;
    return section
        .split('\n')
        .map((l) => l.trim())
        .filter((l) => /^[-*]\s+/.test(l))
        .map((l) => l.replace(bulletPrefix, '').trim())
        .filter(Boolean);
}

/** Minimal view of a retrieved lesson card, as consumed by `buildLessonChecklistBlock`. */
export interface LessonChunkLite {
    /** Relative path or display title; rendered as the card's `###` heading. */
    title: string;
    /** Excerpt or full text of the card. */
    content: string;
}

/**
 * Render the "active lessons" prompt block that is placed ahead of the regular RAG context.
 * Each card becomes a `###` section: its Prevention Checklist as `- [ ]` items when one can be
 * parsed, otherwise the trimmed card text.
 *
 * @param chunks lesson cards selected for this turn.
 * @returns the complete block, or '' when there are no cards.
 */
export function buildLessonChecklistBlock(chunks: LessonChunkLite[]): string {
    if (!chunks || chunks.length === 0) return '';

    const renderCard = (card: LessonChunkLite): string => {
        const items = extractPreventionChecklist(card.content);
        const body = items.length === 0
            ? card.content.trim()
            : items.map((item) => `- [ ] ${item}`).join('\n');
        return `### ${card.title}\n${body}`;
    };

    const preamble = [
        '[⚠ ACTIVE LESSONS — verify these BEFORE finalizing your answer]',
        'These are recorded lessons from past work on this project. Read them first and make sure you are NOT',
        'about to repeat any of the mistakes / skip any of the precautions below. If a checklist item is relevant',
        'to the current request, explicitly confirm it in your answer. If a lesson conflicts with the user, prefer',
        'the user but flag the conflict.',
    ];
    const cards = chunks.map(renderCard).join('\n\n');

    return [...preamble, '', cards, '', '[END ACTIVE LESSONS]'].join('\n');
}

/**
 * Build a starter lesson card (frontmatter plus section skeleton) for the user to fill in, as
 * written by the `g1nation.lesson.create` / `…fromConversation` commands.
 *
 * The title is flattened to a single line: every line break (`\n`, `\r\n` or a lone `\r`) becomes
 * a space, so it cannot split the `title:` frontmatter line or the H1 heading. A title that is
 * empty or only whitespace falls back to "Untitled lesson".
 *
 * @param title lesson title; may be empty.
 * @param today date string written verbatim to the `last-seen:` field.
 * @param situation optional text (e.g. captured from the recent chat turn) that pre-fills the
 *   Situation section; when absent or blank a placeholder is used.
 * @returns the card as markdown, ending with a newline.
 */
export function lessonTemplate(title: string, today: string, situation?: string): string {
    const safeTitle = (title || '').replace(/\r\n?|\n/g, ' ').trim() || 'Untitled lesson';
    const situationBody = (situation && situation.trim()) ? situation.trim() : '<무슨 작업/맥락이었는지>';
    return [
        '---',
        'type: lesson',
        `title: ${safeTitle}`,
        'applies-to: []',
        'severity: medium',
        'source: curated',
        'occurrences: 1',
        `last-seen: ${today}`,
        '---',
        '',
        `# Lesson: ${safeTitle}`,
        '',
        '## Situation',
        situationBody,
        '',
        '## Mistake / Risk',
        '<무엇이 잘못됐거나 위험했는지>',
        '',
        '## Root Cause',
        '<왜 그렇게 됐는지 — 표면 증상이 아니라 근본 원인>',
        '',
        '## Fix',
        '<어떻게 고쳤는지>',
        '',
        '## Prevention Checklist',
        '- <다음에 비슷한 작업을 할 때 반드시 확인할 것>',
        '- ',
        '',
        '## Applies To',
        '- <태그: 기능/영역 이름>',
        '',
    ].join('\n');
}

/**
 * Turn a lesson title into a filesystem-safe slug: lower-cased, with every run of characters
 * outside `a-z`, `0-9` and Hangul syllables collapsed to a single `-`, cut to 60 characters and
 * stripped of leading/trailing dashes. Falls back to "lesson" when nothing usable remains.
 */
export function lessonSlug(title: string): string {
    const fallback = 'lesson';
    const dashed = (title || fallback).toLowerCase().replace(/[^a-z0-9가-힣]+/g, '-');
    // Cut first, then strip: the cut itself may leave a dash at the end.
    const clipped = dashed.slice(0, 60);
    const slug = clipped.replace(/^-+|-+$/g, '');
    return slug === '' ? fallback : slug;
}

// ── QA-feedback (regression complaint) detection ─────────────────────────────

/**
 * Patterns (Korean and English) for "it broke again / same mistake / why does this keep
 * happening" style complaints. A message matching any one of them counts as regression feedback.
 */
const QA_REGRESSION_PATTERNS: RegExp[] = [
    /또\s*(안\s*돼|안되|이래|발생|터졌|깨졌|망가졌)/,
    /(다시|또)\s*같은\s*(실수|문제|버그|에러|오류)/,
    /(비슷한|똑같은)\s*(실수|문제|버그|이슈|패턴)/,
    /왜\s*(자꾸|계속|반복|또)/,
    /(고쳤는데|수정했는데|패치했는데|바꿨는데)\s*(또|다시|여전히|아직).{0,20}(안|깨|망|문제|에러|오류|실패|broke|broken)/i,
    /(여전히|아직도)\s*(안\s*돼|안되|버그|깨|문제|실패)/,
    /regress(ion|ed)?/i,
    /\b(broke|broken|failing|still\s+broken|same\s+(bug|mistake|issue|error)|again)\b.{0,40}\b(again|still|repeat|recurr)/i,
    /\bwhy\b.{0,30}\b(keep|again|repeatedly|recurr)/i,
];

/**
 * Heuristic: does this user message read like a complaint that something broke again or that a
 * mistake was repeated? If so, the host offers to record a lesson. Deliberately conservative — a
 * false positive only shows a dismissible prompt, but the aim is not to nag.
 *
 * @param prompt the user's message.
 * @returns true when the trimmed message is 4–4000 characters long and matches a regression pattern.
 */
export function isQaRegressionFeedback(prompt: string): boolean {
    const text = (prompt || '').trim();
    const plausibleLength = text.length >= 4 && text.length <= 4000;
    if (!plausibleLength) return false;
    for (const pattern of QA_REGRESSION_PATTERNS) {
        if (pattern.test(text)) return true;
    }
    return false;
}

// ── Lesson frontmatter parse / occurrences bump (for dedup-merge) ────────────

/** The subset of lesson frontmatter fields this module reads. Every field is optional. */
export interface LessonFrontmatter {
    /** Lower-cased `type:` value. */
    type?: string;
    /** `title:` value with surrounding quotes removed. */
    title?: string;
    /** `occurrences:` as a number; undefined when missing, blank or not numeric. */
    occurrences?: number;
    /** Entries of the inline `applies-to: [a, b]` list; `[]` for an empty list. */
    appliesTo?: string[];
}

/**
 * Parse the leading `--- ... ---` frontmatter block of a lesson card.
 *
 * Only the first 2000 characters are inspected, so the closing fence must fall inside that
 * window. A byte-order mark (U+FEFF) before the opening fence is tolerated. Values are read
 * strictly from the key's own line: a key with nothing after the colon is treated as absent
 * rather than picking up the following line.
 *
 * @param content full text of the card.
 * @returns the recognised fields, or {} when there is no frontmatter.
 */
export function parseLessonFrontmatter(content: string): LessonFrontmatter {
    if (!content) return {};
    const head = content.slice(0, 2000);
    // `\uFEFF?` allows an optional BOM; a quantifier directly after `^` is not a valid pattern.
    if (!/^\uFEFF?---\s*\n/.test(head)) return {};
    // Start at 4 so the opening fence itself is never taken for the closing one.
    const end = head.indexOf('\n---', 4);
    if (end < 0) return {};
    const block = head.slice(0, end);

    const get = (key: string): string | undefined => {
        // `[ \t]*` (not `\s*`) keeps the match on one line; `\S` requires a real value to be present.
        const m = block.match(new RegExp(`^[ \\t]*${key}[ \\t]*:[ \\t]*(\\S.*?)[ \\t]*$`, 'm'));
        return m ? m[1].replace(/^["']|["']$/g, '').trim() : undefined;
    };

    const tags = get('applies-to');
    let appliesTo: string[] | undefined;
    if (tags) {
        const inner = tags.replace(/^\[|\]$/g, '').trim();
        appliesTo = inner
            ? inner.split(',').map((s) => s.trim().replace(/^["']|["']$/g, '').trim()).filter(Boolean)
            : [];
    }

    // An empty (e.g. `""`) value must not be read as 0, which is what Number('') would give.
    const occRaw = get('occurrences');
    const occNum = occRaw ? Number(occRaw) : NaN;

    return {
        type: get('type')?.toLowerCase(),
        title: get('title'),
        occurrences: Number.isFinite(occNum) ? occNum : undefined,
        appliesTo,
    };
}

/**
 * Canonical form of a lesson title for equality comparison: lower-cased, with everything except
 * `a-z`, `0-9` and Hangul syllables removed (so punctuation and whitespace never matter).
 */
export function normalizeLessonTitle(title: string): string {
    if (!title) return '';
    const lowered = title.toLowerCase();
    return lowered.replace(/[^a-z0-9가-힣]+/g, '');
}

/**
 * Return `content` with the frontmatter's `occurrences:` incremented by 1 and `last-seen:` set to
 * `today`. A missing, blank or non-numeric `occurrences` counts as 1 (so it becomes 2). Keys that
 * are absent are appended as the last lines of the frontmatter block, using the file's own line
 * ending. If there is no frontmatter at all, `content` is returned unchanged (caller decides what
 * to do). A byte-order mark (U+FEFF) before the opening fence is tolerated.
 *
 * @param content full text of the lesson card.
 * @param today date string written verbatim as the new `last-seen:` value.
 * @returns the updated card text.
 */
export function bumpLessonOccurrences(content: string, today: string): string {
    // `\uFEFF?` allows an optional BOM; a quantifier directly after `^` is not a valid pattern.
    if (!content || !/^\uFEFF?---\s*\n/.test(content)) return content;
    // Start at 4 so the opening fence itself is never taken for the closing one.
    const end = content.indexOf('\n---', 4);
    if (end < 0) return content;
    let block = content.slice(0, end);
    const rest = content.slice(end); // begins with the "\n---" of the closing fence

    // In a CRLF file `block` ends with the "\r" of the last line; keep new lines CRLF as well.
    const appendLine = (text: string, line: string): string =>
        text.endsWith('\r') ? `${text}\n${line}\r` : `${text}\n${line}`;

    // `[ \t]*` (not `\s*`) keeps each match on its own line, so an empty value never swallows the
    // next key. Replacer functions are used so `$` sequences in the inserted text stay literal.
    const occurrencesLine = /^([ \t]*occurrences[ \t]*:)([ \t]*)(.*)$/m;
    if (occurrencesLine.test(block)) {
        block = block.replace(occurrencesLine, (_line: string, key: string, gap: string, raw: string) => {
            const value = raw.trim().replace(/^["']|["']$/g, '').trim();
            const parsed = value === '' ? NaN : Number(value);
            const current = Number.isFinite(parsed) ? parsed : 1;
            return `${key}${gap || ' '}${current + 1}`;
        });
    } else {
        block = appendLine(block, 'occurrences: 2');
    }

    const lastSeenLine = /^([ \t]*last-seen[ \t]*:)([ \t]*).*$/m;
    if (lastSeenLine.test(block)) {
        block = block.replace(lastSeenLine, (_line: string, key: string, gap: string) => `${key}${gap || ' '}${today}`);
    } else {
        block = appendLine(block, `last-seen: ${today}`);
    }
    return block + rest;
}

// ── Post-answer checklist coverage (non-blocking flag) ──────────────────────

/**
 * The distinct tokens of a checklist item that are at least two characters long. An item that
 * starts with `<` is treated as an unfilled template placeholder and yields no terms.
 */
function checklistItemTerms(item: string): string[] {
    const isPlaceholder = item.trim().startsWith('<'); // e.g. "<다음에 확인할 것>"
    if (isPlaceholder) return [];
    const distinct = new Set(tokenize(item));
    return [...distinct].filter((term) => term.length >= 2);
}

/**
 * Find Prevention-Checklist items, across the lesson cards injected this turn, that the
 * assistant's answer does not visibly address — meaning none of the item's significant terms
 * occur among the answer's tokens. Conservative on purpose: items with fewer than two significant
 * terms (including placeholders) are never flagged, and each item is flagged at most once.
 *
 * @param answer the assistant's answer text.
 * @param lessonContents text of each lesson card injected this turn.
 * @param max stop once this many items have been collected, to keep the footer short.
 * @returns the unaddressed items, in the order encountered.
 */
export function findUnaddressedChecklistItems(answer: string, lessonContents: string[], max = 3): string[] {
    const nothingToCheck = !answer || !lessonContents || lessonContents.length === 0;
    if (nothingToCheck) return [];

    const answerVocabulary = new Set(tokenize(answer));
    const flagged: string[] = [];
    const flaggedKeys = new Set<string>();

    for (const card of lessonContents) {
        const items = extractPreventionChecklist(card);
        for (const item of items) {
            const dedupKey = normalizeLessonTitle(item);
            if (dedupKey === '' || flaggedKeys.has(dedupKey)) continue;

            const significant = checklistItemTerms(item);
            if (significant.length < 2) continue; // too vague to judge

            const mentioned = significant.some((term) => answerVocabulary.has(term));
            if (mentioned) continue;

            flagged.push(item);
            flaggedKeys.add(dedupKey);
            if (flagged.length >= max) return flagged;
        }
    }
    return flagged;
}