Files
connectai/src/retrieval/lessonHelpers.ts
T

326 lines
14 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* ============================================================
* Lesson / Experience Memory — pure helpers (no vscode dependency)
*
* "Lesson" = a markdown file in the active brain that captures a past mistake/risk and how to avoid
* repeating it. Identified by a `lessons/` / `playbooks/` / `qa-findings/` path segment, or by
* frontmatter `type: lesson|playbook|qa-finding`. These are retrieved like any other brain file but
* boosted and injected as a prominent "verify before finalizing" checklist (see EXPERIENCE_MEMORY_PLAN.md).
* ============================================================
*/
import { tokenize } from './scoring';
/**
 * Path segments that mark a file as lesson-like.
 *
 * Matches a whole path segment (bounded by the start/end of the string or by a `/` or `\`
 * separator), case-insensitively, in singular or plural form: `lesson(s)`, `playbook(s)`,
 * `qa-finding(s)` / `qa_finding(s)` / `qafinding(s)`. Capture group 2 holds the matched segment.
 */
export const LESSON_DIR_RE = /(^|[\\/])(lessons?|playbooks?|qa[-_]?findings?)([\\/]|$)/i;
/** The canonical lesson categories a brain file can be classified as. */
export type LessonKind = 'lesson' | 'playbook' | 'qa-finding';
/**
 * Classify a brain file as a lesson, playbook, or QA finding.
 *
 * Only two inputs are consulted: the frontmatter `type:` value at the top of `content`
 * (which takes precedence when it names a known kind) and the path segments of `relativePath`.
 *
 * @param relativePath path of the file relative to the brain root; may be empty
 * @param content file text, used only for its leading frontmatter block
 * @returns the detected kind, or '' when the file is an ordinary note
 */
export function detectLessonKind(relativePath: string, content: string): LessonKind | '' {
  // An explicit frontmatter declaration beats anything inferred from the path.
  const declared = parseFrontmatterType(content);
  const knownKinds: LessonKind[] = ['lesson', 'playbook', 'qa-finding'];
  const viaFrontmatter = knownKinds.find((kind) => kind === declared);
  if (viaFrontmatter !== undefined) return viaFrontmatter;

  // No usable declaration: fall back to the directory naming convention.
  const match = LESSON_DIR_RE.exec(relativePath || '');
  if (match === null) return '';

  const segment = match[2].toLowerCase();
  if (segment.startsWith('qa')) return 'qa-finding';
  return segment.startsWith('playbook') ? 'playbook' : 'lesson';
}
/**
 * Pull the `type:` value out of a leading `--- ... ---` frontmatter block.
 *
 * Only the first 800 characters of `content` are inspected. An optional leading byte-order mark
 * (U+FEFF) is tolerated; it is written as an explicit escape so the pattern does not depend on
 * an invisible character surviving in the source file.
 *
 * @param content full file text (may be empty)
 * @returns the lower-cased `type:` value, or '' when there is no frontmatter or no `type:` key
 */
function parseFrontmatterType(content: string): string {
  if (!content) return '';
  const head = content.slice(0, 800);
  const open = head.match(/^\uFEFF?---\s*\n/);
  if (!open) return '';
  // Start at the newline that ends the opening fence so an empty block (`---\n---`) closes
  // immediately instead of running on to some later `---` line in the body.
  const end = head.indexOf('\n---', open[0].length - 1);
  if (end < 0) return '';
  const block = head.slice(0, end);
  const m = block.match(/^\s*type\s*:\s*["']?([a-zA-Z-]+)["']?\s*$/m);
  return m ? m[1].trim().toLowerCase() : '';
}
/**
 * Return the body of one markdown section of a lesson card.
 *
 * The section starts right after the first match of `headingRe` and runs up to (not including)
 * the next line that begins with one to six `#` characters followed by whitespace, or to the end
 * of `content` when no such line follows.
 *
 * @param content the full card text
 * @param headingRe pattern that matches the section's heading line
 * @returns the trimmed section body, or '' when the heading is not found
 */
function extractSection(content: string, headingRe: RegExp): string {
  const hit = content.match(headingRe);
  if (hit === null || hit.index === undefined) return '';

  const bodyStart = hit.index + hit[0].length;
  const tail = content.slice(bodyStart);
  const nextHeading = tail.search(/\n#{1,6}\s/);
  const body = nextHeading < 0 ? tail : tail.slice(0, nextHeading);
  return body.trim();
}
/**
 * Reduce a lesson card to the sections that matter for guardrails: Mistake / Risk, Root Cause,
 * Fix, and Prevention Checklist (English or Korean headings). Other sections are dropped.
 *
 * Each kept section is re-emitted under a canonical `## …` heading so the structure stays
 * visible. A section whose whole body is a single `<placeholder>` is skipped. When no recognised
 * section is found the original content is returned instead, which keeps older cards that do
 * not follow the template usable.
 *
 * Motivation (per the original author's note): full cards are large, while the essence sections
 * are usually short, so slimming substantially reduces retrieval tokens.
 *
 * @param content the full card text
 * @param maxLen character budget; longer output is cut and suffixed with an ellipsis line
 * @returns the slimmed (or clipped original) text, or '' for empty input
 */
export function extractLessonEssence(content: string, maxLen = 1200): string {
  if (!content) return '';

  // Enforce the budget; a truncated result is marked with a trailing "…" line.
  const clip = (text: string): string =>
    text.length > maxLen ? text.slice(0, maxLen).trim() + '\n…' : text.trim();

  const essence: Array<{ label: string; pattern: RegExp }> = [
    { label: '## Mistake / Risk', pattern: /^#{1,6}\s*(?:mistake\s*\/?\s*risk|mistake|risk|실수|문제)\s*$/im },
    { label: '## Root Cause', pattern: /^#{1,6}\s*(?:root\s*cause|근본\s*원인|원인)\s*$/im },
    { label: '## Fix', pattern: /^#{1,6}\s*(?:fix|해결|수정)\s*$/im },
    {
      label: '## Prevention Checklist',
      pattern: /^#{1,6}\s*(?:prevention\s*checklist|prevention|체크리스트|예방\s*체크리스트)\s*$/im,
    },
  ];

  const parts: string[] = [];
  for (const { label, pattern } of essence) {
    const body = extractSection(content, pattern);
    if (body === '') continue;
    // A body that is nothing but "<...>" is an unfilled template placeholder.
    if (/^<[^>]+>$/.test(body)) continue;
    parts.push(`${label}\n${body}`);
  }

  return clip(parts.length > 0 ? parts.join('\n\n') : content);
}
/**
 * Extract the bullet items of a lesson card's "Prevention Checklist" section.
 *
 * The section is located by its heading (English or Korean variants, any heading level) and ends
 * at the next heading line. Only lines starting with a `-` or `*` bullet are returned; the bullet
 * is removed. A leading markdown task-list checkbox (`[ ]`, `[x]`, `[X]`) is removed as well, so
 * cards written as task lists yield the same items as plain bullet lists and callers that
 * re-render items as `- [ ] …` do not end up with a doubled checkbox.
 *
 * @param content the full card text
 * @returns the non-empty checklist items in document order; [] when the section is absent
 */
export function extractPreventionChecklist(content: string): string[] {
  if (!content) return [];
  const m = content.match(/^#{1,6}\s*(?:prevention\s*checklist|prevention|체크리스트|예방\s*체크리스트)\s*$/im);
  if (!m || m.index === undefined) return [];
  const after = content.slice(m.index + m[0].length);
  // Stop at the next heading.
  const stop = after.search(/\n#{1,6}\s/);
  const section = stop >= 0 ? after.slice(0, stop) : after;
  return section
    .split('\n')
    .map((l) => l.trim())
    .filter((l) => /^[-*]\s+/.test(l))
    .map((l) =>
      l
        .replace(/^[-*]\s+/, '')
        // Drop a task-list checkbox; require whitespace/end after it so "[x](url)" links survive.
        .replace(/^\[[ xX]\](?:\s+|$)/, '')
        .trim(),
    )
    .filter(Boolean);
}
/** Minimal view of a retrieved lesson card, as consumed by `buildLessonChecklistBlock`. */
export interface LessonChunkLite {
/** Relative path / display title; rendered as the card's `###` heading. */
title: string;
/** Excerpt or full card text. */
content: string;
}
/**
 * Render the "active lessons" prompt block that is injected ahead of the regular RAG context.
 *
 * Each card becomes a `### <title>` section. When the card has a parseable Prevention Checklist
 * only that checklist is shown, as `- [ ]` items; otherwise the trimmed card text is shown.
 *
 * @param chunks the lesson cards selected for this turn
 * @returns the full block including header and footer lines, or '' when there are no cards
 */
export function buildLessonChecklistBlock(chunks: LessonChunkLite[]): string {
  if (!chunks?.length) return '';

  const cards = chunks.map((chunk) => {
    const items = extractPreventionChecklist(chunk.content);
    const body =
      items.length === 0
        ? chunk.content.trim()
        : items.map((item) => `- [ ] ${item}`).join('\n');
    return `### ${chunk.title}\n${body}`;
  });

  const preamble = [
    '[⚠ ACTIVE LESSONS — verify these BEFORE finalizing your answer]',
    'These are recorded lessons from past work on this project. Read them first and make sure you are NOT',
    'about to repeat any of the mistakes / skip any of the precautions below. If a checklist item is relevant',
    'to the current request, explicitly confirm it in your answer. If a lesson conflicts with the user, prefer',
    'the user but flag the conflict.',
  ];

  return [...preamble, '', cards.join('\n\n'), '', '[END ACTIVE LESSONS]'].join('\n');
}
/**
 * Build a starter lesson card (frontmatter plus section skeleton) for the user to fill in.
 * Per the original note, it is written by the `g1nation.lesson.create` / `…fromConversation`
 * commands.
 *
 * The title is flattened onto a single line because it is written into the one-line `title:`
 * frontmatter field and the `# Lesson:` heading. Every line break form (`\r\n`, `\r`, `\n`,
 * U+2028, U+2029) becomes a space. A missing, empty, or whitespace-only title falls back to
 * "Untitled lesson".
 *
 * @param title human-readable lesson title
 * @param today date string written verbatim to `last-seen:`
 * @param situation optional text (e.g. captured from the recent chat turn) that pre-fills the
 *   Situation section; a placeholder is used when it is missing or blank
 * @returns the markdown text of the new card
 */
export function lessonTemplate(title: string, today: string, situation?: string): string {
  const safeTitle =
    (title || '').replace(/\r\n|[\r\n\u2028\u2029]/g, ' ').trim() || 'Untitled lesson';
  const situationBody = (situation && situation.trim()) ? situation.trim() : '<무슨 작업/맥락이었는지>';
  return [
    '---',
    'type: lesson',
    `title: ${safeTitle}`,
    'applies-to: []',
    'severity: medium',
    'source: curated',
    'occurrences: 1',
    `last-seen: ${today}`,
    '---',
    '',
    `# Lesson: ${safeTitle}`,
    '',
    '## Situation',
    situationBody,
    '',
    '## Mistake / Risk',
    '<무엇이 잘못됐거나 위험했는지>',
    '',
    '## Root Cause',
    '<왜 그렇게 됐는지 — 표면 증상이 아니라 근본 원인>',
    '',
    '## Fix',
    '<어떻게 고쳤는지>',
    '',
    '## Prevention Checklist',
    '- <다음에 비슷한 작업을 할 때 반드시 확인할 것>',
    '- ',
    '',
    '## Applies To',
    '- <태그: 기능/영역 이름>',
    '',
  ].join('\n');
}
/**
 * Derive a filesystem-safe slug for a lesson filename.
 *
 * The title is lower-cased, every run of characters other than ASCII letters, digits, and
 * precomposed Hangul syllables becomes a single `-`, the result is cut to 60 characters, and
 * leading/trailing dashes are removed.
 *
 * @param title the lesson title; may be empty
 * @returns the slug, or 'lesson' when nothing usable remains
 */
export function lessonSlug(title: string): string {
  const source = title || 'lesson';
  const dashed = source.toLowerCase().replace(/[^a-z0-9가-힣]+/g, '-');
  // Cut first, then strip: the cut itself may leave a dash at the end.
  const slug = dashed.slice(0, 60).replace(/^-+|-+$/g, '');
  return slug === '' ? 'lesson' : slug;
}
// ── QA-feedback (regression complaint) detection ─────────────────────────────
/**
 * Korean and English phrasings of a regression complaint ("broke again", "same mistake",
 * "why does this keep happening"). A message counts as QA feedback when any one pattern matches.
 */
const QA_REGRESSION_PATTERNS: RegExp[] = [
  /또\s*(안\s*돼|안되|이래|발생|터졌|깨졌|망가졌)/,
  /(다시|또)\s*같은\s*(실수|문제|버그|에러|오류)/,
  /(비슷한|똑같은)\s*(실수|문제|버그|이슈|패턴)/,
  /왜\s*(자꾸|계속|반복|또)/,
  /(고쳤는데|수정했는데|패치했는데|바꿨는데)\s*(또|다시|여전히|아직).{0,20}(안|깨|망|문제|에러|오류|실패|broke|broken)/i,
  /(여전히|아직도)\s*(안\s*돼|안되|버그|깨|문제|실패)/,
  /regress(ion|ed)?/i,
  /\b(broke|broken|failing|still\s+broken|same\s+(bug|mistake|issue|error)|again)\b.{0,40}\b(again|still|repeat|recurr)/i,
  /\bwhy\b.{0,30}\b(keep|again|repeatedly|recurr)/i,
];

/**
 * Heuristic check: does this user message read like a regression complaint? Per the original
 * note, the host uses a positive result to offer recording a lesson, and the check is meant to
 * be conservative because a false positive only shows a dismissible prompt.
 *
 * Messages whose trimmed length is under 4 or over 4000 characters are never flagged.
 *
 * @param prompt the raw user message
 * @returns true when at least one regression pattern matches the trimmed message
 */
export function isQaRegressionFeedback(prompt: string): boolean {
  if (!prompt) return false;

  const text = prompt.trim();
  const plausibleLength = text.length >= 4 && text.length <= 4000;
  if (!plausibleLength) return false;

  for (const pattern of QA_REGRESSION_PATTERNS) {
    if (pattern.test(text)) return true;
  }
  return false;
}
// ── Lesson frontmatter parse / occurrences bump (for dedup-merge) ────────────
/** The frontmatter fields of a lesson card that the dedup/merge logic reads. */
export interface LessonFrontmatter {
  /** Lower-cased `type:` value. */
  type?: string;
  /** `title:` value with surrounding quotes removed. */
  title?: string;
  /** `occurrences:` as a number; undefined when missing or not a finite number. */
  occurrences?: number;
  /** Entries of the inline `applies-to: [a, b]` list; undefined when the key is missing. */
  appliesTo?: string[];
}

/**
 * Parse the leading `--- ... ---` block of a lesson card.
 *
 * Only the first 2000 characters of `content` are inspected. An optional leading byte-order
 * mark (U+FEFF) is tolerated; it is written as an explicit escape so the pattern does not depend
 * on an invisible character surviving in the source file. Values are read line by line with a
 * simple `key: value` pattern, not a full YAML parser.
 *
 * @param content the full card text
 * @returns the recognised fields; {} when there is no frontmatter block
 */
export function parseLessonFrontmatter(content: string): LessonFrontmatter {
  if (!content) return {};
  const head = content.slice(0, 2000);
  const open = head.match(/^\uFEFF?---\s*\n/);
  if (!open) return {};
  // Start at the newline that ends the opening fence so an empty block (`---\n---`) closes
  // immediately instead of running on to some later `---` line in the body.
  const end = head.indexOf('\n---', open[0].length - 1);
  if (end < 0) return {};
  const block = head.slice(0, end);

  // Read one `key: value` line and strip a surrounding quote character from the value.
  const get = (key: string): string | undefined => {
    const m = block.match(new RegExp(`^\\s*${key}\\s*:\\s*(.+?)\\s*$`, 'm'));
    return m ? m[1].replace(/^["']|["']$/g, '').trim() : undefined;
  };

  const occ = get('occurrences');
  const tags = get('applies-to');
  let appliesTo: string[] | undefined;
  if (tags) {
    // Inline list syntax: drop the brackets, split on commas, unquote each entry.
    const inner = tags.replace(/^\[|\]$/g, '').trim();
    appliesTo = inner
      ? inner.split(',').map((s) => s.trim().replace(/^["']|["']$/g, '').trim()).filter(Boolean)
      : [];
  }
  return {
    type: get('type')?.toLowerCase(),
    title: get('title'),
    occurrences: occ !== undefined && Number.isFinite(Number(occ)) ? Number(occ) : undefined,
    appliesTo,
  };
}
/**
 * Reduce a lesson title to a comparison key: lower-cased, with everything except ASCII letters,
 * digits, and precomposed Hangul syllables removed (so punctuation and whitespace are ignored).
 *
 * @param title the title to normalize; may be empty
 * @returns the key, possibly '' when no significant characters remain
 */
export function normalizeLessonTitle(title: string): string {
  if (!title) return '';
  const lowered = title.toLowerCase();
  return lowered.replace(/[^a-z0-9가-힣]+/g, '');
}
/**
 * Return `content` with the frontmatter's `occurrences:` incremented by 1 and `last-seen:` set
 * to `today`. Missing keys are appended at the end of the frontmatter block, just before the
 * closing fence. If there is no frontmatter at all, `content` is returned unchanged (the caller
 * decides what to do).
 *
 * Details:
 * - An optional leading byte-order mark (U+FEFF) is tolerated, written as an explicit escape.
 * - The current count is read from the same frontmatter block that is rewritten; a missing or
 *   non-numeric value counts as 1.
 * - Values are inserted through a replacer function, so `$` sequences in `today` are written
 *   literally rather than interpreted as replacement patterns.
 * - Only the key's own line is rewritten: an empty value (`occurrences:`) no longer swallows
 *   the following line. A single space is placed after the colon when the line had none.
 *
 * @param content the full card text
 * @param today date string written verbatim to `last-seen:`
 * @returns the updated card text
 */
export function bumpLessonOccurrences(content: string, today: string): string {
  if (!content) return content;
  const open = content.match(/^\uFEFF?---\s*\n/);
  if (!open) return content;
  // Start at the newline that ends the opening fence so an empty block (`---\n---`) is closed
  // by its own fence rather than by some later `---` line in the body.
  const end = content.indexOf('\n---', open[0].length - 1);
  if (end < 0) return content;
  let block = content.slice(0, end);
  const rest = content.slice(end);

  const found = block.match(/^\s*occurrences\s*:\s*(.+?)\s*$/m);
  const parsed = found ? Number(found[1].replace(/^["']|["']$/g, '').trim()) : NaN;
  const next = (Number.isFinite(parsed) ? parsed : 1) + 1;

  // Rewrite the value on `key`'s line, or append `key: value` when the key is absent.
  const setKey = (src: string, key: string, value: string): string => {
    // `[^\S\r\n]*` is horizontal whitespace only, so the match never crosses a line break.
    const line = new RegExp(`^(\\s*${key}\\s*:)([^\\S\\r\\n]*).*$`, 'm');
    return line.test(src)
      ? src.replace(line, (_whole: string, label: string, gap: string) => `${label}${gap || ' '}${value}`)
      : `${src}\n${key}: ${value}`;
  };

  block = setKey(block, 'occurrences', String(next));
  block = setKey(block, 'last-seen', today);
  return block + rest;
}
// ── Post-answer checklist coverage (non-blocking flag) ──────────────────────
/**
 * The "significant" terms of one checklist item: the distinct tokens produced by `tokenize`
 * that are at least two characters long. An item that starts with `<` is treated as an unfilled
 * template placeholder and yields no terms.
 *
 * @param item a single checklist item, without its bullet
 * @returns the distinct significant terms, in the order `tokenize` produced them
 */
function checklistItemTerms(item: string): string[] {
  const isPlaceholder = item.trim().startsWith('<'); // e.g. "<다음에 확인할 것>"
  if (isPlaceholder) return [];

  const uniqueTerms = Array.from(new Set(tokenize(item)));
  return uniqueTerms.filter((term) => term.length >= 2);
}
/**
 * Find Prevention-Checklist items that the assistant's answer does not visibly address.
 *
 * An item is flagged when none of its significant terms occur among the answer's tokens. The
 * check is deliberately conservative: items with fewer than two significant terms (including
 * template placeholders) are never flagged, and an item already flagged is not reported twice
 * (compared by normalized text). Scanning stops as soon as `max` items have been collected.
 *
 * @param answer the assistant's answer text
 * @param lessonContents the lesson cards injected this turn
 * @param max cap on the number of returned items, so the footer does not get noisy
 * @returns the unaddressed items in card/document order; [] when there is nothing to check
 */
export function findUnaddressedChecklistItems(answer: string, lessonContents: string[], max = 3): string[] {
  if (!answer || !lessonContents?.length) return [];

  const answerVocabulary = new Set(tokenize(answer));
  const flagged: string[] = [];
  const flaggedKeys = new Set<string>();

  for (const card of lessonContents) {
    const items = extractPreventionChecklist(card);
    for (const item of items) {
      const dedupKey = normalizeLessonTitle(item);
      if (dedupKey === '' || flaggedKeys.has(dedupKey)) continue;

      const terms = checklistItemTerms(item);
      // Fewer than two significant terms is too vague to judge either way.
      if (terms.length < 2) continue;

      const addressed = terms.some((term) => answerVocabulary.has(term));
      if (addressed) continue;

      flagged.push(item);
      flaggedKeys.add(dedupKey);
      if (flagged.length >= max) return flagged;
    }
  }
  return flagged;
}