refactor: optimize core engine and retrieval logic for v2.80.43

2026-05-13 19:23:57 +09:00
parent c4260466b9
commit 089abf22db
17 changed files with 1311 additions and 88 deletions
@@ -316,6 +316,121 @@ export function scoreTfIdfPreTokenized(
    });
 }

+/**
+ * A heading-delimited slice of a markdown document, as produced by
+ * `splitMarkdownSections`.
+ */
+export interface MarkdownSection {
+    /** Trimmed heading line including its `#` markers; `''` for text that has no heading. */
+    heading: string;
+    /** Trimmed text between this heading and the next heading (of any level). */
+    body: string;
+}
+
+/**
+ * Split markdown content into a flat list of sections, cutting at every ATX
+ * heading (`#` through `######`).
+ *
+ * Each section is `{ heading, body }`: `heading` is the heading line itself
+ * (level markers preserved), `body` is the text up to the next heading of
+ * any level. Sections are not nested — a `##` under a `#` becomes its own
+ * entry and its text is not repeated in the parent's body.
+ *
+ * Handling of special regions:
+ *   - A leading byte-order mark is removed.
+ *   - Front-matter (a leading `--- … ---` block) is dropped because it's not
+ *     query-relevant. An unterminated front-matter block is left in place.
+ *   - Lines inside fenced code blocks (``` or ~~~) are never treated as
+ *     headings, so `# comment` lines in code samples don't split a section.
+ *   - Non-blank text above the first heading becomes a preamble section with
+ *     `heading: ''`.
+ *
+ * Why this exists: retrieval was returning whole files (excerpts capped at
+ * 400 chars). On long notes, that excerpt was often the file's intro/setup,
+ * not the section that actually matched the query. Section-level retrieval
+ * lets us pick the relevant heading directly and drop everything else.
+ *
+ * @param content Raw markdown text.
+ * @returns `[]` for empty content; a single synthetic section
+ *   `{ heading: '', body }` when the document has no headings (so callers can
+ *   treat the result uniformly); otherwise the sections in document order.
+ */
+export function splitMarkdownSections(content: string): MarkdownSection[] {
+    if (!content) return [];
+    // A leading byte-order mark would stop both the front-matter check and a
+    // first-line heading from matching at position 0, so remove it up front.
+    let text = content.replace(/^\uFEFF/, '');
+    // Front matter: an opening `---` line, then the first later line that is
+    // exactly `---` (so `----` or a `--- text` line cannot end the block).
+    const frontMatter = /^---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/.exec(text);
+    if (frontMatter) {
+        text = text.slice(frontMatter[0].length).replace(/^\s*\n/, '');
+    }
+    const lines = text.split('\n');
+    const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
+    const headingPattern = /^#{1,6}\s+\S/;
+    const headings: Array<{ line: number; text: string }> = [];
+    // While inside a fenced code block these hold the opening fence's character
+    // and length; `fenceChar === ''` means we are in ordinary markdown.
+    let fenceChar = '';
+    let fenceLength = 0;
+    lines.forEach((line, index) => {
+        const fenceMatch = fencePattern.exec(line);
+        const run = fenceMatch ? fenceMatch[1] : '';
+        if (fenceChar) {
+            // A closing fence uses the same character, is at least as long as
+            // the opener, and has nothing else on the line.
+            if (run.startsWith(fenceChar) && run.length >= fenceLength && line.trim() === run) {
+                fenceChar = '';
+            }
+            return;
+        }
+        if (run) {
+            fenceChar = run.charAt(0);
+            fenceLength = run.length;
+            return;
+        }
+        if (headingPattern.test(line)) headings.push({ line: index, text: line.trim() });
+    });
+    const firstHeading = headings[0];
+    if (!firstHeading) {
+        return [{ heading: '', body: text.trim() }];
+    }
+    const sections: MarkdownSection[] = [];
+    // Capture any leading content above the first heading as a "preamble" section.
+    const preamble = lines.slice(0, firstHeading.line).join('\n').trim();
+    if (preamble) sections.push({ heading: '', body: preamble });
+    headings.forEach((current, position) => {
+        const next = headings[position + 1];
+        const end = next ? next.line : lines.length;
+        sections.push({
+            heading: current.text,
+            body: lines.slice(current.line + 1, end).join('\n').trim(),
+        });
+    });
+    return sections;
+}
+
+/**
+ * Return the excerpt of a markdown document that best matches a query,
+ * preferring a single heading-bounded section over a whole-file excerpt.
+ *
+ * How the excerpt is chosen:
+ *   1. The document is split with `splitMarkdownSections`.
+ *   2. Nothing to split (empty result) → the first `maxLength` characters of
+ *      `content` are returned. A lone section without a heading → the work
+ *      is delegated to `extractBestExcerpt` on that section's body (or on
+ *      `content` when the body is empty).
+ *   3. Otherwise every section whose heading + body is at least 24 characters
+ *      is scored: each token of the text (via `tokenize`) that appears in
+ *      `expandQuery(queryTokens)` counts once, and heading hits count 3× so
+ *      "## Foo" beats a body mention of "foo". Shorter sections are skipped —
+ *      they're usually empty headings the author left as placeholders.
+ *   4. The highest score wins; on a tie the earliest section wins. A section
+ *      scoring 0 can still win, so the whole-document `extractBestExcerpt`
+ *      fallback only runs when every section was skipped as too short.
+ *   5. The winner is returned as heading line + body when the body fits in
+ *      the space left after the heading (never budgeted below 64 chars);
+ *      otherwise the body is first narrowed with `extractBestExcerpt`.
+ *
+ * @param content Raw markdown text.
+ * @param queryTokens Query tokens; passed to `expandQuery` for scoring and
+ *   unchanged to `extractBestExcerpt`.
+ * @param maxLength Cap for the assembled heading + body result, which is
+ *   sliced to this length and then trimmed. Paths that delegate to
+ *   `extractBestExcerpt` pass the cap through to that helper.
+ * @returns The chosen excerpt.
+ */
+export function extractBestSection(
+    content: string,
+    queryTokens: string[],
+    maxLength = 600
+): string {
+    const sections = splitMarkdownSections(content);
+    const [first] = sections;
+    if (first === undefined) return content.slice(0, maxLength);
+    if (sections.length === 1 && !first.heading) {
+        return extractBestExcerpt(first.body || content, queryTokens, maxLength);
+    }
+
+    const queryTerms = new Set(expandQuery(queryTokens));
+    const countHits = (text: string): number => {
+        let total = 0;
+        if (text) {
+            for (const token of tokenize(text)) {
+                if (queryTerms.has(token)) total += 1;
+            }
+        }
+        return total;
+    };
+
+    let winner: MarkdownSection | undefined;
+    let winnerScore = -1;
+    for (const section of sections) {
+        const tooShort = section.heading.length + section.body.length < 24;
+        if (tooShort) continue;
+        const score = countHits(section.heading) * 3 + countHits(section.body);
+        // Strict comparison keeps the earliest section when scores tie.
+        if (score > winnerScore) {
+            winner = section;
+            winnerScore = score;
+        }
+    }
+    if (winner === undefined) {
+        // Every section was below the 24-char minimum — fall back to a whole-doc excerpt.
+        return extractBestExcerpt(content, queryTokens, maxLength);
+    }
+
+    const prefix = winner.heading ? `${winner.heading}\n` : '';
+    const budget = Math.max(64, maxLength - prefix.length);
+    const bodyText =
+        winner.body.length <= budget
+            ? winner.body
+            : extractBestExcerpt(winner.body, queryTokens, budget);
+    // The final slice is the safety net that enforces `maxLength`.
+    return (prefix + bodyText).slice(0, maxLength).trim();
+}
+
 /**
 * 텍스트에서 가장 관련성 높은 구간(excerpt)을 추출합니다.
 * 단순 paragraph 단위가 아니라, 키워드 밀도가 높은 윈도우를 찾습니다.