refactor: optimize core engine and retrieval logic for v2.80.43

This commit is contained in:
2026-05-13 19:23:57 +09:00
parent c4260466b9
commit 089abf22db
17 changed files with 1311 additions and 88 deletions
+115
View File
@@ -316,6 +316,121 @@ export function scoreTfIdfPreTokenized(
});
}
/**
 * One heading-delimited slice of a markdown document, as produced by
 * `splitMarkdownSections`.
 */
export interface MarkdownSection {
  /**
   * The trimmed heading line, including its `#` markers (so the level is
   * preserved). Empty string for text that has no heading of its own: the
   * preamble above the first heading, or a document without any headings.
   */
  heading: string;
  /** Trimmed text between this heading line and the next heading line. */
  body: string;
}

/**
 * Split markdown content into a flat list of sections, one per ATX heading
 * (`#` through `######`).
 *
 * Each section's `body` runs up to the next heading line of ANY level — the
 * result is flat, not nested, so a `##` directly under a `#` starts its own
 * section. Non-empty text above the first heading becomes a leading section
 * with `heading: ''`.
 *
 * Pre-processing:
 *  - A leading byte-order mark is removed so a heading on the very first
 *    line is still recognised.
 *  - Front-matter (a leading `---` line up to the next line that is exactly
 *    `---`) is dropped because it's not query-relevant. Unterminated
 *    front-matter is left in place.
 *  - Lines inside fenced code blocks (``` or ~~~) are never treated as
 *    headings, so `# comment` lines in shell/Python snippets stay in the body.
 *
 * Why this exists: retrieval was returning whole files (excerpts capped at
 * 400 chars). On long notes, that excerpt was often the file's intro/setup,
 * not the section that actually matched the query. Section-level retrieval
 * lets us pick the relevant heading directly and drop everything else.
 *
 * @param content Raw markdown text.
 * @returns `[]` for empty input; a single `{ heading: '', body }` section
 *   when the document has no headings; otherwise the sections in document
 *   order.
 */
export function splitMarkdownSections(content: string): MarkdownSection[] {
  if (!content) return [];

  // Patterns are built once per call, outside the line loop.
  const frontMatterDelimiter = /^---\s*$/;
  const fencePattern = /^ {0,3}(`{3,}|~{3,})([\s\S]*)$/;
  const headingPattern = /^#{1,6}\s+\S/;

  // Strip a UTF-8 BOM explicitly (by code point, not an invisible literal
  // character in the source) so it cannot hide a first-line heading or the
  // front-matter opener.
  const text = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content;
  let lines = text.split('\n');

  // Drop front-matter. The closing delimiter must be a line of its own, which
  // handles empty front-matter (`---\n---`) and avoids mistaking `----` or
  // `--- foo` for the closer.
  if (frontMatterDelimiter.test(lines[0])) {
    const closeIdx = lines.findIndex(
      (line, i) => i > 0 && frontMatterDelimiter.test(line)
    );
    if (closeIdx > 0) lines = lines.slice(closeIdx + 1);
  }

  // Collect heading line numbers, skipping anything inside a code fence.
  const headingLines: number[] = [];
  let openFence = ''; // marker run that opened the current fence; '' when outside
  lines.forEach((line, i) => {
    const fence = fencePattern.exec(line);
    if (openFence) {
      // A fence closes on the same marker character, at least as long as the
      // opener, with nothing but whitespace after it.
      if (
        fence &&
        fence[1].charAt(0) === openFence.charAt(0) &&
        fence[1].length >= openFence.length &&
        fence[2].trim() === ''
      ) {
        openFence = '';
      }
      return;
    }
    if (fence) {
      // A backtick fence's info string may not contain backticks; otherwise
      // the line is inline code such as ```x```, not a fence opener.
      const isInlineCode = fence[1].charAt(0) === '`' && fence[2].includes('`');
      if (!isInlineCode) {
        openFence = fence[1];
        return;
      }
    }
    if (headingPattern.test(line)) headingLines.push(i);
  });

  if (headingLines.length === 0) {
    return [{ heading: '', body: lines.join('\n').trim() }];
  }

  const sections: MarkdownSection[] = [];

  // Capture any non-empty content above the first heading as a preamble.
  const preamble = lines.slice(0, headingLines[0]).join('\n').trim();
  if (preamble) sections.push({ heading: '', body: preamble });

  headingLines.forEach((start, k) => {
    const end = k + 1 < headingLines.length ? headingLines[k + 1] : lines.length;
    sections.push({
      heading: lines[start].trim(),
      body: lines.slice(start + 1, end).join('\n').trim(),
    });
  });
  return sections;
}
/**
 * Pick the best heading-bounded section of a markdown document for a query,
 * then fall back to keyword-window extraction inside that section if the
 * section itself is still too long.
 *
 * Strategy:
 *  1. Split into sections by heading (`splitMarkdownSections`).
 *  2. Score each section by how many of its tokens appear in the expanded
 *     query (`expandQuery`); heading hits are weighted 3× so "## Foo" beats
 *     a body mention of "foo". Ties go to the earlier section.
 *  3. If the top section's body fits, return heading + body as-is.
 *  4. Otherwise, run `extractBestExcerpt` inside the top section's body and
 *     prepend the heading.
 *
 * Fallbacks to a plain `extractBestExcerpt`:
 *  - the document has no headings (a single synthetic section) — the excerpt
 *    is taken from that section's body;
 *  - no eligible section scores above zero, i.e. nothing matched the query
 *    or every section was skipped as too small — the excerpt is taken from
 *    the whole `content`. Returning an arbitrary zero-score section would
 *    just surface the intro, which is what section retrieval is meant to
 *    avoid.
 *
 * Caps:
 *  - The result of the section path is sliced to `maxLength` as a safety net.
 *  - Sections whose heading + body is shorter than 24 chars are skipped —
 *    they're usually empty headings the author left as placeholders.
 *
 * @param content Raw markdown text.
 * @param queryTokens Query tokens; they are expanded via `expandQuery`
 *   before matching.
 * @param maxLength Maximum length of the returned excerpt (default 600).
 * @returns The selected excerpt, with the section heading on its first line
 *   when a headed section was picked.
 */
export function extractBestSection(
  content: string,
  queryTokens: string[],
  maxLength = 600
): string {
  // Heading + body shorter than this is treated as a placeholder section.
  const minSectionChars = 24;
  // A query hit in the heading counts this many times a hit in the body.
  const headingWeight = 3;
  // Always leave at least this much room for body text, even under a long
  // heading; the final slice still enforces `maxLength`.
  const minBodyRoom = 64;

  const sections = splitMarkdownSections(content);
  if (sections.length === 0) return content.slice(0, maxLength);

  const [first] = sections;
  if (sections.length === 1 && !first.heading) {
    // No headings at all: behave like a plain excerpt extraction.
    return extractBestExcerpt(first.body || content, queryTokens, maxLength);
  }

  const wanted = new Set(expandQuery(queryTokens));
  const countHits = (text: string) => {
    if (!text) return 0;
    let hits = 0;
    for (const tok of tokenize(text)) if (wanted.has(tok)) hits++;
    return hits;
  };

  // Start from a score of 0 so that only sections with at least one query
  // hit can win; `>` keeps the earliest section on ties.
  let bestIdx = -1;
  let bestScore = 0;
  for (let i = 0; i < sections.length; i++) {
    const section = sections[i];
    if (section.heading.length + section.body.length < minSectionChars) continue;
    const score =
      countHits(section.heading) * headingWeight + countHits(section.body);
    if (score > bestScore) {
      bestIdx = i;
      bestScore = score;
    }
  }

  if (bestIdx < 0) {
    // No eligible section contained any query terms — fall back to a
    // whole-doc excerpt.
    return extractBestExcerpt(content, queryTokens, maxLength);
  }

  const chosen = sections[bestIdx];
  const headingLine = chosen.heading ? `${chosen.heading}\n` : '';
  const room = Math.max(minBodyRoom, maxLength - headingLine.length);
  const bodyText =
    chosen.body.length <= room
      ? chosen.body
      : extractBestExcerpt(chosen.body, queryTokens, room);
  return (headingLine + bodyText).slice(0, maxLength).trim();
}
/**
* 텍스트에서 가장 관련성 높은 구간(excerpt)을 추출합니다.
* 단순 paragraph 단위가 아니라, 키워드 밀도가 높은 윈도우를 찾습니다.