refactor: optimize core engine and retrieval logic for v2.80.43
This commit is contained in:
@@ -316,6 +316,121 @@ export function scoreTfIdfPreTokenized(
|
||||
});
|
||||
}
|
||||
|
||||
/** One heading-delimited slice of a markdown document. */
export interface MarkdownSection {
  /** The heading line itself (`#` markers included); `''` for preamble or heading-less content. */
  heading: string;
  /** Trimmed text between this heading and the next heading (or end of document). */
  body: string;
}

/**
 * Split markdown content into sections at every ATX heading (`#` … `######`).
 *
 * Each returned section is `{ heading, body }`: `heading` is the trimmed
 * heading line (level markers preserved) and `body` is the trimmed text up to
 * the next heading of any level. Sections are flat — nesting is not modelled.
 *
 * Details:
 * - Front-matter (a leading `--- … ---` block, optionally preceded by a BOM)
 *   is dropped because it's not query-relevant.
 * - Non-empty text above the first heading becomes a preamble section with
 *   `heading: ''`.
 * - Lines inside fenced code blocks (``` or ~~~) are never treated as
 *   headings, so a `# comment` in a shell snippet does not start a section.
 * - A document with no headings returns one synthetic section
 *   `{ heading: '', body: <trimmed text> }` so callers can treat the result
 *   uniformly.
 *
 * Why this exists: retrieval was returning whole files (excerpts capped at
 * 400 chars). On long notes, that excerpt was often the file's intro/setup,
 * not the section that actually matched the query. Section-level retrieval
 * lets us pick the relevant heading directly and drop everything else.
 *
 * @param content Raw markdown text.
 * @returns Sections in document order; `[]` when `content` is empty.
 */
export function splitMarkdownSections(content: string): MarkdownSection[] {
  if (!content) return [];

  // Strip front-matter. The opening `---` must be the very first line; a BOM
  // in front of it is tolerated.
  let text = content;
  const opening = /^\uFEFF?---\s*\n/.exec(text);
  if (opening) {
    // Start the search ON the newline that ends the opening line so an empty
    // front-matter block (`---\n---`) is recognised instead of skipping ahead
    // to an unrelated `---` horizontal rule later in the document.
    const end = text.indexOf('\n---', opening[0].length - 1);
    if (end >= 0) text = text.slice(end + 4).replace(/^\s*\n/, '');
  }

  const lines = text.split('\n');

  // Collect the line index of every heading, ignoring fenced code blocks.
  const fencePattern = /^ {0,3}(`{3,}|~{3,})/;
  const headingPattern = /^#{1,6}\s+\S/;
  const headingLines: number[] = [];
  let openFence: string | null = null;
  lines.forEach((line, i) => {
    const fence = fencePattern.exec(line);
    if (fence) {
      const marker = fence[1];
      if (openFence === null) {
        openFence = marker;
      } else if (marker[0] === openFence[0] && marker.length >= openFence.length) {
        // A fence closes only on the same character, at least as long as the opener.
        openFence = null;
      }
      return;
    }
    if (openFence === null && headingPattern.test(line)) headingLines.push(i);
  });

  if (headingLines.length === 0) {
    return [{ heading: '', body: text.trim() }];
  }

  const sections: MarkdownSection[] = [];

  // Capture any leading content above the first heading as a "preamble" section.
  const firstHeading = headingLines[0];
  if (firstHeading > 0) {
    const preamble = lines.slice(0, firstHeading).join('\n').trim();
    if (preamble) sections.push({ heading: '', body: preamble });
  }

  headingLines.forEach((start, k) => {
    const end = k + 1 < headingLines.length ? headingLines[k + 1] : lines.length;
    sections.push({
      heading: lines[start].trim(),
      body: lines.slice(start + 1, end).join('\n').trim(),
    });
  });
  return sections;
}
|
||||
|
||||
/**
 * Pick the best heading-bounded section of a markdown document for a query,
 * then fall back to keyword-window extraction inside that section if the
 * section itself is still too long.
 *
 * Strategy:
 *   1. Split into sections by heading (`splitMarkdownSections`).
 *   2. Score each section by how many of its tokens appear in the expanded
 *      query (`expandQuery`); heading hits are weighted 3× so "## Foo" beats
 *      a body mention of "foo". Ties go to the earliest section.
 *   3. If the top section's text fits, return it as-is (heading + body).
 *   4. Otherwise, run `extractBestExcerpt` inside the top section's body and
 *      prepend the heading.
 *
 * Fallbacks to a plain `extractBestExcerpt`:
 *   - the document has no headings (a single synthetic section), or
 *   - no eligible section contains any query term — in that case the excerpt
 *     is taken from the whole document rather than from an arbitrary section.
 *
 * Caps:
 *   - The section-based result is sliced to `maxLength` as a safety net; the
 *     fallback paths pass `maxLength` through to `extractBestExcerpt`.
 *   - Sections whose heading + body total fewer than 24 chars are skipped —
 *     they're usually empty headings the author left as placeholders.
 *
 * @param content Raw markdown text.
 * @param queryTokens Already-tokenized query terms.
 * @param maxLength Upper bound on the returned excerpt length (default 600).
 * @returns The selected excerpt, trimmed.
 */
export function extractBestSection(
  content: string,
  queryTokens: string[],
  maxLength = 600
): string {
  const sections = splitMarkdownSections(content);
  if (sections.length === 0) return content.slice(0, maxLength);
  if (sections.length === 1 && !sections[0].heading) {
    return extractBestExcerpt(sections[0].body || content, queryTokens, maxLength);
  }

  const minSectionChars = 24;
  const headingWeight = 3;
  const queryTerms = new Set(expandQuery(queryTokens));

  // Number of tokens in `text` that belong to the expanded query.
  const countHits = (text: string): number => {
    if (!text) return 0;
    let hits = 0;
    for (const token of tokenize(text)) if (queryTerms.has(token)) hits++;
    return hits;
  };

  // Start from score 0 so only sections with at least one hit can win; a
  // starting score of -1 would let a zero-hit section be "best" and make the
  // whole-document fallback below unreachable for non-matching queries.
  let bestIdx = -1;
  let bestScore = 0;
  sections.forEach((section, i) => {
    if (section.heading.length + section.body.length < minSectionChars) return;
    const score = countHits(section.heading) * headingWeight + countHits(section.body);
    if (score > bestScore) {
      bestIdx = i;
      bestScore = score;
    }
  });

  if (bestIdx < 0) {
    // No section contained any query terms — fall back to a whole-doc excerpt.
    return extractBestExcerpt(content, queryTokens, maxLength);
  }

  const picked = sections[bestIdx];
  const headingLine = picked.heading ? `${picked.heading}\n` : '';
  // Always leave the body a minimum window, even under a very long heading;
  // the final slice still enforces `maxLength`.
  const room = Math.max(64, maxLength - headingLine.length);
  const bodyText =
    picked.body.length <= room
      ? picked.body
      : extractBestExcerpt(picked.body, queryTokens, room);
  return (headingLine + bodyText).slice(0, maxLength).trim();
}
|
||||
|
||||
/**
|
||||
* 텍스트에서 가장 관련성 높은 구간(excerpt)을 추출합니다.
|
||||
* 단순 paragraph 단위가 아니라, 키워드 밀도가 높은 윈도우를 찾습니다.
|
||||
|
||||
Reference in New Issue
Block a user