refactor: optimize core engine and retrieval logic for v2.80.43

This commit is contained in:
2026-05-13 19:23:57 +09:00
parent c4260466b9
commit 089abf22db
17 changed files with 1311 additions and 88 deletions
+391 -11
View File
@@ -30,6 +30,7 @@ import { StatusBarManager, AgentStatus } from './core/statusBar';
import { lockManager } from './core/lock';
import { actionQueue } from './core/queue';
import { ConflictResolver } from './core/conflict';
import { recordTelemetry } from './core/telemetry';
import {
buildSecondBrainTrace,
enforceProjectClaimPolicyInAnswer,
@@ -40,6 +41,8 @@ import {
import { MemoryManager } from './memory';
import { RetrievalOrchestrator } from './retrieval';
import { buildLessonChecklistBlock, isQaRegressionFeedback, findUnaddressedChecklistItems } from './retrieval/lessonHelpers';
import { embedQuery, embedTexts } from './retrieval/embeddings';
import { backfillBrainEmbeddings } from './retrieval/brainIndex';
import { resolveScopeForAgent } from './skills/agentKnowledgeMap';
import {
extractVisibleFinal,
@@ -117,6 +120,51 @@ const AGENT_PROMPTS: Record<AgentRole, string> = {
3. Deliver a logical, consistent, and polished response.`
};
/**
* Compact recent chat sessions for medium-term memory retrieval.
*
* Returns up to `limit + 5` recently-touched sessions (excluding the active
* one) as small summaries: title + first user message + tail of the last
* assistant message. The retrieval orchestrator then scores these against the
* current query and selects the top `limit` matches inside the shared budget.
*
* We pull a few more than `limit` so TF-IDF scoring has room to rerank — the
* persisted list is timestamp-ordered, which isn't the same as topical fit.
*/
function compactRecentSessions(
rawSessions: any[],
activeSessionId: string | null,
limit: number,
): Array<{ id: string; title: string; firstUserMsg: string; lastAssistantExcerpt: string; summary?: string; timestamp: number }> {
if (!Array.isArray(rawSessions) || rawSessions.length === 0 || limit <= 0) return [];
const pool = rawSessions.length > limit + 5 ? limit + 5 : rawSessions.length;
const out: Array<{ id: string; title: string; firstUserMsg: string; lastAssistantExcerpt: string; summary?: string; timestamp: number }> = [];
for (let i = 0; i < rawSessions.length && out.length < pool; i++) {
const s = rawSessions[i];
if (!s || typeof s !== 'object') continue;
const id = String(s.id ?? '');
if (!id || id === activeSessionId) continue;
const history: any[] = Array.isArray(s.history) ? s.history : [];
if (history.length === 0) continue;
const firstUser = history.find((m) => m?.role === 'user');
const lastAssistant = [...history].reverse().find((m) => m?.role === 'assistant');
const firstUserMsg = String(firstUser?.content ?? '').replace(/\s+/g, ' ').trim().slice(0, 200);
const lastTxt = String(lastAssistant?.content ?? '').replace(/\s+/g, ' ').trim();
const lastAssistantExcerpt = lastTxt.length <= 200 ? lastTxt : lastTxt.slice(-200);
const summary = typeof s.summary === 'string' ? s.summary.trim().slice(0, 600) : undefined;
if (!firstUserMsg && !lastAssistantExcerpt && !summary) continue;
out.push({
id,
title: String(s.title ?? '').trim() || firstUserMsg.slice(0, 50),
firstUserMsg,
lastAssistantExcerpt,
summary,
timestamp: typeof s.timestamp === 'number' ? s.timestamp : 0,
});
}
return out;
}
// Local-path detectors used to decide whether a user prompt refers to a file/dir on disk.
// POSIX: /Volumes/, /Users/, /home/, /opt/, ... or ~/ — backtick excluded (markdown code spans).
const POSIX_ABS_PATH_SRC = "(?:\\/(?:Volumes|Users|home|opt|srv|mnt|data|workspace)\\/|~\\/)[^\\s`\"'<>|*?]+";
@@ -328,6 +376,10 @@ export class AgentExecutor {
if (!this.webview) return;
// Telemetry: wall-clock start of the user-visible turn. Only meaningful
// at loopDepth===0 (action-loop recursions roll up into the same turn).
const turnStartMs = loopDepth === 0 ? Date.now() : 0;
try {
// 0. Safety Check: Rollback any dangling transaction from previous runs
if (this.transactionManager.isActive()) {
@@ -471,9 +523,19 @@ export class AgentExecutor {
const secondBrainTraceCtx = secondBrainTrace
? `\n\n${renderSecondBrainTraceContext(secondBrainTrace)}`
: '';
const retrievalStartMs = Date.now();
const memoryCtx = isCasualConversation
? ''
: this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile);
: await this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile);
if (loopDepth === 0 && !isCasualConversation && this._lastRetrievalInfo) {
recordTelemetry({
kind: 'retrieval',
durationMs: Date.now() - retrievalStartMs,
brainFiles: this._lastRetrievalInfo.usedBrainFiles.length,
memoryLayers: this._lastRetrievalInfo.usedMemoryLayers,
note: `chunks=${this._lastRetrievalInfo.selectedChunks}/${this._lastRetrievalInfo.totalChunks} lessons=${this._lastRetrievalInfo.lessonFiles.length}`,
});
}
const knowledgeContextForPrompt = isCasualConversation
? ''
: `${brainContext}${brainInventoryCtx}`;
@@ -677,6 +739,16 @@ export class AgentExecutor {
this.options.onStreamLifecycle?.start();
}
// Progressive answering: live-stream tokens to the webview during
// the user-visible first turn (loopDepth === 0). The bubble fills
// as the model generates instead of dropping all at once at the end,
// and any auto-continuation rounds keep posting deltas through the
// same channel. Post-processing (reasoning strip / sanitize /
// policy enforcement) emits a final `streamReplace` so the bubble
// ends up matching the cleaned answer regardless of what slipped
// through live.
const postLiveDeltas = loopDepth === 0;
if (useLmStudioSdk) {
apiUrl = `${ollamaUrl} (sdk)`;
logInfo('Streaming chat via LM Studio SDK.', { model: actualModel });
@@ -691,7 +763,10 @@ export class AgentExecutor {
});
for await (const { token, stopReason } of stream) {
if (this.isStaleRun(runId)) return;
if (token) aiResponseText += token;
if (token) {
aiResponseText += token;
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
}
if (stopReason) finishStopReason = stopReason;
}
} catch (err: any) {
@@ -747,6 +822,7 @@ export class AgentExecutor {
const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
if (token) {
aiResponseText += token;
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
}
const fr = engine === 'lmstudio'
? json.choices?.[0]?.finish_reason
@@ -778,6 +854,7 @@ export class AgentExecutor {
const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
if (token) {
aiResponseText += token;
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
}
const fr = engine === 'lmstudio'
? json.choices?.[0]?.finish_reason
@@ -829,7 +906,10 @@ export class AgentExecutor {
let retryText = '';
for await (const { token, stopReason } of retryStream) {
if (this.isStaleRun(runId)) return;
if (token) retryText += token;
if (token) {
retryText += token;
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
}
if (stopReason) finishStopReason = stopReason;
}
if (retryText.trim()) {
@@ -922,6 +1002,7 @@ export class AgentExecutor {
&& !this.isStaleRun(runId)
) {
continuationCount++;
const continuationStartMs = Date.now();
this.webview.postMessage({ type: 'autoContinue', value: `답변이 길어 이어서 정리하는 중입니다... (${continuationCount}/${config.maxAutoContinuations})` });
try {
const contMsgs: ChatMessage[] = [
@@ -929,11 +1010,24 @@ export class AgentExecutor {
{ role: 'user', content: buildContinuationUserPrompt(originalUserPrompt, cleaned.visible) },
];
lastMaxOutputTokens = computeOutputBudget(estimateMessagesTokens(contMsgs), ctxLimits).maxOutputTokens;
const cr = await this.callNonStreaming({
baseUrl: ollamaUrl, modelName: actualModel, engine, messages: contMsgs,
temperature, maxTokens: lastMaxOutputTokens, contextLength: ctxLimits.contextLength,
signal: this.abortController?.signal,
// Stream the continuation through the same channel as the main turn so
// the user sees the answer keep growing instead of freezing for 1030s
// while we silently call non-streaming. The trailing streamReplace
// (after sanitize / merge) corrects any overlap the model re-emits.
const cr = await this.streamChatOnce({
runId, useLmStudioSdk, engine, ollamaUrl, modelName: actualModel,
messages: contMsgs,
temperature,
maxTokens: lastMaxOutputTokens,
contextLength: ctxLimits.contextLength,
contextOverflowPolicy: config.contextOverflowPolicy,
signal: this.abortController!.signal,
postLiveDeltas,
});
if (cr.aborted) {
logInfo('Auto-continuation aborted mid-stream.', { model: actualModel, round: continuationCount });
break;
}
finishStopReason = cr.stopReason;
const ccl = extractVisibleFinal(cr.text);
if (!ccl.visible.trim()) {
@@ -944,6 +1038,15 @@ export class AgentExecutor {
cleaned = { ...cleaned, visible: mergeContinuationParts(cleaned.visible, ccl.visible), wasThoughtOnly: false };
lastOutputTokens = estimateTokens(ccl.visible);
logInfo('Auto-continued the answer.', { model: actualModel, round: continuationCount, addedChars: ccl.visible.length, totalChars: cleaned.visible.length, contStopReason: cr.stopReason, contMaxTokens: lastMaxOutputTokens });
recordTelemetry({
kind: 'continuation',
durationMs: Date.now() - continuationStartMs,
model: actualModel, engine,
outputTokens: lastOutputTokens,
round: continuationCount,
stopReason: cr.stopReason,
note: `addedChars=${ccl.visible.length} mergedAdd=${cleaned.visible.length - before.length}`,
});
// Guard against a continuation that adds (almost) nothing new after dedup — stop instead of spinning.
if (cleaned.visible.length - before.length < 20) {
logInfo('Continuation added negligible new text — stopping.', { model: actualModel, round: continuationCount });
@@ -1099,7 +1202,32 @@ export class AgentExecutor {
value: { ...this._lastRetrievalInfo, hasAgentSelected: !!options.agentSkillFile, unaddressedChecklist },
});
}
this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent });
// Progressive answering: the bubble was filled live with raw tokens
// during streaming (and during any auto-continuation rounds). Now
// that we have the cleaned + merged + policy-enforced text, swap the
// bubble's content for the final version so the user sees the
// correct answer regardless of what slipped through live —
// hidden reasoning, mid-stream artifacts, continuation-overlap re-
// emits, truncation notice. Action-loop turns (loopDepth > 0) still
// append via streamChunk because the bubble has multiple action
// segments and we don't have a single "final" to replace with.
if (loopDepth === 0) {
this.webview.postMessage({ type: 'streamReplace', value: finalAssistantContent });
recordTelemetry({
kind: 'turn',
durationMs: Date.now() - turnStartMs,
model: actualModel, engine,
inputTokens,
outputTokens,
contextLength: ctxLimits.contextLength,
stopReason: finishStopReason,
brainFiles: this._lastRetrievalInfo?.usedBrainFiles.length ?? 0,
memoryLayers: this._lastRetrievalInfo?.usedMemoryLayers ?? [],
note: `continuations=${continuationCount} historyDropped=${reqMessages.length - budgetedHistory.length}`,
});
} else {
this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent });
}
} catch (error: any) {
this.statusBarManager.updateStatus(AgentStatus.Error, error.message);
@@ -2309,7 +2437,7 @@ export class AgentExecutor {
});
}
private buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): string {
private async buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): Promise<string> {
const config = getConfig();
this._lastRetrievalInfo = null;
this._lastLessonContents = [];
@@ -2331,6 +2459,44 @@ export class AgentExecutor {
// keeping the legacy behavior intact.
const scope = resolveScopeForAgent(agentSkillFile, activeBrain.localBrainPath);
// Scale retrieval/memory budget with the configured context window so
// that raising g1nation.contextLength actually gives the RAG pipeline
// more room. At 32K context we keep the legacy 8K total (≈3.2K
// retrieval); at 230K we allocate ~57K total (≈23K retrieval). Capped
// at 80K so scoring stays fast on huge contexts.
const scaledTotalBudget = Math.min(
80000,
Math.max(8000, Math.floor(config.contextLength * 0.25))
);
// Pull recent session summaries for the medium-term layer. We read
// from the sidebar's persisted store directly (same key it writes to)
// to avoid threading another callback through the agent constructor.
const rawSessions = this.context.globalState.get<any[]>('chat_sessions', []) || [];
const recentSessions = compactRecentSessions(
rawSessions,
this.currentTaskId,
Math.max(0, config.memoryMediumTermSessions ?? 0)
);
// Hybrid retrieval (optional): when the user has configured an
// embedding model, fetch a query embedding so searchBrainFiles can
// blend cosine similarity with TF-IDF. Time-bounded — if the
// embedding endpoint is slow or down, we fall through with no
// embedding and the retriever stays in pure-TF-IDF mode.
let queryEmbedding: number[] | undefined;
if (config.embeddingModel) {
const EMBED_QUERY_TIMEOUT_MS = 4000;
try {
queryEmbedding = await Promise.race([
embedQuery(currentPrompt, { baseUrl: config.ollamaUrl, model: config.embeddingModel }),
new Promise<undefined>((resolve) => setTimeout(() => resolve(undefined), EMBED_QUERY_TIMEOUT_MS)),
]);
} catch {
queryEmbedding = undefined;
}
}
// Use the Unified RAG Pipeline
const result = this.retrievalOrchestrator.retrieve(currentPrompt, {
brain: activeBrain,
@@ -2338,13 +2504,36 @@ export class AgentExecutor {
workspacePath,
chatHistory: visibleHistory,
contextBudget: {
totalBudget: 8000,
totalBudget: scaledTotalBudget,
retrievalRatio: 0.4
},
brainFileLimit: config.memoryLongTermFiles,
scopeFolders: scope.folders
scopeFolders: scope.folders,
recentSessions,
mediumTermLimit: config.memoryMediumTermSessions ?? 0,
queryEmbedding,
embeddingModel: config.embeddingModel || undefined,
embeddingBlendAlpha: config.embeddingBlendAlpha,
});
// Fire-and-forget background embedding for the files we just scored.
// Embeds only files that lack a vector for the current model — so
// steady-state turns do no embedding work. The next turn benefits.
if (config.embeddingModel) {
const scoredFilePaths = result.selectedChunks
.filter((c) => c.source === 'brain-memory' && c.metadata.filePath)
.map((c) => c.metadata.filePath!)
.filter((p, i, arr) => arr.indexOf(p) === i);
if (scoredFilePaths.length > 0) {
void backfillBrainEmbeddings(
activeBrain.localBrainPath,
scoredFilePaths,
config.embeddingModel,
(texts) => embedTexts(texts, { baseUrl: config.ollamaUrl, model: config.embeddingModel }),
);
}
}
// Stash what actually fed this turn so handlePrompt can show it under the answer.
const brainRoot = activeBrain.localBrainPath;
const rel = (p?: string) => (p ? (path.relative(brainRoot, p) || p) : '');
@@ -2406,11 +2595,74 @@ export class AgentExecutor {
workspacePath
);
logInfo('Memory extraction completed for session end.', { taskId: this.currentTaskId });
recordTelemetry({
kind: 'session-end',
note: `taskId=${this.currentTaskId} messages=${this.chatHistory.filter((m) => !m.internal).length}`,
});
// Fire-and-forget LLM compression: turns the raw transcript into a
// 23 sentence summary that medium-term retrieval can use instead
// of just "first user msg + last assistant 200 chars". Cheap call
// (~256 output tokens), runs in the background so it never blocks
// the next chat turn.
void this.compressSessionSummary(this.currentTaskId, this.chatHistory.slice());
} catch (error: any) {
logError('Memory extraction failed on session end.', { error: error?.message || String(error) });
}
}
/**
* Compress a finished session into a short summary and persist it to the
* session record. The summary is later read by `compactRecentSessions` so
* the medium-term memory layer carries a real recap instead of a fragment.
*
* Skips sessions with fewer than 3 visible messages — they're typically
* single-question pings where the raw first message is already a good
* summary. Failures are logged and swallowed: a missing summary just
* falls back to the legacy "first user msg" representation.
*/
private async compressSessionSummary(taskId: string, history: ChatMessage[]): Promise<void> {
const visible = history.filter((m) => !m.internal && (m.role === 'user' || m.role === 'assistant'));
if (visible.length < 3) return;
const cfg = getConfig();
const transcript = visible
.map((m) => `${m.role.toUpperCase()}: ${String(m.content).replace(/\s+/g, ' ').slice(0, 400)}`)
.join('\n\n');
const messages: ChatMessage[] = [
{
role: 'system',
content: [
'You compress chat transcripts into a 2-3 sentence summary.',
'Capture: (1) the user\'s topic or task, (2) the main decision or answer reached, (3) any open issue.',
'Reply in the user\'s primary language (mirror Korean ↔ English exactly as in the transcript).',
'Reply with ONLY the summary text. No headers, no quotes, no preamble.',
].join(' '),
internal: true,
},
{ role: 'user', content: `[TRANSCRIPT]\n${transcript}\n[END]` },
];
try {
const result = await this.callNonStreaming({
baseUrl: cfg.ollamaUrl,
modelName: cfg.defaultModel,
engine: resolveEngine(cfg.ollamaUrl),
messages,
temperature: 0.3,
maxTokens: 256,
contextLength: cfg.contextLength,
});
const summary = (result.text || '').trim().replace(/^["'`]+|["'`]+$/g, '');
if (!summary || summary.length < 12) return;
const sessions = this.context.globalState.get<any[]>('chat_sessions', []) || [];
const idx = sessions.findIndex((s) => String(s?.id) === String(taskId));
if (idx < 0) return;
sessions[idx].summary = summary;
await this.context.globalState.update('chat_sessions', sessions);
logInfo('Session summary stored for medium-term recall.', { taskId, length: summary.length });
} catch (e: any) {
logError('Session summary compression failed.', { taskId, error: e?.message ?? String(e) });
}
}
private async createStreamingRequest(params: {
baseUrl: string;
modelName: string;
@@ -2568,6 +2820,134 @@ export class AgentExecutor {
}
}
/**
* Single streaming call used by progressive answering (live-delta main
* stream + auto-continuation rounds). Mirrors the main streaming block in
* handlePrompt but without the empty-stream recovery / non-streaming
* fallback machinery — those only matter for the very first generation.
*
* When `postLiveDeltas` is true, every token is also forwarded to the
* webview as a `streamChunk`, giving the user a real-time view of the
* answer (and of continuation rounds) instead of one big drop at the end.
*
* Returns the accumulated text and the final stop reason. Aborts and
* stale runs surface as `aborted: true` and an empty/partial text — the
* caller decides what to do with that.
*/
private async streamChatOnce(params: {
runId: number;
useLmStudioSdk: boolean;
engine: 'lmstudio' | 'ollama';
ollamaUrl: string;
modelName: string;
messages: ChatMessage[];
temperature: number;
maxTokens: number;
contextLength: number;
contextOverflowPolicy: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
signal: AbortSignal;
postLiveDeltas: boolean;
}): Promise<{ text: string; stopReason?: string; aborted: boolean }> {
let accumulated = '';
let finishStopReason: string | undefined;
const post = (token: string) => {
if (params.postLiveDeltas && token) {
this.webview?.postMessage({ type: 'streamChunk', value: token });
}
};
if (params.useLmStudioSdk) {
try {
const stream = this.options.lmStudioStreamer!.stream({
modelName: params.modelName,
messages: params.messages.map((m) => ({ role: m.role, content: m.content })),
temperature: params.temperature,
maxTokens: params.maxTokens,
contextOverflowPolicy: params.contextOverflowPolicy,
signal: params.signal,
});
for await (const { token, stopReason } of stream) {
if (this.isStaleRun(params.runId)) {
return { text: accumulated, stopReason: finishStopReason, aborted: true };
}
if (token) {
accumulated += token;
post(token);
}
if (stopReason) finishStopReason = stopReason;
}
} catch (err: any) {
if (err?.name === 'AbortError' || params.signal.aborted) {
return { text: accumulated, stopReason: finishStopReason, aborted: true };
}
const msg = err?.message ?? String(err);
if (/context\s*length|contextlengthreached|exceed|too\s*long/i.test(msg)) {
finishStopReason = 'contextLengthReached';
}
logError('streamChatOnce SDK path failed.', { engine: params.engine, error: msg });
throw err;
}
return { text: accumulated, stopReason: finishStopReason, aborted: false };
}
const request = await this.createStreamingRequest({
baseUrl: params.ollamaUrl,
modelName: params.modelName,
reqMessages: params.messages,
temperature: params.temperature,
maxTokens: params.maxTokens,
contextLength: params.contextLength,
});
const reader = request.response.body?.getReader();
if (!reader) throw new Error('Response body is not readable.');
const decoder = new TextDecoder();
let buffer = '';
const consumeJsonLine = (line: string) => {
const trimmed = line.trim();
if (!trimmed || trimmed === 'data: [DONE]') return;
try {
const raw = trimmed.startsWith('data: ') ? trimmed.slice(6) : trimmed;
const json = JSON.parse(raw);
const token = params.engine === 'lmstudio'
? json.choices?.[0]?.delta?.content || ''
: json.message?.content || json.response || '';
if (token) {
accumulated += token;
post(token);
}
const fr = params.engine === 'lmstudio'
? json.choices?.[0]?.finish_reason
: (json.done_reason ?? (json.done === true ? 'stop' : undefined));
if (fr) finishStopReason = fr;
} catch (e: any) {
logError('streamChatOnce: failed to parse chunk.', { engine: params.engine, chunk: summarizeText(trimmed, 200), error: e?.message ?? String(e) });
}
};
try {
while (true) {
const { done, value } = await reader.read();
if (done) break;
if (this.isStaleRun(params.runId)) {
return { text: accumulated, stopReason: finishStopReason, aborted: true };
}
buffer += decoder.decode(value, { stream: true });
const lines = buffer.split('\n');
buffer = lines.pop() || '';
for (const line of lines) consumeJsonLine(line);
}
if (buffer.trim()) consumeJsonLine(buffer);
} catch (err: any) {
if (err?.name === 'AbortError') {
return { text: accumulated, stopReason: finishStopReason, aborted: true };
}
logError('streamChatOnce REST path failed.', { engine: params.engine, error: err?.message ?? String(err) });
throw err;
} finally {
try { reader.releaseLock(); } catch { /* already released on abort */ }
}
return { text: accumulated, stopReason: finishStopReason, aborted: false };
}
private normalizeMessages(messages: ChatMessage[]) {
return messages.map((message) => {
const normalizedContent = typeof message.content === 'string'