refactor: optimize core engine and retrieval logic for v2.80.43
This commit is contained in:
+391
-11
@@ -30,6 +30,7 @@ import { StatusBarManager, AgentStatus } from './core/statusBar';
|
||||
import { lockManager } from './core/lock';
|
||||
import { actionQueue } from './core/queue';
|
||||
import { ConflictResolver } from './core/conflict';
|
||||
import { recordTelemetry } from './core/telemetry';
|
||||
import {
|
||||
buildSecondBrainTrace,
|
||||
enforceProjectClaimPolicyInAnswer,
|
||||
@@ -40,6 +41,8 @@ import {
|
||||
import { MemoryManager } from './memory';
|
||||
import { RetrievalOrchestrator } from './retrieval';
|
||||
import { buildLessonChecklistBlock, isQaRegressionFeedback, findUnaddressedChecklistItems } from './retrieval/lessonHelpers';
|
||||
import { embedQuery, embedTexts } from './retrieval/embeddings';
|
||||
import { backfillBrainEmbeddings } from './retrieval/brainIndex';
|
||||
import { resolveScopeForAgent } from './skills/agentKnowledgeMap';
|
||||
import {
|
||||
extractVisibleFinal,
|
||||
@@ -117,6 +120,51 @@ const AGENT_PROMPTS: Record<AgentRole, string> = {
|
||||
3. Deliver a logical, consistent, and polished response.`
|
||||
};
|
||||
|
||||
/**
|
||||
* Compact recent chat sessions for medium-term memory retrieval.
|
||||
*
|
||||
* Returns up to `limit + 5` recently-touched sessions (excluding the active
|
||||
* one) as small summaries: title + first user message + tail of the last
|
||||
* assistant message. The retrieval orchestrator then scores these against the
|
||||
* current query and selects the top `limit` matches inside the shared budget.
|
||||
*
|
||||
* We pull a few more than `limit` so TF-IDF scoring has room to rerank — the
|
||||
* persisted list is timestamp-ordered, which isn't the same as topical fit.
|
||||
*/
|
||||
function compactRecentSessions(
|
||||
rawSessions: any[],
|
||||
activeSessionId: string | null,
|
||||
limit: number,
|
||||
): Array<{ id: string; title: string; firstUserMsg: string; lastAssistantExcerpt: string; summary?: string; timestamp: number }> {
|
||||
if (!Array.isArray(rawSessions) || rawSessions.length === 0 || limit <= 0) return [];
|
||||
const pool = rawSessions.length > limit + 5 ? limit + 5 : rawSessions.length;
|
||||
const out: Array<{ id: string; title: string; firstUserMsg: string; lastAssistantExcerpt: string; summary?: string; timestamp: number }> = [];
|
||||
for (let i = 0; i < rawSessions.length && out.length < pool; i++) {
|
||||
const s = rawSessions[i];
|
||||
if (!s || typeof s !== 'object') continue;
|
||||
const id = String(s.id ?? '');
|
||||
if (!id || id === activeSessionId) continue;
|
||||
const history: any[] = Array.isArray(s.history) ? s.history : [];
|
||||
if (history.length === 0) continue;
|
||||
const firstUser = history.find((m) => m?.role === 'user');
|
||||
const lastAssistant = [...history].reverse().find((m) => m?.role === 'assistant');
|
||||
const firstUserMsg = String(firstUser?.content ?? '').replace(/\s+/g, ' ').trim().slice(0, 200);
|
||||
const lastTxt = String(lastAssistant?.content ?? '').replace(/\s+/g, ' ').trim();
|
||||
const lastAssistantExcerpt = lastTxt.length <= 200 ? lastTxt : lastTxt.slice(-200);
|
||||
const summary = typeof s.summary === 'string' ? s.summary.trim().slice(0, 600) : undefined;
|
||||
if (!firstUserMsg && !lastAssistantExcerpt && !summary) continue;
|
||||
out.push({
|
||||
id,
|
||||
title: String(s.title ?? '').trim() || firstUserMsg.slice(0, 50),
|
||||
firstUserMsg,
|
||||
lastAssistantExcerpt,
|
||||
summary,
|
||||
timestamp: typeof s.timestamp === 'number' ? s.timestamp : 0,
|
||||
});
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
// Local-path detectors used to decide whether a user prompt refers to a file/dir on disk.
|
||||
// POSIX: /Volumes/, /Users/, /home/, /opt/, ... or ~/ — backtick excluded (markdown code spans).
|
||||
const POSIX_ABS_PATH_SRC = "(?:\\/(?:Volumes|Users|home|opt|srv|mnt|data|workspace)\\/|~\\/)[^\\s`\"'<>|*?]+";
|
||||
@@ -328,6 +376,10 @@ export class AgentExecutor {
|
||||
|
||||
if (!this.webview) return;
|
||||
|
||||
// Telemetry: wall-clock start of the user-visible turn. Only meaningful
|
||||
// at loopDepth===0 (action-loop recursions roll up into the same turn).
|
||||
const turnStartMs = loopDepth === 0 ? Date.now() : 0;
|
||||
|
||||
try {
|
||||
// 0. Safety Check: Rollback any dangling transaction from previous runs
|
||||
if (this.transactionManager.isActive()) {
|
||||
@@ -471,9 +523,19 @@ export class AgentExecutor {
|
||||
const secondBrainTraceCtx = secondBrainTrace
|
||||
? `\n\n${renderSecondBrainTraceContext(secondBrainTrace)}`
|
||||
: '';
|
||||
const retrievalStartMs = Date.now();
|
||||
const memoryCtx = isCasualConversation
|
||||
? ''
|
||||
: this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile);
|
||||
: await this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile);
|
||||
if (loopDepth === 0 && !isCasualConversation && this._lastRetrievalInfo) {
|
||||
recordTelemetry({
|
||||
kind: 'retrieval',
|
||||
durationMs: Date.now() - retrievalStartMs,
|
||||
brainFiles: this._lastRetrievalInfo.usedBrainFiles.length,
|
||||
memoryLayers: this._lastRetrievalInfo.usedMemoryLayers,
|
||||
note: `chunks=${this._lastRetrievalInfo.selectedChunks}/${this._lastRetrievalInfo.totalChunks} lessons=${this._lastRetrievalInfo.lessonFiles.length}`,
|
||||
});
|
||||
}
|
||||
const knowledgeContextForPrompt = isCasualConversation
|
||||
? ''
|
||||
: `${brainContext}${brainInventoryCtx}`;
|
||||
@@ -677,6 +739,16 @@ export class AgentExecutor {
|
||||
this.options.onStreamLifecycle?.start();
|
||||
}
|
||||
|
||||
// Progressive answering: live-stream tokens to the webview during
|
||||
// the user-visible first turn (loopDepth === 0). The bubble fills
|
||||
// as the model generates instead of dropping all at once at the end,
|
||||
// and any auto-continuation rounds keep posting deltas through the
|
||||
// same channel. Post-processing (reasoning strip / sanitize /
|
||||
// policy enforcement) emits a final `streamReplace` so the bubble
|
||||
// ends up matching the cleaned answer regardless of what slipped
|
||||
// through live.
|
||||
const postLiveDeltas = loopDepth === 0;
|
||||
|
||||
if (useLmStudioSdk) {
|
||||
apiUrl = `${ollamaUrl} (sdk)`;
|
||||
logInfo('Streaming chat via LM Studio SDK.', { model: actualModel });
|
||||
@@ -691,7 +763,10 @@ export class AgentExecutor {
|
||||
});
|
||||
for await (const { token, stopReason } of stream) {
|
||||
if (this.isStaleRun(runId)) return;
|
||||
if (token) aiResponseText += token;
|
||||
if (token) {
|
||||
aiResponseText += token;
|
||||
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
|
||||
}
|
||||
if (stopReason) finishStopReason = stopReason;
|
||||
}
|
||||
} catch (err: any) {
|
||||
@@ -747,6 +822,7 @@ export class AgentExecutor {
|
||||
const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
|
||||
if (token) {
|
||||
aiResponseText += token;
|
||||
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
|
||||
}
|
||||
const fr = engine === 'lmstudio'
|
||||
? json.choices?.[0]?.finish_reason
|
||||
@@ -778,6 +854,7 @@ export class AgentExecutor {
|
||||
const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
|
||||
if (token) {
|
||||
aiResponseText += token;
|
||||
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
|
||||
}
|
||||
const fr = engine === 'lmstudio'
|
||||
? json.choices?.[0]?.finish_reason
|
||||
@@ -829,7 +906,10 @@ export class AgentExecutor {
|
||||
let retryText = '';
|
||||
for await (const { token, stopReason } of retryStream) {
|
||||
if (this.isStaleRun(runId)) return;
|
||||
if (token) retryText += token;
|
||||
if (token) {
|
||||
retryText += token;
|
||||
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
|
||||
}
|
||||
if (stopReason) finishStopReason = stopReason;
|
||||
}
|
||||
if (retryText.trim()) {
|
||||
@@ -922,6 +1002,7 @@ export class AgentExecutor {
|
||||
&& !this.isStaleRun(runId)
|
||||
) {
|
||||
continuationCount++;
|
||||
const continuationStartMs = Date.now();
|
||||
this.webview.postMessage({ type: 'autoContinue', value: `답변이 길어 이어서 정리하는 중입니다... (${continuationCount}/${config.maxAutoContinuations})` });
|
||||
try {
|
||||
const contMsgs: ChatMessage[] = [
|
||||
@@ -929,11 +1010,24 @@ export class AgentExecutor {
|
||||
{ role: 'user', content: buildContinuationUserPrompt(originalUserPrompt, cleaned.visible) },
|
||||
];
|
||||
lastMaxOutputTokens = computeOutputBudget(estimateMessagesTokens(contMsgs), ctxLimits).maxOutputTokens;
|
||||
const cr = await this.callNonStreaming({
|
||||
baseUrl: ollamaUrl, modelName: actualModel, engine, messages: contMsgs,
|
||||
temperature, maxTokens: lastMaxOutputTokens, contextLength: ctxLimits.contextLength,
|
||||
signal: this.abortController?.signal,
|
||||
// Stream the continuation through the same channel as the main turn so
|
||||
// the user sees the answer keep growing instead of freezing for 10–30s
|
||||
// while we silently call non-streaming. The trailing streamReplace
|
||||
// (after sanitize / merge) corrects any overlap the model re-emits.
|
||||
const cr = await this.streamChatOnce({
|
||||
runId, useLmStudioSdk, engine, ollamaUrl, modelName: actualModel,
|
||||
messages: contMsgs,
|
||||
temperature,
|
||||
maxTokens: lastMaxOutputTokens,
|
||||
contextLength: ctxLimits.contextLength,
|
||||
contextOverflowPolicy: config.contextOverflowPolicy,
|
||||
signal: this.abortController!.signal,
|
||||
postLiveDeltas,
|
||||
});
|
||||
if (cr.aborted) {
|
||||
logInfo('Auto-continuation aborted mid-stream.', { model: actualModel, round: continuationCount });
|
||||
break;
|
||||
}
|
||||
finishStopReason = cr.stopReason;
|
||||
const ccl = extractVisibleFinal(cr.text);
|
||||
if (!ccl.visible.trim()) {
|
||||
@@ -944,6 +1038,15 @@ export class AgentExecutor {
|
||||
cleaned = { ...cleaned, visible: mergeContinuationParts(cleaned.visible, ccl.visible), wasThoughtOnly: false };
|
||||
lastOutputTokens = estimateTokens(ccl.visible);
|
||||
logInfo('Auto-continued the answer.', { model: actualModel, round: continuationCount, addedChars: ccl.visible.length, totalChars: cleaned.visible.length, contStopReason: cr.stopReason, contMaxTokens: lastMaxOutputTokens });
|
||||
recordTelemetry({
|
||||
kind: 'continuation',
|
||||
durationMs: Date.now() - continuationStartMs,
|
||||
model: actualModel, engine,
|
||||
outputTokens: lastOutputTokens,
|
||||
round: continuationCount,
|
||||
stopReason: cr.stopReason,
|
||||
note: `addedChars=${ccl.visible.length} mergedAdd=${cleaned.visible.length - before.length}`,
|
||||
});
|
||||
// Guard against a continuation that adds (almost) nothing new after dedup — stop instead of spinning.
|
||||
if (cleaned.visible.length - before.length < 20) {
|
||||
logInfo('Continuation added negligible new text — stopping.', { model: actualModel, round: continuationCount });
|
||||
@@ -1099,7 +1202,32 @@ export class AgentExecutor {
|
||||
value: { ...this._lastRetrievalInfo, hasAgentSelected: !!options.agentSkillFile, unaddressedChecklist },
|
||||
});
|
||||
}
|
||||
this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent });
|
||||
// Progressive answering: the bubble was filled live with raw tokens
|
||||
// during streaming (and during any auto-continuation rounds). Now
|
||||
// that we have the cleaned + merged + policy-enforced text, swap the
|
||||
// bubble's content for the final version so the user sees the
|
||||
// correct answer regardless of what slipped through live —
|
||||
// hidden reasoning, mid-stream artifacts, continuation-overlap re-
|
||||
// emits, truncation notice. Action-loop turns (loopDepth > 0) still
|
||||
// append via streamChunk because the bubble has multiple action
|
||||
// segments and we don't have a single "final" to replace with.
|
||||
if (loopDepth === 0) {
|
||||
this.webview.postMessage({ type: 'streamReplace', value: finalAssistantContent });
|
||||
recordTelemetry({
|
||||
kind: 'turn',
|
||||
durationMs: Date.now() - turnStartMs,
|
||||
model: actualModel, engine,
|
||||
inputTokens,
|
||||
outputTokens,
|
||||
contextLength: ctxLimits.contextLength,
|
||||
stopReason: finishStopReason,
|
||||
brainFiles: this._lastRetrievalInfo?.usedBrainFiles.length ?? 0,
|
||||
memoryLayers: this._lastRetrievalInfo?.usedMemoryLayers ?? [],
|
||||
note: `continuations=${continuationCount} historyDropped=${reqMessages.length - budgetedHistory.length}`,
|
||||
});
|
||||
} else {
|
||||
this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent });
|
||||
}
|
||||
|
||||
} catch (error: any) {
|
||||
this.statusBarManager.updateStatus(AgentStatus.Error, error.message);
|
||||
@@ -2309,7 +2437,7 @@ export class AgentExecutor {
|
||||
});
|
||||
}
|
||||
|
||||
private buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): string {
|
||||
private async buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): Promise<string> {
|
||||
const config = getConfig();
|
||||
this._lastRetrievalInfo = null;
|
||||
this._lastLessonContents = [];
|
||||
@@ -2331,6 +2459,44 @@ export class AgentExecutor {
|
||||
// keeping the legacy behavior intact.
|
||||
const scope = resolveScopeForAgent(agentSkillFile, activeBrain.localBrainPath);
|
||||
|
||||
// Scale retrieval/memory budget with the configured context window so
|
||||
// that raising g1nation.contextLength actually gives the RAG pipeline
|
||||
// more room. At 32K context we keep the legacy 8K total (≈3.2K
|
||||
// retrieval); at 230K we allocate ~57K total (≈23K retrieval). Capped
|
||||
// at 80K so scoring stays fast on huge contexts.
|
||||
const scaledTotalBudget = Math.min(
|
||||
80000,
|
||||
Math.max(8000, Math.floor(config.contextLength * 0.25))
|
||||
);
|
||||
|
||||
// Pull recent session summaries for the medium-term layer. We read
|
||||
// from the sidebar's persisted store directly (same key it writes to)
|
||||
// to avoid threading another callback through the agent constructor.
|
||||
const rawSessions = this.context.globalState.get<any[]>('chat_sessions', []) || [];
|
||||
const recentSessions = compactRecentSessions(
|
||||
rawSessions,
|
||||
this.currentTaskId,
|
||||
Math.max(0, config.memoryMediumTermSessions ?? 0)
|
||||
);
|
||||
|
||||
// Hybrid retrieval (optional): when the user has configured an
|
||||
// embedding model, fetch a query embedding so searchBrainFiles can
|
||||
// blend cosine similarity with TF-IDF. Time-bounded — if the
|
||||
// embedding endpoint is slow or down, we fall through with no
|
||||
// embedding and the retriever stays in pure-TF-IDF mode.
|
||||
let queryEmbedding: number[] | undefined;
|
||||
if (config.embeddingModel) {
|
||||
const EMBED_QUERY_TIMEOUT_MS = 4000;
|
||||
try {
|
||||
queryEmbedding = await Promise.race([
|
||||
embedQuery(currentPrompt, { baseUrl: config.ollamaUrl, model: config.embeddingModel }),
|
||||
new Promise<undefined>((resolve) => setTimeout(() => resolve(undefined), EMBED_QUERY_TIMEOUT_MS)),
|
||||
]);
|
||||
} catch {
|
||||
queryEmbedding = undefined;
|
||||
}
|
||||
}
|
||||
|
||||
// Use the Unified RAG Pipeline
|
||||
const result = this.retrievalOrchestrator.retrieve(currentPrompt, {
|
||||
brain: activeBrain,
|
||||
@@ -2338,13 +2504,36 @@ export class AgentExecutor {
|
||||
workspacePath,
|
||||
chatHistory: visibleHistory,
|
||||
contextBudget: {
|
||||
totalBudget: 8000,
|
||||
totalBudget: scaledTotalBudget,
|
||||
retrievalRatio: 0.4
|
||||
},
|
||||
brainFileLimit: config.memoryLongTermFiles,
|
||||
scopeFolders: scope.folders
|
||||
scopeFolders: scope.folders,
|
||||
recentSessions,
|
||||
mediumTermLimit: config.memoryMediumTermSessions ?? 0,
|
||||
queryEmbedding,
|
||||
embeddingModel: config.embeddingModel || undefined,
|
||||
embeddingBlendAlpha: config.embeddingBlendAlpha,
|
||||
});
|
||||
|
||||
// Fire-and-forget background embedding for the files we just scored.
|
||||
// Embeds only files that lack a vector for the current model — so
|
||||
// steady-state turns do no embedding work. The next turn benefits.
|
||||
if (config.embeddingModel) {
|
||||
const scoredFilePaths = result.selectedChunks
|
||||
.filter((c) => c.source === 'brain-memory' && c.metadata.filePath)
|
||||
.map((c) => c.metadata.filePath!)
|
||||
.filter((p, i, arr) => arr.indexOf(p) === i);
|
||||
if (scoredFilePaths.length > 0) {
|
||||
void backfillBrainEmbeddings(
|
||||
activeBrain.localBrainPath,
|
||||
scoredFilePaths,
|
||||
config.embeddingModel,
|
||||
(texts) => embedTexts(texts, { baseUrl: config.ollamaUrl, model: config.embeddingModel }),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Stash what actually fed this turn so handlePrompt can show it under the answer.
|
||||
const brainRoot = activeBrain.localBrainPath;
|
||||
const rel = (p?: string) => (p ? (path.relative(brainRoot, p) || p) : '');
|
||||
@@ -2406,11 +2595,74 @@ export class AgentExecutor {
|
||||
workspacePath
|
||||
);
|
||||
logInfo('Memory extraction completed for session end.', { taskId: this.currentTaskId });
|
||||
recordTelemetry({
|
||||
kind: 'session-end',
|
||||
note: `taskId=${this.currentTaskId} messages=${this.chatHistory.filter((m) => !m.internal).length}`,
|
||||
});
|
||||
// Fire-and-forget LLM compression: turns the raw transcript into a
|
||||
// 2–3 sentence summary that medium-term retrieval can use instead
|
||||
// of just "first user msg + last assistant 200 chars". Cheap call
|
||||
// (~256 output tokens), runs in the background so it never blocks
|
||||
// the next chat turn.
|
||||
void this.compressSessionSummary(this.currentTaskId, this.chatHistory.slice());
|
||||
} catch (error: any) {
|
||||
logError('Memory extraction failed on session end.', { error: error?.message || String(error) });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compress a finished session into a short summary and persist it to the
|
||||
* session record. The summary is later read by `compactRecentSessions` so
|
||||
* the medium-term memory layer carries a real recap instead of a fragment.
|
||||
*
|
||||
* Skips sessions with fewer than 3 visible messages — they're typically
|
||||
* single-question pings where the raw first message is already a good
|
||||
* summary. Failures are logged and swallowed: a missing summary just
|
||||
* falls back to the legacy "first user msg" representation.
|
||||
*/
|
||||
private async compressSessionSummary(taskId: string, history: ChatMessage[]): Promise<void> {
|
||||
const visible = history.filter((m) => !m.internal && (m.role === 'user' || m.role === 'assistant'));
|
||||
if (visible.length < 3) return;
|
||||
const cfg = getConfig();
|
||||
const transcript = visible
|
||||
.map((m) => `${m.role.toUpperCase()}: ${String(m.content).replace(/\s+/g, ' ').slice(0, 400)}`)
|
||||
.join('\n\n');
|
||||
const messages: ChatMessage[] = [
|
||||
{
|
||||
role: 'system',
|
||||
content: [
|
||||
'You compress chat transcripts into a 2-3 sentence summary.',
|
||||
'Capture: (1) the user\'s topic or task, (2) the main decision or answer reached, (3) any open issue.',
|
||||
'Reply in the user\'s primary language (mirror Korean ↔ English exactly as in the transcript).',
|
||||
'Reply with ONLY the summary text. No headers, no quotes, no preamble.',
|
||||
].join(' '),
|
||||
internal: true,
|
||||
},
|
||||
{ role: 'user', content: `[TRANSCRIPT]\n${transcript}\n[END]` },
|
||||
];
|
||||
try {
|
||||
const result = await this.callNonStreaming({
|
||||
baseUrl: cfg.ollamaUrl,
|
||||
modelName: cfg.defaultModel,
|
||||
engine: resolveEngine(cfg.ollamaUrl),
|
||||
messages,
|
||||
temperature: 0.3,
|
||||
maxTokens: 256,
|
||||
contextLength: cfg.contextLength,
|
||||
});
|
||||
const summary = (result.text || '').trim().replace(/^["'`]+|["'`]+$/g, '');
|
||||
if (!summary || summary.length < 12) return;
|
||||
const sessions = this.context.globalState.get<any[]>('chat_sessions', []) || [];
|
||||
const idx = sessions.findIndex((s) => String(s?.id) === String(taskId));
|
||||
if (idx < 0) return;
|
||||
sessions[idx].summary = summary;
|
||||
await this.context.globalState.update('chat_sessions', sessions);
|
||||
logInfo('Session summary stored for medium-term recall.', { taskId, length: summary.length });
|
||||
} catch (e: any) {
|
||||
logError('Session summary compression failed.', { taskId, error: e?.message ?? String(e) });
|
||||
}
|
||||
}
|
||||
|
||||
private async createStreamingRequest(params: {
|
||||
baseUrl: string;
|
||||
modelName: string;
|
||||
@@ -2568,6 +2820,134 @@ export class AgentExecutor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Single streaming call used by progressive answering (live-delta main
|
||||
* stream + auto-continuation rounds). Mirrors the main streaming block in
|
||||
* handlePrompt but without the empty-stream recovery / non-streaming
|
||||
* fallback machinery — those only matter for the very first generation.
|
||||
*
|
||||
* When `postLiveDeltas` is true, every token is also forwarded to the
|
||||
* webview as a `streamChunk`, giving the user a real-time view of the
|
||||
* answer (and of continuation rounds) instead of one big drop at the end.
|
||||
*
|
||||
* Returns the accumulated text and the final stop reason. Aborts and
|
||||
* stale runs surface as `aborted: true` and an empty/partial text — the
|
||||
* caller decides what to do with that.
|
||||
*/
|
||||
private async streamChatOnce(params: {
|
||||
runId: number;
|
||||
useLmStudioSdk: boolean;
|
||||
engine: 'lmstudio' | 'ollama';
|
||||
ollamaUrl: string;
|
||||
modelName: string;
|
||||
messages: ChatMessage[];
|
||||
temperature: number;
|
||||
maxTokens: number;
|
||||
contextLength: number;
|
||||
contextOverflowPolicy: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
|
||||
signal: AbortSignal;
|
||||
postLiveDeltas: boolean;
|
||||
}): Promise<{ text: string; stopReason?: string; aborted: boolean }> {
|
||||
let accumulated = '';
|
||||
let finishStopReason: string | undefined;
|
||||
const post = (token: string) => {
|
||||
if (params.postLiveDeltas && token) {
|
||||
this.webview?.postMessage({ type: 'streamChunk', value: token });
|
||||
}
|
||||
};
|
||||
|
||||
if (params.useLmStudioSdk) {
|
||||
try {
|
||||
const stream = this.options.lmStudioStreamer!.stream({
|
||||
modelName: params.modelName,
|
||||
messages: params.messages.map((m) => ({ role: m.role, content: m.content })),
|
||||
temperature: params.temperature,
|
||||
maxTokens: params.maxTokens,
|
||||
contextOverflowPolicy: params.contextOverflowPolicy,
|
||||
signal: params.signal,
|
||||
});
|
||||
for await (const { token, stopReason } of stream) {
|
||||
if (this.isStaleRun(params.runId)) {
|
||||
return { text: accumulated, stopReason: finishStopReason, aborted: true };
|
||||
}
|
||||
if (token) {
|
||||
accumulated += token;
|
||||
post(token);
|
||||
}
|
||||
if (stopReason) finishStopReason = stopReason;
|
||||
}
|
||||
} catch (err: any) {
|
||||
if (err?.name === 'AbortError' || params.signal.aborted) {
|
||||
return { text: accumulated, stopReason: finishStopReason, aborted: true };
|
||||
}
|
||||
const msg = err?.message ?? String(err);
|
||||
if (/context\s*length|contextlengthreached|exceed|too\s*long/i.test(msg)) {
|
||||
finishStopReason = 'contextLengthReached';
|
||||
}
|
||||
logError('streamChatOnce SDK path failed.', { engine: params.engine, error: msg });
|
||||
throw err;
|
||||
}
|
||||
return { text: accumulated, stopReason: finishStopReason, aborted: false };
|
||||
}
|
||||
|
||||
const request = await this.createStreamingRequest({
|
||||
baseUrl: params.ollamaUrl,
|
||||
modelName: params.modelName,
|
||||
reqMessages: params.messages,
|
||||
temperature: params.temperature,
|
||||
maxTokens: params.maxTokens,
|
||||
contextLength: params.contextLength,
|
||||
});
|
||||
const reader = request.response.body?.getReader();
|
||||
if (!reader) throw new Error('Response body is not readable.');
|
||||
const decoder = new TextDecoder();
|
||||
let buffer = '';
|
||||
const consumeJsonLine = (line: string) => {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed || trimmed === 'data: [DONE]') return;
|
||||
try {
|
||||
const raw = trimmed.startsWith('data: ') ? trimmed.slice(6) : trimmed;
|
||||
const json = JSON.parse(raw);
|
||||
const token = params.engine === 'lmstudio'
|
||||
? json.choices?.[0]?.delta?.content || ''
|
||||
: json.message?.content || json.response || '';
|
||||
if (token) {
|
||||
accumulated += token;
|
||||
post(token);
|
||||
}
|
||||
const fr = params.engine === 'lmstudio'
|
||||
? json.choices?.[0]?.finish_reason
|
||||
: (json.done_reason ?? (json.done === true ? 'stop' : undefined));
|
||||
if (fr) finishStopReason = fr;
|
||||
} catch (e: any) {
|
||||
logError('streamChatOnce: failed to parse chunk.', { engine: params.engine, chunk: summarizeText(trimmed, 200), error: e?.message ?? String(e) });
|
||||
}
|
||||
};
|
||||
try {
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
if (done) break;
|
||||
if (this.isStaleRun(params.runId)) {
|
||||
return { text: accumulated, stopReason: finishStopReason, aborted: true };
|
||||
}
|
||||
buffer += decoder.decode(value, { stream: true });
|
||||
const lines = buffer.split('\n');
|
||||
buffer = lines.pop() || '';
|
||||
for (const line of lines) consumeJsonLine(line);
|
||||
}
|
||||
if (buffer.trim()) consumeJsonLine(buffer);
|
||||
} catch (err: any) {
|
||||
if (err?.name === 'AbortError') {
|
||||
return { text: accumulated, stopReason: finishStopReason, aborted: true };
|
||||
}
|
||||
logError('streamChatOnce REST path failed.', { engine: params.engine, error: err?.message ?? String(err) });
|
||||
throw err;
|
||||
} finally {
|
||||
try { reader.releaseLock(); } catch { /* already released on abort */ }
|
||||
}
|
||||
return { text: accumulated, stopReason: finishStopReason, aborted: false };
|
||||
}
|
||||
|
||||
private normalizeMessages(messages: ChatMessage[]) {
|
||||
return messages.map((message) => {
|
||||
const normalizedContent = typeof message.content === 'string'
|
||||
|
||||
Reference in New Issue
Block a user