refactor: optimize core engine and retrieval logic for v2.80.43

2026-05-13 19:23:57 +09:00
parent c4260466b9
commit 089abf22db
17 changed files with 1311 additions and 88 deletions
@@ -30,6 +30,7 @@ import { StatusBarManager, AgentStatus } from './core/statusBar';
 import { lockManager } from './core/lock';
 import { actionQueue } from './core/queue';
 import { ConflictResolver } from './core/conflict';
+import { recordTelemetry } from './core/telemetry';
 import {
    buildSecondBrainTrace,
    enforceProjectClaimPolicyInAnswer,
@@ -40,6 +41,8 @@ import {
 import { MemoryManager } from './memory';
 import { RetrievalOrchestrator } from './retrieval';
 import { buildLessonChecklistBlock, isQaRegressionFeedback, findUnaddressedChecklistItems } from './retrieval/lessonHelpers';
+import { embedQuery, embedTexts } from './retrieval/embeddings';
+import { backfillBrainEmbeddings } from './retrieval/brainIndex';
 import { resolveScopeForAgent } from './skills/agentKnowledgeMap';
 import {
    extractVisibleFinal,
@@ -117,6 +120,51 @@ const AGENT_PROMPTS: Record<AgentRole, string> = {
 3. Deliver a logical, consistent, and polished response.`
 };

+/**
+ * Compact recent chat sessions for medium-term memory retrieval.
+ *
+ * Walks `rawSessions` in stored order and returns at most `limit + 5` usable
+ * entries. Each entry is a small summary: title, first user message (first
+ * 200 chars), tail of the last assistant message (last 200 chars) and, when
+ * the session record carries one, its stored `summary` (first 600 chars).
+ *
+ * Skipped records: non-objects, records without an id, the active session,
+ * sessions with an empty/missing `history`, and sessions that yield no text.
+ *
+ * The extra 5 entries beyond `limit` give the caller's scorer room to rerank;
+ * this function does no sorting or scoring itself.
+ *
+ * @param rawSessions     Persisted session records (untyped; validated here).
+ * @param activeSessionId Id of the session to exclude, or null for none.
+ * @param limit           Target number of sessions; non-finite or <= 0 yields [].
+ * @returns Compact session summaries in the same order as `rawSessions`.
+ */
+function compactRecentSessions(
+    rawSessions: any[],
+    activeSessionId: string | null,
+    limit: number,
+): Array<{ id: string; title: string; firstUserMsg: string; lastAssistantExcerpt: string; summary?: string; timestamp: number }> {
+    // NaN slips past a plain `limit <= 0` check and would turn the pool into
+    // "every stored session", so reject non-finite limits explicitly.
+    if (!Array.isArray(rawSessions) || rawSessions.length === 0 || !Number.isFinite(limit) || limit <= 0) return [];
+    const pool = Math.min(rawSessions.length, limit + 5);
+    const out: ReturnType<typeof compactRecentSessions> = [];
+    // Collapse all whitespace runs so excerpts stay single-line and compact.
+    const squash = (value: unknown): string => String(value ?? '').replace(/\s+/g, ' ').trim();
+    for (let i = 0; i < rawSessions.length && out.length < pool; i++) {
+        const s = rawSessions[i];
+        if (!s || typeof s !== 'object') continue;
+        const id = String(s.id ?? '');
+        if (!id || id === activeSessionId) continue;
+        const history: any[] = Array.isArray(s.history) ? s.history : [];
+        if (history.length === 0) continue;
+        const firstUser = history.find((m) => m?.role === 'user');
+        // Scan backwards instead of copying + reversing the whole history.
+        let lastAssistant: any;
+        for (let j = history.length - 1; j >= 0; j--) {
+            if (history[j]?.role === 'assistant') {
+                lastAssistant = history[j];
+                break;
+            }
+        }
+        const firstUserMsg = squash(firstUser?.content).slice(0, 200);
+        const lastTxt = squash(lastAssistant?.content);
+        const lastAssistantExcerpt = lastTxt.length <= 200 ? lastTxt : lastTxt.slice(-200);
+        // A whitespace-only stored summary counts as "no summary".
+        const trimmedSummary = typeof s.summary === 'string' ? s.summary.trim().slice(0, 600) : '';
+        const summary = trimmedSummary || undefined;
+        if (!firstUserMsg && !lastAssistantExcerpt && !summary) continue;
+        out.push({
+            id,
+            title: String(s.title ?? '').trim() || firstUserMsg.slice(0, 50),
+            firstUserMsg,
+            lastAssistantExcerpt,
+            summary,
+            timestamp: typeof s.timestamp === 'number' && Number.isFinite(s.timestamp) ? s.timestamp : 0,
+        });
+    }
+    return out;
+}
+
 // Local-path detectors used to decide whether a user prompt refers to a file/dir on disk.
 // POSIX: /Volumes/, /Users/, /home/, /opt/, ... or ~/  — backtick excluded (markdown code spans).
 const POSIX_ABS_PATH_SRC = "(?:\\/(?:Volumes|Users|home|opt|srv|mnt|data|workspace)\\/|~\\/)[^\\s`\"'<>|*?]+";
@@ -328,6 +376,10 @@ export class AgentExecutor {

        if (!this.webview) return;

+        // Telemetry: wall-clock start of the user-visible turn. Only meaningful
+        // at loopDepth===0 (action-loop recursions roll up into the same turn).
+        const turnStartMs = loopDepth === 0 ? Date.now() : 0;
+
        try {
            // 0. Safety Check: Rollback any dangling transaction from previous runs
            if (this.transactionManager.isActive()) {
@@ -471,9 +523,19 @@ export class AgentExecutor {
            const secondBrainTraceCtx = secondBrainTrace
                ? `\n\n${renderSecondBrainTraceContext(secondBrainTrace)}`
                : '';
+            const retrievalStartMs = Date.now();
            const memoryCtx = isCasualConversation
                ? ''
-                : this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile);
+                : await this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile);
+            if (loopDepth === 0 && !isCasualConversation && this._lastRetrievalInfo) {
+                recordTelemetry({
+                    kind: 'retrieval',
+                    durationMs: Date.now() - retrievalStartMs,
+                    brainFiles: this._lastRetrievalInfo.usedBrainFiles.length,
+                    memoryLayers: this._lastRetrievalInfo.usedMemoryLayers,
+                    note: `chunks=${this._lastRetrievalInfo.selectedChunks}/${this._lastRetrievalInfo.totalChunks} lessons=${this._lastRetrievalInfo.lessonFiles.length}`,
+                });
+            }
            const knowledgeContextForPrompt = isCasualConversation
                ? ''
                : `${brainContext}${brainInventoryCtx}`;
@@ -677,6 +739,16 @@ export class AgentExecutor {
                this.options.onStreamLifecycle?.start();
            }

+            // Progressive answering: live-stream tokens to the webview during
+            // the user-visible first turn (loopDepth === 0). The bubble fills
+            // as the model generates instead of dropping all at once at the end,
+            // and any auto-continuation rounds keep posting deltas through the
+            // same channel. Post-processing (reasoning strip / sanitize /
+            // policy enforcement) emits a final `streamReplace` so the bubble
+            // ends up matching the cleaned answer regardless of what slipped
+            // through live.
+            const postLiveDeltas = loopDepth === 0;
+
            if (useLmStudioSdk) {
                apiUrl = `${ollamaUrl} (sdk)`;
                logInfo('Streaming chat via LM Studio SDK.', { model: actualModel });
@@ -691,7 +763,10 @@ export class AgentExecutor {
                    });
                    for await (const { token, stopReason } of stream) {
                        if (this.isStaleRun(runId)) return;
-                        if (token) aiResponseText += token;
+                        if (token) {
+                            aiResponseText += token;
+                            if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
+                        }
                        if (stopReason) finishStopReason = stopReason;
                    }
                } catch (err: any) {
@@ -747,6 +822,7 @@ export class AgentExecutor {
                                const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
                                if (token) {
                                    aiResponseText += token;
+                                    if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
                                }
                                const fr = engine === 'lmstudio'
                                    ? json.choices?.[0]?.finish_reason
@@ -778,6 +854,7 @@ export class AgentExecutor {
                    const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
                    if (token) {
                        aiResponseText += token;
+                        if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
                    }
                    const fr = engine === 'lmstudio'
                        ? json.choices?.[0]?.finish_reason
@@ -829,7 +906,10 @@ export class AgentExecutor {
                        let retryText = '';
                        for await (const { token, stopReason } of retryStream) {
                            if (this.isStaleRun(runId)) return;
-                            if (token) retryText += token;
+                            if (token) {
+                                retryText += token;
+                                if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
+                            }
                            if (stopReason) finishStopReason = stopReason;
                        }
                        if (retryText.trim()) {
@@ -922,6 +1002,7 @@ export class AgentExecutor {
                    && !this.isStaleRun(runId)
                ) {
                    continuationCount++;
+                    const continuationStartMs = Date.now();
                    this.webview.postMessage({ type: 'autoContinue', value: `답변이 길어 이어서 정리하는 중입니다... (${continuationCount}/${config.maxAutoContinuations})` });
                    try {
                        const contMsgs: ChatMessage[] = [
@@ -929,11 +1010,24 @@ export class AgentExecutor {
                            { role: 'user', content: buildContinuationUserPrompt(originalUserPrompt, cleaned.visible) },
                        ];
                        lastMaxOutputTokens = computeOutputBudget(estimateMessagesTokens(contMsgs), ctxLimits).maxOutputTokens;
-                        const cr = await this.callNonStreaming({
-                            baseUrl: ollamaUrl, modelName: actualModel, engine, messages: contMsgs,
-                            temperature, maxTokens: lastMaxOutputTokens, contextLength: ctxLimits.contextLength,
-                            signal: this.abortController?.signal,
+                        // Stream the continuation through the same channel as the main turn so
+                        // the user sees the answer keep growing instead of freezing for 10–30s
+                        // while we silently call non-streaming. The trailing streamReplace
+                        // (after sanitize / merge) corrects any overlap the model re-emits.
+                        const cr = await this.streamChatOnce({
+                            runId, useLmStudioSdk, engine, ollamaUrl, modelName: actualModel,
+                            messages: contMsgs,
+                            temperature,
+                            maxTokens: lastMaxOutputTokens,
+                            contextLength: ctxLimits.contextLength,
+                            contextOverflowPolicy: config.contextOverflowPolicy,
+                            signal: this.abortController!.signal,
+                            postLiveDeltas,
                        });
+                        if (cr.aborted) {
+                            logInfo('Auto-continuation aborted mid-stream.', { model: actualModel, round: continuationCount });
+                            break;
+                        }
                        finishStopReason = cr.stopReason;
                        const ccl = extractVisibleFinal(cr.text);
                        if (!ccl.visible.trim()) {
@@ -944,6 +1038,15 @@ export class AgentExecutor {
                        cleaned = { ...cleaned, visible: mergeContinuationParts(cleaned.visible, ccl.visible), wasThoughtOnly: false };
                        lastOutputTokens = estimateTokens(ccl.visible);
                        logInfo('Auto-continued the answer.', { model: actualModel, round: continuationCount, addedChars: ccl.visible.length, totalChars: cleaned.visible.length, contStopReason: cr.stopReason, contMaxTokens: lastMaxOutputTokens });
+                        recordTelemetry({
+                            kind: 'continuation',
+                            durationMs: Date.now() - continuationStartMs,
+                            model: actualModel, engine,
+                            outputTokens: lastOutputTokens,
+                            round: continuationCount,
+                            stopReason: cr.stopReason,
+                            note: `addedChars=${ccl.visible.length} mergedAdd=${cleaned.visible.length - before.length}`,
+                        });
                        // Guard against a continuation that adds (almost) nothing new after dedup — stop instead of spinning.
                        if (cleaned.visible.length - before.length < 20) {
                            logInfo('Continuation added negligible new text — stopping.', { model: actualModel, round: continuationCount });
@@ -1099,7 +1202,32 @@ export class AgentExecutor {
                    value: { ...this._lastRetrievalInfo, hasAgentSelected: !!options.agentSkillFile, unaddressedChecklist },
                });
            }
-            this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent });
+            // Progressive answering: the bubble was filled live with raw tokens
+            // during streaming (and during any auto-continuation rounds). Now
+            // that we have the cleaned + merged + policy-enforced text, swap the
+            // bubble's content for the final version so the user sees the
+            // correct answer regardless of what slipped through live —
+            // hidden reasoning, mid-stream artifacts, continuation-overlap re-
+            // emits, truncation notice. Action-loop turns (loopDepth > 0) still
+            // append via streamChunk because the bubble has multiple action
+            // segments and we don't have a single "final" to replace with.
+            if (loopDepth === 0) {
+                this.webview.postMessage({ type: 'streamReplace', value: finalAssistantContent });
+                recordTelemetry({
+                    kind: 'turn',
+                    durationMs: Date.now() - turnStartMs,
+                    model: actualModel, engine,
+                    inputTokens,
+                    outputTokens,
+                    contextLength: ctxLimits.contextLength,
+                    stopReason: finishStopReason,
+                    brainFiles: this._lastRetrievalInfo?.usedBrainFiles.length ?? 0,
+                    memoryLayers: this._lastRetrievalInfo?.usedMemoryLayers ?? [],
+                    note: `continuations=${continuationCount} historyDropped=${reqMessages.length - budgetedHistory.length}`,
+                });
+            } else {
+                this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent });
+            }

        } catch (error: any) {
            this.statusBarManager.updateStatus(AgentStatus.Error, error.message);
@@ -2309,7 +2437,7 @@ export class AgentExecutor {
        });
    }

-    private buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): string {
+    private async buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): Promise<string> {
        const config = getConfig();
        this._lastRetrievalInfo = null;
        this._lastLessonContents = [];
@@ -2331,6 +2459,44 @@ export class AgentExecutor {
        // keeping the legacy behavior intact.
        const scope = resolveScopeForAgent(agentSkillFile, activeBrain.localBrainPath);

+        // Scale retrieval/memory budget with the configured context window so
+        // that raising g1nation.contextLength actually gives the RAG pipeline
+        // more room. At 32K context we keep the legacy 8K total (≈3.2K
+        // retrieval); at 230K we allocate ~57K total (≈23K retrieval). Capped
+        // at 80K so scoring stays fast on huge contexts.
+        const scaledTotalBudget = Math.min(
+            80000,
+            Math.max(8000, Math.floor(config.contextLength * 0.25))
+        );
+
+        // Pull recent session summaries for the medium-term layer. We read
+        // from the sidebar's persisted store directly (same key it writes to)
+        // to avoid threading another callback through the agent constructor.
+        const rawSessions = this.context.globalState.get<any[]>('chat_sessions', []) || [];
+        const recentSessions = compactRecentSessions(
+            rawSessions,
+            this.currentTaskId,
+            Math.max(0, config.memoryMediumTermSessions ?? 0)
+        );
+
+        // Hybrid retrieval (optional): when the user has configured an
+        // embedding model, fetch a query embedding so searchBrainFiles can
+        // blend cosine similarity with TF-IDF. Time-bounded — if the
+        // embedding endpoint is slow or down, we fall through with no
+        // embedding and the retriever stays in pure-TF-IDF mode.
+        let queryEmbedding: number[] | undefined;
+        if (config.embeddingModel) {
+            const EMBED_QUERY_TIMEOUT_MS = 4000;
+            try {
+                queryEmbedding = await Promise.race([
+                    embedQuery(currentPrompt, { baseUrl: config.ollamaUrl, model: config.embeddingModel }),
+                    new Promise<undefined>((resolve) => setTimeout(() => resolve(undefined), EMBED_QUERY_TIMEOUT_MS)),
+                ]);
+            } catch {
+                queryEmbedding = undefined;
+            }
+        }
+
        // Use the Unified RAG Pipeline
        const result = this.retrievalOrchestrator.retrieve(currentPrompt, {
            brain: activeBrain,
@@ -2338,13 +2504,36 @@ export class AgentExecutor {
            workspacePath,
            chatHistory: visibleHistory,
            contextBudget: {
-                totalBudget: 8000,
+                totalBudget: scaledTotalBudget,
                retrievalRatio: 0.4
            },
            brainFileLimit: config.memoryLongTermFiles,
-            scopeFolders: scope.folders
+            scopeFolders: scope.folders,
+            recentSessions,
+            mediumTermLimit: config.memoryMediumTermSessions ?? 0,
+            queryEmbedding,
+            embeddingModel: config.embeddingModel || undefined,
+            embeddingBlendAlpha: config.embeddingBlendAlpha,
        });

+        // Fire-and-forget background embedding for the files we just scored.
+        // Embeds only files that lack a vector for the current model — so
+        // steady-state turns do no embedding work. The next turn benefits.
+        if (config.embeddingModel) {
+            const scoredFilePaths = result.selectedChunks
+                .filter((c) => c.source === 'brain-memory' && c.metadata.filePath)
+                .map((c) => c.metadata.filePath!)
+                .filter((p, i, arr) => arr.indexOf(p) === i);
+            if (scoredFilePaths.length > 0) {
+                void backfillBrainEmbeddings(
+                    activeBrain.localBrainPath,
+                    scoredFilePaths,
+                    config.embeddingModel,
+                    (texts) => embedTexts(texts, { baseUrl: config.ollamaUrl, model: config.embeddingModel }),
+                );
+            }
+        }
+
        // Stash what actually fed this turn so handlePrompt can show it under the answer.
        const brainRoot = activeBrain.localBrainPath;
        const rel = (p?: string) => (p ? (path.relative(brainRoot, p) || p) : '');
@@ -2406,11 +2595,74 @@ export class AgentExecutor {
                workspacePath
            );
            logInfo('Memory extraction completed for session end.', { taskId: this.currentTaskId });
+            recordTelemetry({
+                kind: 'session-end',
+                note: `taskId=${this.currentTaskId} messages=${this.chatHistory.filter((m) => !m.internal).length}`,
+            });
+            // Fire-and-forget LLM compression: turns the raw transcript into a
+            // 2–3 sentence summary that medium-term retrieval can use instead
+            // of just "first user msg + last assistant 200 chars". Cheap call
+            // (~256 output tokens), runs in the background so it never blocks
+            // the next chat turn.
+            void this.compressSessionSummary(this.currentTaskId, this.chatHistory.slice());
        } catch (error: any) {
            logError('Memory extraction failed on session end.', { error: error?.message || String(error) });
        }
    }

+    /**
+     * Compress a finished session into a short summary and persist it on the
+     * matching record in the `chat_sessions` global-state list.
+     * `compactRecentSessions` reads that `summary` field when it is present.
+     *
+     * Sessions with fewer than 3 visible user/assistant messages are skipped,
+     * as are model replies shorter than 12 characters and task ids that no
+     * longer exist in the stored list. Failures are logged and swallowed so a
+     * missing summary never surfaces to the caller.
+     *
+     * @param taskId  Id of the session record to annotate.
+     * @param history Snapshot of the session's chat history.
+     */
+    private async compressSessionSummary(taskId: string, history: ChatMessage[]): Promise<void> {
+        const visible = history.filter((m) => !m.internal && (m.role === 'user' || m.role === 'assistant'));
+        if (visible.length < 3) return;
+        const cfg = getConfig();
+        // Each message is flattened to one line and capped at 400 chars.
+        const transcript = visible
+            .map((m) => `${m.role.toUpperCase()}: ${String(m.content).replace(/\s+/g, ' ').slice(0, 400)}`)
+            .join('\n\n');
+        const messages: ChatMessage[] = [
+            {
+                role: 'system',
+                content: [
+                    'You compress chat transcripts into a 2-3 sentence summary.',
+                    'Capture: (1) the user\'s topic or task, (2) the main decision or answer reached, (3) any open issue.',
+                    'Reply in the user\'s primary language (mirror Korean ↔ English exactly as in the transcript).',
+                    'Reply with ONLY the summary text. No headers, no quotes, no preamble.',
+                ].join(' '),
+                internal: true,
+            },
+            { role: 'user', content: `[TRANSCRIPT]\n${transcript}\n[END]` },
+        ];
+        try {
+            const result = await this.callNonStreaming({
+                baseUrl: cfg.ollamaUrl,
+                modelName: cfg.defaultModel,
+                engine: resolveEngine(cfg.ollamaUrl),
+                messages,
+                temperature: 0.3,
+                maxTokens: 256,
+                contextLength: cfg.contextLength,
+            });
+            // Strip wrapping quotes/backticks the model may add despite the prompt.
+            const summary = (result.text || '').trim().replace(/^["'`]+|["'`]+$/g, '');
+            if (!summary || summary.length < 12) return;
+            // Re-read after the (slow) model call so we patch the latest list.
+            const stored = this.context.globalState.get<any[]>('chat_sessions', []);
+            const sessions = Array.isArray(stored) ? stored : [];
+            const idx = sessions.findIndex((s) => String(s?.id) === String(taskId));
+            if (idx < 0) return;
+            // Write a patched copy instead of mutating the value handed back by
+            // globalState.get, so cached state only changes if update() succeeds.
+            const updated = sessions.map((s, i) => (i === idx ? { ...s, summary } : s));
+            await this.context.globalState.update('chat_sessions', updated);
+            logInfo('Session summary stored for medium-term recall.', { taskId, length: summary.length });
+        } catch (e: any) {
+            logError('Session summary compression failed.', { taskId, error: e?.message ?? String(e) });
+        }
+    }
+
    private async createStreamingRequest(params: {
        baseUrl: string;
        modelName: string;
@@ -2568,6 +2820,134 @@ export class AgentExecutor {
        }
    }

+    /**
+     * Run a single streaming chat completion and return the accumulated text.
+     *
+     * Two transports are supported: the LM Studio SDK streamer (when
+     * `useLmStudioSdk` is true) and the REST stream opened by
+     * `createStreamingRequest`. There is no empty-stream recovery or
+     * non-streaming fallback here; transport errors are logged and rethrown.
+     *
+     * When `postLiveDeltas` is true, every token is also forwarded to the
+     * webview as a `streamChunk` message while it arrives.
+     *
+     * @returns The accumulated text and the last stop reason seen. Aborts and
+     * stale runs resolve with `aborted: true` and whatever text was received
+     * so far; the caller decides what to do with it.
+     * @throws Rethrows any non-abort transport error after logging it.
+     */
+    private async streamChatOnce(params: {
+        runId: number;
+        useLmStudioSdk: boolean;
+        engine: 'lmstudio' | 'ollama';
+        ollamaUrl: string;
+        modelName: string;
+        messages: ChatMessage[];
+        temperature: number;
+        maxTokens: number;
+        contextLength: number;
+        contextOverflowPolicy: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
+        signal: AbortSignal;
+        postLiveDeltas: boolean;
+    }): Promise<{ text: string; stopReason?: string; aborted: boolean }> {
+        let accumulated = '';
+        let finishStopReason: string | undefined;
+        const post = (token: string) => {
+            if (params.postLiveDeltas && token) {
+                this.webview?.postMessage({ type: 'streamChunk', value: token });
+            }
+        };
+        // Shared shape for every "stopped early" exit (abort or stale run).
+        const abortedResult = () => ({ text: accumulated, stopReason: finishStopReason, aborted: true });
+
+        if (params.useLmStudioSdk) {
+            try {
+                const streamer = this.options.lmStudioStreamer;
+                if (!streamer) throw new Error('LM Studio SDK streamer is not configured.');
+                const stream = streamer.stream({
+                    modelName: params.modelName,
+                    messages: params.messages.map((m) => ({ role: m.role, content: m.content })),
+                    temperature: params.temperature,
+                    maxTokens: params.maxTokens,
+                    contextOverflowPolicy: params.contextOverflowPolicy,
+                    signal: params.signal,
+                });
+                for await (const { token, stopReason } of stream) {
+                    if (this.isStaleRun(params.runId)) return abortedResult();
+                    if (token) {
+                        accumulated += token;
+                        post(token);
+                    }
+                    if (stopReason) finishStopReason = stopReason;
+                }
+            } catch (err: any) {
+                if (err?.name === 'AbortError' || params.signal.aborted) return abortedResult();
+                const msg = err?.message ?? String(err);
+                // The error is rethrown, so a stop reason could never reach the
+                // caller from here; surface the overflow hint in the log instead.
+                const contextOverflow = /context\s*length|contextlengthreached|exceed|too\s*long/i.test(msg);
+                logError('streamChatOnce SDK path failed.', { engine: params.engine, error: msg, contextOverflow });
+                throw err;
+            }
+            return { text: accumulated, stopReason: finishStopReason, aborted: false };
+        }
+
+        const request = await this.createStreamingRequest({
+            baseUrl: params.ollamaUrl,
+            modelName: params.modelName,
+            reqMessages: params.messages,
+            temperature: params.temperature,
+            maxTokens: params.maxTokens,
+            contextLength: params.contextLength,
+        });
+        const reader = request.response.body?.getReader();
+        if (!reader) throw new Error('Response body is not readable.');
+        const decoder = new TextDecoder();
+        let buffer = '';
+        // Parses one newline-delimited record (optionally SSE-style `data: `),
+        // appends its token and records its finish reason. Parse failures are
+        // logged and skipped so one bad record does not end the stream.
+        const consumeJsonLine = (line: string) => {
+            const trimmed = line.trim();
+            if (!trimmed || trimmed === 'data: [DONE]') return;
+            try {
+                const raw = trimmed.startsWith('data: ') ? trimmed.slice(6) : trimmed;
+                const json = JSON.parse(raw);
+                const token = params.engine === 'lmstudio'
+                    ? json.choices?.[0]?.delta?.content || ''
+                    : json.message?.content || json.response || '';
+                if (token) {
+                    accumulated += token;
+                    post(token);
+                }
+                const fr = params.engine === 'lmstudio'
+                    ? json.choices?.[0]?.finish_reason
+                    : (json.done_reason ?? (json.done === true ? 'stop' : undefined));
+                if (fr) finishStopReason = fr;
+            } catch (e: any) {
+                logError('streamChatOnce: failed to parse chunk.', { engine: params.engine, chunk: summarizeText(trimmed, 200), error: e?.message ?? String(e) });
+            }
+        };
+        try {
+            while (true) {
+                const { done, value } = await reader.read();
+                if (done) break;
+                if (this.isStaleRun(params.runId)) {
+                    // Releasing the lock alone leaves the transfer running;
+                    // cancel so the underlying stream is actually stopped.
+                    await reader.cancel().catch(() => undefined);
+                    return abortedResult();
+                }
+                buffer += decoder.decode(value, { stream: true });
+                const lines = buffer.split('\n');
+                buffer = lines.pop() || '';
+                for (const line of lines) consumeJsonLine(line);
+            }
+            // Flush bytes the decoder is still holding from a multi-byte
+            // character split across the final chunks.
+            buffer += decoder.decode();
+            if (buffer.trim()) consumeJsonLine(buffer);
+        } catch (err: any) {
+            // Same abort test as the SDK path: an aborted signal counts even
+            // when the transport reports it under a different error name.
+            if (err?.name === 'AbortError' || params.signal.aborted) return abortedResult();
+            logError('streamChatOnce REST path failed.', { engine: params.engine, error: err?.message ?? String(err) });
+            throw err;
+        } finally {
+            try { reader.releaseLock(); } catch { /* already released on abort */ }
+        }
+        return { text: accumulated, stopReason: finishStopReason, aborted: false };
+    }
+
    private normalizeMessages(messages: ChatMessage[]) {
        return messages.map((message) => {
            const normalizedContent = typeof message.content === 'string'