release: v2.80.32 - LM Studio SDK resilience & auto-recovery
This commit is contained in:
+72
-22
@@ -591,33 +591,68 @@ export class AgentExecutor {
|
||||
|
||||
// ── Empty-response auto-recovery ──
|
||||
// Streaming failed silently (network blip, model cold-start, context
|
||||
// overflow, etc.). Before surfacing the error to the user, try one
|
||||
// non-streaming retry: many LM Studio failures are streaming-only
|
||||
// (the SSE channel drops mid-token while a single POST returns the
|
||||
// whole answer fine). This covers the most common "empty response"
|
||||
// pattern users hit without the user having to click anything.
|
||||
// overflow, etc.). Before surfacing the error to the user we try two
|
||||
// recovery steps in order:
|
||||
//
|
||||
// (1) When the empty stream came from the LM Studio SDK path, drop
|
||||
// the cached handle and retry streaming once. The SDK keeps a
|
||||
// per-model handle in its internal map; an aborted prediction
|
||||
// can leave that handle disposed so the next respond() returns
|
||||
// zero tokens cleanly (no error thrown, stream just ends).
|
||||
// A fresh WebSocket / handle lookup recovers from this without
|
||||
// us having to ask the user to retry.
|
||||
//
|
||||
// (2) Fall back to a single non-streaming POST. Many LM Studio
|
||||
// failures are streaming-only (the SSE channel drops mid-token
|
||||
// while one POST returns the whole answer fine).
|
||||
//
|
||||
// Only attempts recovery on loopDepth === 0 — we don't want to
|
||||
// ping-pong inside the autonomous action loop.
|
||||
if (!aiResponseText.trim() && !this.abortController?.signal.aborted && loopDepth === 0) {
|
||||
try {
|
||||
logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
|
||||
const fallback = await this.callNonStreaming({
|
||||
baseUrl: ollamaUrl,
|
||||
modelName: actualModel,
|
||||
engine,
|
||||
messages: messagesForRequest,
|
||||
temperature,
|
||||
signal: this.abortController?.signal,
|
||||
});
|
||||
if (fallback && fallback.trim()) {
|
||||
aiResponseText = fallback;
|
||||
logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.length });
|
||||
if (useLmStudioSdk && this.options.lmStudioStreamer?.resetHandle) {
|
||||
try {
|
||||
logInfo('Empty SDK stream — resetting LM Studio handle and retrying streaming once.', { model: actualModel });
|
||||
await this.options.lmStudioStreamer.resetHandle(actualModel);
|
||||
const retryStream = this.options.lmStudioStreamer.stream({
|
||||
modelName: actualModel,
|
||||
messages: messagesForRequest.map((m) => ({ role: m.role, content: m.content })),
|
||||
temperature,
|
||||
signal: this.abortController.signal,
|
||||
});
|
||||
let retryText = '';
|
||||
for await (const { token } of retryStream) {
|
||||
if (this.isStaleRun(runId)) return;
|
||||
if (token) retryText += token;
|
||||
}
|
||||
if (retryText.trim()) {
|
||||
aiResponseText = retryText;
|
||||
logInfo('Handle-reset retry recovered the answer.', { model: actualModel, length: retryText.length });
|
||||
}
|
||||
} catch (retryErr: any) {
|
||||
logError('Handle-reset retry failed.', { model: actualModel, error: retryErr?.message ?? String(retryErr) });
|
||||
}
|
||||
}
|
||||
|
||||
if (!aiResponseText.trim() && !this.abortController?.signal.aborted) {
|
||||
try {
|
||||
logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
|
||||
const fallback = await this.callNonStreaming({
|
||||
baseUrl: ollamaUrl,
|
||||
modelName: actualModel,
|
||||
engine,
|
||||
messages: messagesForRequest,
|
||||
temperature,
|
||||
signal: this.abortController?.signal,
|
||||
});
|
||||
if (fallback && fallback.trim()) {
|
||||
aiResponseText = fallback;
|
||||
logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.length });
|
||||
}
|
||||
} catch (recoverErr: any) {
|
||||
logError('Non-streaming fallback also failed.', {
|
||||
engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
|
||||
});
|
||||
}
|
||||
} catch (recoverErr: any) {
|
||||
logError('Non-streaming fallback also failed.', {
|
||||
engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -668,6 +703,20 @@ export class AgentExecutor {
|
||||
promptCharCount, messageCount: messagesForRequest.length,
|
||||
fallbackTried: loopDepth === 0 ? 'yes' : 'no',
|
||||
});
|
||||
// Cheap heuristic: parse a parameter-count hint out of the
|
||||
// model identifier (e.g. "google/gemma-4-e2b", "qwen2-1.5b").
|
||||
// Anything <= 3B is small enough that long-context generation
|
||||
// commonly fails by emitting EOS as the first token even though
|
||||
// the server log shows prompt-eval succeeded with truncated=0.
|
||||
const smallModelMatch = actualModel.match(/(?<![0-9.])((?:[0-9]+\.)?[0-9]+)\s*[bB](?![a-zA-Z0-9])|[-_/]e?([0-9]+)b\b/i);
|
||||
const paramB = smallModelMatch
|
||||
? Number(smallModelMatch[1] ?? smallModelMatch[2])
|
||||
: Number.NaN;
|
||||
const looksSmall = Number.isFinite(paramB) && paramB <= 3;
|
||||
const promptIsLarge = promptCharCount > 60000; // ~15k tokens of English/code
|
||||
const contextLimitHint =
|
||||
'LM Studio 로그에 `n_tokens = N, truncated = 0` 인데 `eval time` 이 0ms 라면 모델이 첫 토큰부터 EOS 를 뱉은 것입니다. 보통 컨텍스트 한계 초과 또는 모델 용량 부족입니다. 더 큰 모델(7B+)로 교체하거나 컨텍스트를 줄여 보세요.';
|
||||
|
||||
this.webview.postMessage({
|
||||
type: 'error',
|
||||
value: [
|
||||
@@ -682,6 +731,7 @@ export class AgentExecutor {
|
||||
? ' • 프롬프트가 너무 큽니다 (16k chars 초과). Skill/Brain 컨텍스트를 좁혀 보세요.'
|
||||
: ' • 다른 모델로 전환하거나 LM Studio 서버를 재시작',
|
||||
' • Settings에서 maxContextSize 또는 memoryLongTermFiles 줄이기',
|
||||
...(looksSmall || promptIsLarge ? [' • ' + contextLimitHint] : []),
|
||||
].join('\n')
|
||||
});
|
||||
return;
|
||||
|
||||
Reference in New Issue
Block a user