release: v2.80.32 - LM Studio SDK resilience & auto-recovery

This commit is contained in:
g1nation
2026-05-11 13:19:07 +09:00
parent 5d3df0816f
commit 6347a223a7
10 changed files with 195 additions and 72 deletions
+72 -22
View File
@@ -591,33 +591,68 @@ export class AgentExecutor {
// ── Empty-response auto-recovery ──
// Streaming failed silently (network blip, model cold-start, context
// overflow, etc.). Before surfacing the error to the user, try one
// non-streaming retry: many LM Studio failures are streaming-only
// (the SSE channel drops mid-token while a single POST returns the
// whole answer fine). This covers the most common "empty response"
// pattern users hit without the user having to click anything.
// overflow, etc.). Before surfacing the error to the user we try two
// recovery steps in order:
//
// (1) When the empty stream came from the LM Studio SDK path, drop
// the cached handle and retry streaming once. The SDK keeps a
// per-model handle in its internal map; an aborted prediction
// can leave that handle disposed so the next respond() returns
// zero tokens cleanly (no error thrown, stream just ends).
// A fresh WebSocket / handle lookup recovers from this without
// us having to ask the user to retry.
//
// (2) Fall back to a single non-streaming POST. Many LM Studio
// failures are streaming-only (the SSE channel drops mid-token
// while one POST returns the whole answer fine).
//
// Only attempts recovery on loopDepth === 0 — we don't want to
// ping-pong inside the autonomous action loop.
if (!aiResponseText.trim() && !this.abortController?.signal.aborted && loopDepth === 0) {
try {
logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
const fallback = await this.callNonStreaming({
baseUrl: ollamaUrl,
modelName: actualModel,
engine,
messages: messagesForRequest,
temperature,
signal: this.abortController?.signal,
});
if (fallback && fallback.trim()) {
aiResponseText = fallback;
logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.length });
if (useLmStudioSdk && this.options.lmStudioStreamer?.resetHandle) {
try {
logInfo('Empty SDK stream — resetting LM Studio handle and retrying streaming once.', { model: actualModel });
await this.options.lmStudioStreamer.resetHandle(actualModel);
const retryStream = this.options.lmStudioStreamer.stream({
modelName: actualModel,
messages: messagesForRequest.map((m) => ({ role: m.role, content: m.content })),
temperature,
signal: this.abortController.signal,
});
let retryText = '';
for await (const { token } of retryStream) {
if (this.isStaleRun(runId)) return;
if (token) retryText += token;
}
if (retryText.trim()) {
aiResponseText = retryText;
logInfo('Handle-reset retry recovered the answer.', { model: actualModel, length: retryText.length });
}
} catch (retryErr: any) {
logError('Handle-reset retry failed.', { model: actualModel, error: retryErr?.message ?? String(retryErr) });
}
}
if (!aiResponseText.trim() && !this.abortController?.signal.aborted) {
try {
logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
const fallback = await this.callNonStreaming({
baseUrl: ollamaUrl,
modelName: actualModel,
engine,
messages: messagesForRequest,
temperature,
signal: this.abortController?.signal,
});
if (fallback && fallback.trim()) {
aiResponseText = fallback;
logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.length });
}
} catch (recoverErr: any) {
logError('Non-streaming fallback also failed.', {
engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
});
}
} catch (recoverErr: any) {
logError('Non-streaming fallback also failed.', {
engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
});
}
}
@@ -668,6 +703,20 @@ export class AgentExecutor {
promptCharCount, messageCount: messagesForRequest.length,
fallbackTried: loopDepth === 0 ? 'yes' : 'no',
});
// Cheap heuristic: parse a parameter-count hint out of the
// model identifier (e.g. "google/gemma-4-e2b", "qwen2-1.5b").
// Anything <= 3B is small enough that long-context generation
// commonly fails by emitting EOS as the first token even though
// the server log shows prompt-eval succeeded with truncated=0.
const smallModelMatch = actualModel.match(/(?<![0-9.])((?:[0-9]+\.)?[0-9]+)\s*[bB](?![a-zA-Z0-9])|[-_/]e?([0-9]+)b\b/i);
const paramB = smallModelMatch
? Number(smallModelMatch[1] ?? smallModelMatch[2])
: Number.NaN;
const looksSmall = Number.isFinite(paramB) && paramB <= 3;
const promptIsLarge = promptCharCount > 60000; // ~15k tokens of English/code
const contextLimitHint =
'LM Studio 로그에 `n_tokens = N, truncated = 0` 인데 `eval time` 이 0ms 라면 모델이 첫 토큰부터 EOS 를 뱉은 것입니다. 보통 컨텍스트 한계 초과 또는 모델 용량 부족입니다. 더 큰 모델(7B+)로 교체하거나 컨텍스트를 줄여 보세요.';
this.webview.postMessage({
type: 'error',
value: [
@@ -682,6 +731,7 @@ export class AgentExecutor {
? ' • 프롬프트가 너무 큽니다 (16k chars 초과). Skill/Brain 컨텍스트를 좁혀 보세요.'
: ' • 다른 모델로 전환하거나 LM Studio 서버를 재시작',
' • Settings에서 maxContextSize 또는 memoryLongTermFiles 줄이기',
...(looksSmall || promptIsLarge ? [' • ' + contextLimitHint] : []),
].join('\n')
});
return;
+18 -3
View File
@@ -7,8 +7,14 @@ export interface ILMStudioClient {
listLoaded(): Promise<string[]>;
/** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */
listLoadedCached(ttlMs?: number): Promise<string[]>;
/** Resolve a chat-ready handle for an already-loaded (or just-loaded) model. */
getModelHandle(modelKey: string): Promise<LLM>;
/**
* Resolve a chat-ready handle for an already-loaded (or just-loaded) model.
*
* `options.refresh: true` drops the SDK + WebSocket so any disposed handle
* sitting in the SDK's internal handle map is discarded. Use this after a
* "Model is disposed!" or "lock() request could not be registered" error.
*/
getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM>;
isReachable(): Promise<boolean>;
setBaseUrl(httpBaseUrl: string): void;
}
@@ -111,8 +117,17 @@ export class LMStudioClient implements ILMStudioClient {
}
}
async getModelHandle(modelKey: string): Promise<LLM> {
async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM> {
try {
if (options?.refresh) {
// Recreate the SDK + WebSocket so the SDK's internal handle
// cache is dropped. The next llm.model() call mints a fresh
// handle instead of returning the disposed one from the
// previous (aborted) prediction.
this._sdk = undefined;
this._loadedCache = undefined;
logInfo('LM Studio SDK handle refresh requested — dropped cached SDK client.', { modelKey });
}
return await this.getSdk().llm.model(modelKey);
} catch (e: any) {
const msg = e?.message ?? String(e);
+80 -31
View File
@@ -18,6 +18,12 @@ export interface ChatStreamRequest {
/**
 * Contract for a chat streamer: incremental token delivery for one chat
 * request, plus an optional hook for discarding a cached model handle.
 */
export interface IChatStreamer {
/**
 * Token-level streaming for an LM Studio chat completion via the WebSocket SDK.
 *
 * @param req The chat request to stream.
 * @returns An async iterable whose items each carry one `token` string.
 */
stream(req: ChatStreamRequest): AsyncIterable<{ token: string }>;
/**
 * Drop the SDK's cached handle for `modelName`. Callers invoke this when
 * the previous stream returned zero tokens with no error — a symptom of a
 * silently-disposed handle that needs a fresh WebSocket round-trip.
 *
 * Optional member: an implementation may omit it, so callers must check
 * that it exists before calling it.
 *
 * @param modelName Identifier of the model whose cached handle should be dropped.
 * @returns A promise that settles once the reset attempt has finished.
 */
resetHandle?(modelName: string): Promise<void>;
}
/**
@@ -39,41 +45,84 @@ export class LMStudioStreamer implements IChatStreamer {
throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.');
}
const model = await this.client.getModelHandle(trimmedModel);
logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length });
// One automatic retry path: when the first attempt blows up with a
// "Model is disposed!" / "lock() request could not be registered"
// error before any tokens have been yielded, we drop the cached SDK
// handle and try once more. These errors are caused by a previous
// aborted prediction leaving the SDK's internal handle map pointing
// at a dead WebSocket binding — a fresh client.model() lookup minted
// from a recreated SDK fixes it. We only retry when zero tokens have
// streamed: if the consumer already saw partial output, restarting
// would duplicate tokens.
for (let attempt = 1; attempt <= 2; attempt++) {
const refresh = attempt > 1;
const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined);
logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt });
const prediction = (model as any).respond(req.messages, {
temperature: req.temperature,
maxTokens: req.maxTokens ?? 4096,
signal: req.signal,
});
const prediction = (model as any).respond(req.messages, {
temperature: req.temperature,
maxTokens: req.maxTokens ?? 4096,
signal: req.signal,
});
// Bridge AbortSignal → prediction.cancel(): without this, an aborted
// request keeps generating on the LM Studio server. The orphaned
// prediction holds locks on the model handle, which is a known cause
// of "lock() request could not be registered" on the very next
// request — the reused handle is still bound to a dead prediction.
const onAbort = () => {
try { (prediction as any)?.cancel?.(); } catch { /* swallow — best effort */ }
};
if (req.signal) {
if (req.signal.aborted) onAbort();
else req.signal.addEventListener('abort', onAbort, { once: true });
}
try {
for await (const fragment of prediction as AsyncIterable<{ content: string }>) {
if (req.signal?.aborted) return;
const token = fragment?.content ?? '';
if (token) yield { token };
// Bridge AbortSignal → prediction.cancel(): without this, an
// aborted request keeps generating on the LM Studio server. The
// orphaned prediction holds locks on the model handle, which is
// a known cause of "lock() request could not be registered" on
// the very next request — the reused handle is still bound to a
// dead prediction.
const onAbort = () => {
try { (prediction as any)?.cancel?.(); } catch { /* swallow — best effort */ }
};
if (req.signal) {
if (req.signal.aborted) onAbort();
else req.signal.addEventListener('abort', onAbort, { once: true });
}
let yielded = 0;
let caught: any = null;
try {
for await (const fragment of prediction as AsyncIterable<{ content: string }>) {
if (req.signal?.aborted) return;
const token = fragment?.content ?? '';
if (token) {
yielded++;
yield { token };
}
}
} catch (err: any) {
if (req.signal?.aborted) return;
if (err?.name === 'AbortError') return;
caught = err;
} finally {
req.signal?.removeEventListener?.('abort', onAbort);
}
if (!caught) return;
const errMsg = String(caught?.message ?? caught);
const handleDead = /\bdisposed\b/i.test(errMsg)
|| /lock\(\) request could not be registered/i.test(errMsg);
if (handleDead && yielded === 0 && attempt === 1) {
logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg });
continue;
}
logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: errMsg, attempt });
throw caught;
}
}
/**
 * Drop the cached LM Studio handle for `modelName` by asking the client for
 * a handle with `refresh: true`.
 *
 * Best effort: a blank model name is a no-op, and a failed refresh is logged
 * rather than rethrown, so this method does not reject because of it.
 *
 * @param modelName Model identifier; surrounding whitespace is ignored.
 * @returns A promise that resolves once the refresh attempt has finished.
 */
async resetHandle(modelName: string): Promise<void> {
    const trimmed = (modelName || '').trim();
    if (!trimmed) return;
    try {
        // Only the refresh side effect matters here; the returned handle is
        // intentionally discarded.
        await this.client.getModelHandle(trimmed, { refresh: true });
    } catch (err: any) {
        // Best effort — caller will see the next stream() attempt fail
        // with a normal error path if the refresh itself was broken.
        logError('LM Studio handle reset failed.', { model: trimmed, error: err?.message ?? String(err) });
    }
}
}