release: v2.80.32 - LM Studio SDK resilience & auto-recovery
This commit is contained in:
+18
-3
@@ -7,8 +7,14 @@ export interface ILMStudioClient {
|
||||
listLoaded(): Promise<string[]>;
|
||||
/** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */
|
||||
listLoadedCached(ttlMs?: number): Promise<string[]>;
|
||||
/** Resolve a chat-ready handle for an already-loaded (or just-loaded) model. */
|
||||
getModelHandle(modelKey: string): Promise<LLM>;
|
||||
/**
|
||||
* Resolve a chat-ready handle for an already-loaded (or just-loaded) model.
|
||||
*
|
||||
* `options.refresh: true` drops the SDK + WebSocket so any disposed handle
|
||||
* sitting in the SDK's internal handle map is discarded. Use this after a
|
||||
* "Model is disposed!" or "lock() request could not be registered" error.
|
||||
*/
|
||||
getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM>;
|
||||
isReachable(): Promise<boolean>;
|
||||
setBaseUrl(httpBaseUrl: string): void;
|
||||
}
|
||||
@@ -111,8 +117,17 @@ export class LMStudioClient implements ILMStudioClient {
|
||||
}
|
||||
}
|
||||
|
||||
async getModelHandle(modelKey: string): Promise<LLM> {
|
||||
async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM> {
|
||||
try {
|
||||
if (options?.refresh) {
|
||||
// Recreate the SDK + WebSocket so the SDK's internal handle
|
||||
// cache is dropped. The next llm.model() call mints a fresh
|
||||
// handle instead of returning the disposed one from the
|
||||
// previous (aborted) prediction.
|
||||
this._sdk = undefined;
|
||||
this._loadedCache = undefined;
|
||||
logInfo('LM Studio SDK handle refresh requested — dropped cached SDK client.', { modelKey });
|
||||
}
|
||||
return await this.getSdk().llm.model(modelKey);
|
||||
} catch (e: any) {
|
||||
const msg = e?.message ?? String(e);
|
||||
|
||||
+80
-31
@@ -18,6 +18,12 @@ export interface ChatStreamRequest {
|
||||
/**
 * Contract for components that stream chat completions token by token.
 */
export interface IChatStreamer {
  /**
   * Token-level streaming for an LM Studio chat completion via the WebSocket SDK.
   *
   * @param req - The chat request to run.
   * @returns An async iterable of `{ token }` objects, one per streamed text fragment.
   */
  stream(req: ChatStreamRequest): AsyncIterable<{ token: string }>;

  /**
   * Drop the SDK's cached handle for `modelName`. Callers invoke this when
   * the previous stream returned zero tokens with no error — a symptom of a
   * silently-disposed handle that needs a fresh WebSocket round-trip.
   *
   * This member is optional: an implementation may not provide it, so callers
   * should invoke it as `streamer.resetHandle?.(modelName)`.
   *
   * @param modelName - Name of the model whose handle should be reset.
   */
  resetHandle?(modelName: string): Promise<void>;
}
|
||||
|
||||
/**
|
||||
@@ -39,41 +45,84 @@ export class LMStudioStreamer implements IChatStreamer {
|
||||
throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.');
|
||||
}
|
||||
|
||||
const model = await this.client.getModelHandle(trimmedModel);
|
||||
logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length });
|
||||
// One automatic retry path: when the first attempt blows up with a
|
||||
// "Model is disposed!" / "lock() request could not be registered"
|
||||
// error before any tokens have been yielded, we drop the cached SDK
|
||||
// handle and try once more. These errors are caused by a previous
|
||||
// aborted prediction leaving the SDK's internal handle map pointing
|
||||
// at a dead WebSocket binding — a fresh client.model() lookup minted
|
||||
// from a recreated SDK fixes it. We only retry when zero tokens have
|
||||
// streamed: if the consumer already saw partial output, restarting
|
||||
// would duplicate tokens.
|
||||
for (let attempt = 1; attempt <= 2; attempt++) {
|
||||
const refresh = attempt > 1;
|
||||
const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined);
|
||||
logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt });
|
||||
|
||||
const prediction = (model as any).respond(req.messages, {
|
||||
temperature: req.temperature,
|
||||
maxTokens: req.maxTokens ?? 4096,
|
||||
signal: req.signal,
|
||||
});
|
||||
const prediction = (model as any).respond(req.messages, {
|
||||
temperature: req.temperature,
|
||||
maxTokens: req.maxTokens ?? 4096,
|
||||
signal: req.signal,
|
||||
});
|
||||
|
||||
// Bridge AbortSignal → prediction.cancel(): without this, an aborted
|
||||
// request keeps generating on the LM Studio server. The orphaned
|
||||
// prediction holds locks on the model handle, which is a known cause
|
||||
// of "lock() request could not be registered" on the very next
|
||||
// request — the reused handle is still bound to a dead prediction.
|
||||
const onAbort = () => {
|
||||
try { (prediction as any)?.cancel?.(); } catch { /* swallow — best effort */ }
|
||||
};
|
||||
if (req.signal) {
|
||||
if (req.signal.aborted) onAbort();
|
||||
else req.signal.addEventListener('abort', onAbort, { once: true });
|
||||
}
|
||||
|
||||
try {
|
||||
for await (const fragment of prediction as AsyncIterable<{ content: string }>) {
|
||||
if (req.signal?.aborted) return;
|
||||
const token = fragment?.content ?? '';
|
||||
if (token) yield { token };
|
||||
// Bridge AbortSignal → prediction.cancel(): without this, an
|
||||
// aborted request keeps generating on the LM Studio server. The
|
||||
// orphaned prediction holds locks on the model handle, which is
|
||||
// a known cause of "lock() request could not be registered" on
|
||||
// the very next request — the reused handle is still bound to a
|
||||
// dead prediction.
|
||||
const onAbort = () => {
|
||||
try { (prediction as any)?.cancel?.(); } catch { /* swallow — best effort */ }
|
||||
};
|
||||
if (req.signal) {
|
||||
if (req.signal.aborted) onAbort();
|
||||
else req.signal.addEventListener('abort', onAbort, { once: true });
|
||||
}
|
||||
|
||||
let yielded = 0;
|
||||
let caught: any = null;
|
||||
try {
|
||||
for await (const fragment of prediction as AsyncIterable<{ content: string }>) {
|
||||
if (req.signal?.aborted) return;
|
||||
const token = fragment?.content ?? '';
|
||||
if (token) {
|
||||
yielded++;
|
||||
yield { token };
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
if (req.signal?.aborted) return;
|
||||
if (err?.name === 'AbortError') return;
|
||||
caught = err;
|
||||
} finally {
|
||||
req.signal?.removeEventListener?.('abort', onAbort);
|
||||
}
|
||||
|
||||
if (!caught) return;
|
||||
|
||||
const errMsg = String(caught?.message ?? caught);
|
||||
const handleDead = /\bdisposed\b/i.test(errMsg)
|
||||
|| /lock\(\) request could not be registered/i.test(errMsg);
|
||||
|
||||
if (handleDead && yielded === 0 && attempt === 1) {
|
||||
logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg });
|
||||
continue;
|
||||
}
|
||||
|
||||
logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: errMsg, attempt });
|
||||
throw caught;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Best-effort reset of the client's handle for `modelName`.
 *
 * Requests a handle from the client with `refresh: true` and discards the
 * returned handle; the call is made only for its refresh side effect.
 * A blank or whitespace-only name is ignored.
 *
 * Failures are logged and swallowed, so this method does not reject when the
 * refresh itself fails.
 *
 * @param modelName - Name of the model whose handle should be refreshed.
 */
async resetHandle(modelName: string): Promise<void> {
  const trimmed = (modelName || '').trim();
  if (!trimmed) return;

  try {
    await this.client.getModelHandle(trimmed, { refresh: true });
  } catch (err: any) {
    // Best effort — caller will see the next stream() attempt fail
    // with a normal error path if the refresh itself was broken.
    logError('LM Studio handle reset failed.', { model: trimmed, error: err?.message ?? String(err) });
  }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user