import type { ILMStudioClient } from './client';
import { LMStudioLifecycleError } from './client';
import { logError, logInfo } from '../utils';

/** A single chat turn handed to the model. */
export interface ChatStreamMessage {
  role: 'user' | 'assistant' | 'system';
  content: string;
}

/** Parameters for one streamed chat completion. */
export interface ChatStreamRequest {
  /** Model identifier; surrounding whitespace is trimmed and a blank value is rejected by the streamer. */
  modelName: string;
  messages: ChatStreamMessage[];
  temperature: number;
  /** Upper bound on tokens to generate. Omit to fall back to a conservative default. */
  maxTokens?: number;
  /** LM Studio context-overflow safety net used only if the prompt still exceeds the window. */
  contextOverflowPolicy?: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
  /** Aborting ends the stream quietly (no error, no final event) and cancels the prediction. */
  signal?: AbortSignal;
}

/**
 * One stream event. `token` carries generated text (possibly empty for the final event);
 * `stopReason` is set on the *last* event only and is the SDK's `stats.stopReason`
 * (e.g. `eosFound`, `maxPredictedTokensReached`, `contextLengthReached`, `userStopped`).
 * It stays `undefined` on the last event when the stop reason could not be read.
 */
export interface ChatStreamEvent {
  token: string;
  stopReason?: string;
}

export interface IChatStreamer {
  /** Token-level streaming for an LM Studio chat completion via the WebSocket SDK. */
  stream(req: ChatStreamRequest): AsyncIterable<ChatStreamEvent>;
  /**
   * Drop the SDK's cached handle for `modelName`. Callers invoke this when
   * the previous stream returned zero tokens with no error — a symptom of a
   * silently-disposed handle that needs a fresh WebSocket round-trip.
   */
  resetHandle?(modelName: string): Promise<void>;
}

/** Token budget applied when a request does not specify `maxTokens`. */
const DEFAULT_MAX_TOKENS = 4096;

/** Total tries per stream: the initial attempt plus one retry on a dead handle. */
const MAX_STREAM_ATTEMPTS = 2;

/** Error-message signatures that indicate the cached SDK handle is no longer usable. */
const DEAD_HANDLE_PATTERNS: readonly RegExp[] = [
  /\bdisposed\b/i,
  /lock\(\) request could not be registered/i,
];

/**
 * Minimal structural view of what `model.respond()` returns, limited to the members
 * this file actually touches: it is async-iterable (fragments), awaitable (final
 * result with stats) and may expose `cancel()`.
 */
interface OngoingPredictionLike
  extends AsyncIterable<{ content: string }>,
    PromiseLike<{ stats?: { stopReason?: string } }> {
  cancel?(): unknown;
}

/** Render any thrown value as a log-friendly string, preferring its `message`. */
function describeError(err: unknown): string {
  const message = (err as { message?: unknown } | null | undefined)?.message;
  return String(message ?? err);
}

/**
 * Adapter that streams LM Studio chat completions via @lmstudio/sdk's `model.respond()`,
 * replacing the manual fetch + SSE parser path used for the OpenAI-compatible REST endpoint.
 *
 * Benefits over the REST path:
 *  - No SSE parsing (no `data: [DONE]` / partial-chunk fragility).
 *  - Reuses the same WebSocket the lifecycle manager already opened — handle lookup is cheap
 *    if the model is already loaded, and load-on-first-use is implicit when it isn't.
 *  - First-class `signal` support for user-cancel and abort propagation.
 */
export class LMStudioStreamer implements IChatStreamer {
  constructor(private readonly client: ILMStudioClient) {}

  /**
   * Stream a chat completion token by token.
   *
   * Yields one event per non-empty text fragment, then a final event with an empty
   * `token` and the stop reason (or `undefined` if it could not be read). If the
   * request's signal is aborted the generator simply ends without a final event.
   *
   * @param req Model, messages and sampling options for this completion.
   * @returns An async iterable of stream events.
   * @throws LMStudioLifecycleError when `req.modelName` is blank.
   * @throws Whatever the SDK threw, when the stream fails and no retry applies.
   */
  async *stream(req: ChatStreamRequest): AsyncIterable<ChatStreamEvent> {
    const trimmedModel = (req.modelName || '').trim();
    if (!trimmedModel) {
      throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.');
    }

    // One automatic retry path: when the first attempt blows up with a
    // "Model is disposed!" / "lock() request could not be registered"
    // error before any tokens have been yielded, we drop the cached SDK
    // handle and try once more. These errors are caused by a previous
    // aborted prediction leaving the SDK's internal handle map pointing
    // at a dead WebSocket binding — a fresh client.model() lookup minted
    // from a recreated SDK fixes it. We only retry when zero tokens have
    // streamed: if the consumer already saw partial output, restarting
    // would duplicate tokens.
    for (let attempt = 1; attempt <= MAX_STREAM_ATTEMPTS; attempt++) {
      const model = await this.client.getModelHandle(
        trimmedModel,
        attempt > 1 ? { refresh: true } : undefined,
      );
      logInfo('LM Studio SDK chat stream started.', {
        model: trimmedModel,
        messageCount: req.messages.length,
        attempt,
      });

      // The handle's static type is owned by ./client; `respond` is reached dynamically
      // and the result is immediately pinned to the narrow shape we rely on.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const prediction: OngoingPredictionLike = (model as any).respond(req.messages, {
        temperature: req.temperature,
        maxTokens: req.maxTokens ?? DEFAULT_MAX_TOKENS,
        // Safety net: if our own token budgeting still underestimated and the prompt
        // exceeds the model's context window, decide whether the SDK should fail
        // loudly (stopAtLimit — default) or silently drop content.
        contextOverflowPolicy: req.contextOverflowPolicy ?? 'stopAtLimit',
        signal: req.signal,
      });

      // Bridge AbortSignal → prediction.cancel(): without this, an
      // aborted request keeps generating on the LM Studio server. The
      // orphaned prediction holds locks on the model handle, which is
      // a known cause of "lock() request could not be registered" on
      // the very next request — the reused handle is still bound to a
      // dead prediction.
      let cancelRequested = false;
      const cancelPrediction = (): void => {
        if (cancelRequested) return;
        cancelRequested = true;
        try {
          // cancel() may hand back a promise; absorb a late rejection so a best-effort
          // cancel can never surface as an unhandled rejection.
          void Promise.resolve(prediction?.cancel?.()).catch(() => {
            /* swallow — best effort */
          });
        } catch {
          /* swallow — best effort */
        }
      };
      if (req.signal) {
        if (req.signal.aborted) cancelPrediction();
        else req.signal.addEventListener('abort', cancelPrediction, { once: true });
      }

      let yielded = 0;
      let drained = false; // the fragment loop ran to natural completion
      let failed = false; // explicit flag so a thrown null/undefined is still treated as a failure
      let failure: unknown;
      try {
        for await (const fragment of prediction) {
          if (req.signal?.aborted) return;
          const token = fragment?.content ?? '';
          if (token) {
            yielded++;
            yield { token };
          }
        }
        drained = true;
      } catch (err: unknown) {
        if (req.signal?.aborted) return;
        if ((err as { name?: unknown } | null | undefined)?.name === 'AbortError') return;
        failed = true;
        failure = err;
      } finally {
        req.signal?.removeEventListener?.('abort', cancelPrediction);
        // Reached without draining and without an SDK error means the generator is being
        // left early (consumer stopped iterating, or an abort path returned). Cancel so
        // the prediction is not left running against the handle.
        if (!drained && !failed) cancelPrediction();
      }

      if (!failed) {
        if (req.signal?.aborted) return;
        // The prediction object is also a Promise; awaiting it after
        // the stream drains gives us stats.stopReason so callers can tell a truncated
        // answer (maxPredictedTokensReached / contextLengthReached) from a normal one.
        let stopReason: string | undefined;
        try {
          const result = await prediction;
          stopReason = result?.stats?.stopReason;
          if (stopReason) {
            logInfo('LM Studio SDK chat stream finished.', {
              model: trimmedModel,
              stopReason,
              tokensYielded: yielded,
            });
          }
        } catch {
          /* result unavailable on some SDK versions — non-fatal */
        }
        // Don't claim `eosFound` if we couldn't actually read the stop reason — leave it
        // undefined so the caller treats it as 'unknown' (and its mid-sentence heuristics kick in).
        yield { token: '', stopReason };
        return;
      }

      const errMsg = describeError(failure);
      const handleDead = DEAD_HANDLE_PATTERNS.some((pattern) => pattern.test(errMsg));
      if (handleDead && yielded === 0 && attempt < MAX_STREAM_ATTEMPTS) {
        logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', {
          model: trimmedModel,
          error: errMsg,
        });
        continue;
      }
      logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: errMsg, attempt });
      throw failure;
    }
  }

  /**
   * Ask the client for a refreshed handle for `modelName`, discarding the cached one.
   * A blank name is a no-op. Failures are logged and never thrown.
   *
   * @param modelName Model whose cached handle should be replaced.
   */
  async resetHandle(modelName: string): Promise<void> {
    const trimmed = (modelName || '').trim();
    if (!trimmed) return;
    try {
      await this.client.getModelHandle(trimmed, { refresh: true });
    } catch (err: unknown) {
      // Best effort — caller will see the next stream() attempt fail
      // with a normal error path if the refresh itself was broken.
      logError('LM Studio handle reset failed.', { model: trimmed, error: describeError(err) });
    }
  }
}