import type { ILMStudioClient } from './client'; import { LMStudioLifecycleError } from './client'; import { logError, logInfo } from '../utils'; export interface ChatStreamMessage { role: 'user' | 'assistant' | 'system'; content: string; } /** Shared sampling block. SDK and REST paths both read this — keep them in sync. */ export interface LmStudioSampling { topP?: number; topK?: number; minP?: number; repeatPenalty?: number; } /** * Translate the sampling block into the OpenAI-compatible REST body extension that LM Studio * understands. Ollama uses the same field names inside `options`. Returns an object you can * spread into either body. Values <= 0 / <= 1 (penalty) are dropped so they fall back to engine * defaults instead of effectively disabling sampling. */ export function samplingToRestBody(s: LmStudioSampling | undefined): Record { const out: Record = {}; if (!s) return out; if (typeof s.topP === 'number' && s.topP > 0 && s.topP <= 1) out.top_p = s.topP; if (typeof s.topK === 'number' && s.topK > 0) out.top_k = s.topK; if (typeof s.minP === 'number' && s.minP > 0 && s.minP <= 1) out.min_p = s.minP; if (typeof s.repeatPenalty === 'number' && s.repeatPenalty > 1) out.repeat_penalty = s.repeatPenalty; return out; } export interface ChatStreamRequest { modelName: string; messages: ChatStreamMessage[]; temperature: number; /** Upper bound on tokens to generate. Omit to fall back to a conservative default. */ maxTokens?: number; /** LM Studio context-overflow safety net used only if the prompt still exceeds the window. */ contextOverflowPolicy?: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow'; /** Sampling — defaults match small-model glitch-suppression presets. Each is omitted from the SDK call when undefined. */ topP?: number; topK?: number; minP?: number; repeatPenalty?: number; /** Draft model key for speculative decoding. Empty/undefined disables. */ draftModel?: string; signal?: AbortSignal; } /** Subset of LM Studio's `PredictionResult.stats` we expose to callers. */ export interface ChatStreamStats { tokensPerSecond?: number; timeToFirstTokenSec?: number; predictedTokensCount?: number; promptTokensCount?: number; totalTimeSec?: number; /** Speculative decoding (only set when `draftModel` was used). */ draftModelKey?: string; draftTokensCount?: number; acceptedDraftTokensCount?: number; } /** * One stream event. `token` carries generated text (possibly empty for the final event); * `stopReason` is set on the *last* event only and is the SDK's `stats.stopReason` * (e.g. `eosFound`, `maxPredictedTokensReached`, `contextLengthReached`, `userStopped`). * `stats` is also set on the *last* event when LM Studio reports prediction stats. */ export interface ChatStreamEvent { token: string; stopReason?: string; stats?: ChatStreamStats; } export interface IChatStreamer { /** Token-level streaming for an LM Studio chat completion via the WebSocket SDK. */ stream(req: ChatStreamRequest): AsyncIterable; /** * Drop the SDK's cached handle for `modelName`. Callers invoke this when * the previous stream returned zero tokens with no error — a symptom of a * silently-disposed handle that needs a fresh WebSocket round-trip. */ resetHandle?(modelName: string): Promise; } /** * Adapter that streams LM Studio chat completions via @lmstudio/sdk's `model.respond()`, * replacing the manual fetch + SSE parser path used for the OpenAI-compatible REST endpoint. * * Benefits over the REST path: * - No SSE parsing (no `data: [DONE]` / partial-chunk fragility). * - Reuses the same WebSocket the lifecycle manager already opened — handle lookup is cheap * if the model is already loaded, and load-on-first-use is implicit when it isn't. * - First-class `signal` support for user-cancel and abort propagation. */ export class LMStudioStreamer implements IChatStreamer { constructor(private readonly client: ILMStudioClient) {} async *stream(req: ChatStreamRequest): AsyncIterable { const trimmedModel = (req.modelName || '').trim(); if (!trimmedModel) { throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.'); } // One automatic retry path: when the first attempt blows up with a // "Model is disposed!" / "lock() request could not be registered" // error before any tokens have been yielded, we drop the cached SDK // handle and try once more. These errors are caused by a previous // aborted prediction leaving the SDK's internal handle map pointing // at a dead WebSocket binding — a fresh client.model() lookup minted // from a recreated SDK fixes it. We only retry when zero tokens have // streamed: if the consumer already saw partial output, restarting // would duplicate tokens. for (let attempt = 1; attempt <= 2; attempt++) { const refresh = attempt > 1; const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined); logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt }); // Sampling defaults match the historical glitch-suppression preset for small / // quantized models (한글 토큰 깨짐 방지) but are now overridable per-call. const respondOpts: any = { temperature: req.temperature, maxTokens: req.maxTokens ?? 4096, // Safety net: if our own token budgeting still underestimated and the prompt // exceeds the model's context window, decide whether the SDK should fail // loudly (stopAtLimit — default) or silently drop content. contextOverflowPolicy: req.contextOverflowPolicy ?? 'stopAtLimit', signal: req.signal, }; if (typeof req.topP === 'number') respondOpts.topPSampling = req.topP; if (typeof req.topK === 'number' && req.topK > 0) respondOpts.topKSampling = req.topK; if (typeof req.minP === 'number' && req.minP > 0) respondOpts.minPSampling = req.minP; if (typeof req.repeatPenalty === 'number' && req.repeatPenalty > 1) respondOpts.repeatPenalty = req.repeatPenalty; // Speculative decoding — LM Studio loads the draft model lazily on first use if needed // (we also `preloadDraftModel` after main load to avoid that cold cost). if (req.draftModel && req.draftModel.trim()) respondOpts.draftModel = req.draftModel.trim(); const prediction = (model as any).respond(req.messages, respondOpts); // Bridge AbortSignal → prediction.cancel(): without this, an // aborted request keeps generating on the LM Studio server. The // orphaned prediction holds locks on the model handle, which is // a known cause of "lock() request could not be registered" on // the very next request — the reused handle is still bound to a // dead prediction. const onAbort = () => { try { (prediction as any)?.cancel?.(); } catch { /* swallow — best effort */ } }; if (req.signal) { if (req.signal.aborted) onAbort(); else req.signal.addEventListener('abort', onAbort, { once: true }); } let yielded = 0; let caught: any = null; try { for await (const fragment of prediction as AsyncIterable<{ content: string }>) { if (req.signal?.aborted) return; const token = fragment?.content ?? ''; if (token) { yielded++; yield { token }; } } } catch (err: any) { if (req.signal?.aborted) return; if (err?.name === 'AbortError') return; caught = err; } finally { req.signal?.removeEventListener?.('abort', onAbort); } if (!caught) { if (req.signal?.aborted) return; // The prediction object is also a Promise; awaiting it after // the stream drains gives us stats.stopReason so callers can tell a truncated // answer (maxPredictedTokensReached / contextLengthReached) from a normal one, // plus throughput numbers (tok/s, TTFT) we surface to the UI. let stopReason: string | undefined; let stats: ChatStreamEvent['stats']; try { const result: any = await prediction; stopReason = result?.stats?.stopReason; const s = result?.stats; if (s) { stats = { tokensPerSecond: typeof s.tokensPerSecond === 'number' ? s.tokensPerSecond : undefined, timeToFirstTokenSec: typeof s.timeToFirstTokenSec === 'number' ? s.timeToFirstTokenSec : undefined, predictedTokensCount: typeof s.predictedTokensCount === 'number' ? s.predictedTokensCount : undefined, promptTokensCount: typeof s.promptTokensCount === 'number' ? s.promptTokensCount : undefined, totalTimeSec: typeof s.totalTimeSec === 'number' ? s.totalTimeSec : undefined, draftModelKey: typeof s.usedDraftModelKey === 'string' ? s.usedDraftModelKey : undefined, draftTokensCount: typeof s.totalDraftTokensCount === 'number' ? s.totalDraftTokensCount : undefined, acceptedDraftTokensCount: typeof s.acceptedDraftTokensCount === 'number' ? s.acceptedDraftTokensCount : undefined, }; } if (stopReason || stats) { logInfo('LM Studio SDK chat stream finished.', { model: trimmedModel, stopReason, tokensYielded: yielded, tokensPerSecond: stats?.tokensPerSecond, ttftSec: stats?.timeToFirstTokenSec, }); } } catch { /* result unavailable on some SDK versions — non-fatal */ } // Empty-but-clean stream is treated like a dead handle on attempt 1: // recreate the SDK and try once more. Same root cause (handle bound to // a stale prediction) but no exception is thrown — just an empty stream. if (yielded === 0 && attempt === 1) { logInfo('Empty SDK stream with no error — retrying with a fresh SDK.', { model: trimmedModel }); continue; } // Don't claim `eosFound` if we couldn't actually read the stop reason — leave it // undefined so the caller treats it as 'unknown' (and its mid-sentence heuristics kick in). yield { token: '', stopReason, stats }; return; } const errMsg = String(caught?.message ?? caught); // Broaden the "handle is bound to a dead WebSocket binding" detection. All of // these resolve with the same fix (recreate the SDK client so the next // llm.model() lookup mints a fresh handle). const handleDead = /\bdisposed\b/i.test(errMsg) || /lock\(\) request could not be registered/i.test(errMsg) || /channel\s+closed/i.test(errMsg) || /WebSocket\s+(?:is\s+not\s+open|closed|disconnected)/i.test(errMsg) || /Connection\s+(?:lost|reset|closed)/i.test(errMsg) || /\bECONNRESET\b/i.test(errMsg) || /socket\s+hang\s*up/i.test(errMsg); if (handleDead && yielded === 0 && attempt === 1) { logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg }); continue; } logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: errMsg, attempt }); throw caught; } } async resetHandle(modelName: string): Promise { const trimmed = (modelName || '').trim(); if (!trimmed) return; try { await this.client.getModelHandle(trimmed, { refresh: true }); } catch (err: any) { // Best effort — caller will see the next stream() attempt fail // with a normal error path if the refresh itself was broken. logError('LM Studio handle reset failed.', { model: trimmed, error: err?.message ?? String(err) }); } } }