/*
 * 0712014fcb
 *
 * - ASTRA-DEBUG 정상 흐름 로그를 console.error → logInfo/console.log 로 강등 (chatHandlers, extension, slashRouter): DevTools에 ERR로 찍히던 오탐 제거
 * - sidebar webview에 명시적 CSP meta 추가 + font-src에 data: 허용 (sidebar.html, sidebarProvider._getHtml): VS Code outer iframe이 codicon.ttf를 data:font/ttf 로 inject하면서 기본 CSP에 막혀 매 prompt 마다 violation 경고가 찍히던 문제 해소
 * - 누적된 LM Studio / agent / 컨텍스트 매니저 / 테스트 갱신 동반
 *
 * Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
 *
 * 253 lines · 13 KiB · TypeScript
 */
import type { ILMStudioClient } from './client';
|
|
import { LMStudioLifecycleError } from './client';
|
|
import { logError, logInfo } from '../utils';
|
|
|
|
/**
 * One chat turn in the conversation handed to a streamer.
 * A request carries an ordered list of these in `ChatStreamRequest.messages`.
 */
export interface ChatStreamMessage {
  /** Who authored the turn. */
  role: 'user' | 'assistant' | 'system';
  /** Plain-text body of the turn. */
  content: string;
}
|
|
|
|
/**
 * Shared sampling block. SDK and REST paths both read this — keep them in sync.
 *
 * Every field is optional; an omitted field leaves the corresponding engine default in place.
 */
export interface LmStudioSampling {
  /** Top-p value; `samplingToRestBody` forwards it as `top_p` only when it lies in (0, 1]. */
  topP?: number;
  /** Top-k value; `samplingToRestBody` forwards it as `top_k` only when it is greater than 0. */
  topK?: number;
  /** Min-p value; `samplingToRestBody` forwards it as `min_p` only when it lies in (0, 1]. */
  minP?: number;
  /** Repeat penalty; `samplingToRestBody` forwards it as `repeat_penalty` only when it is greater than 1. */
  repeatPenalty?: number;
}
|
|
|
|
/**
 * Translate the sampling block into the OpenAI-compatible REST body extension that LM Studio
 * understands. Ollama uses the same field names inside `options`. Returns an object you can
 * spread into either body.
 *
 * Values that are out of range are dropped so they fall back to engine defaults instead of
 * effectively disabling sampling:
 * - `topP` / `minP` must lie in (0, 1];
 * - `topK` must be greater than 0;
 * - `repeatPenalty` must be greater than 1.
 *
 * Non-finite numbers (`NaN`, `±Infinity`) are dropped as well: `JSON.stringify` would turn
 * them into `null`, which is not a usable sampling value in a request body.
 *
 * @param s Sampling block, or `undefined` when the caller has no overrides.
 * @returns A fresh object holding only the accepted fields under their REST names
 *          (`top_p`, `top_k`, `min_p`, `repeat_penalty`); empty when nothing qualifies.
 */
export function samplingToRestBody(s: LmStudioSampling | undefined): Record<string, number> {
  const out: Record<string, number> = {};
  if (!s) return out;

  // Narrowing guard shared by every field: a real, finite number.
  const isFiniteNumber = (value: unknown): value is number =>
    typeof value === 'number' && Number.isFinite(value);

  if (isFiniteNumber(s.topP) && s.topP > 0 && s.topP <= 1) out.top_p = s.topP;
  if (isFiniteNumber(s.topK) && s.topK > 0) out.top_k = s.topK;
  if (isFiniteNumber(s.minP) && s.minP > 0 && s.minP <= 1) out.min_p = s.minP;
  if (isFiniteNumber(s.repeatPenalty) && s.repeatPenalty > 1) out.repeat_penalty = s.repeatPenalty;
  return out;
}
|
|
|
|
/** Everything a streamer needs to run one chat completion. */
export interface ChatStreamRequest {
  /** Key of the model to run. `LMStudioStreamer.stream` trims it and rejects an empty value. */
  modelName: string;
  /** Conversation so far, in order. */
  messages: ChatStreamMessage[];
  /** Sampling temperature, forwarded unchanged. */
  temperature: number;
  /** Upper bound on tokens to generate. Omit to fall back to a conservative default. */
  maxTokens?: number;
  /** LM Studio context-overflow safety net used only if the prompt still exceeds the window. */
  contextOverflowPolicy?: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
  /** Sampling — defaults match small-model glitch-suppression presets. Each is omitted from the SDK call when undefined. */
  topP?: number;
  /** See `topP`. */
  topK?: number;
  /** See `topP`. */
  minP?: number;
  /** See `topP`. */
  repeatPenalty?: number;
  /** Draft model key for speculative decoding. Empty/undefined disables. */
  draftModel?: string;
  /** Cancels the stream when aborted. */
  signal?: AbortSignal;
}
|
|
|
|
/**
 * Subset of LM Studio's `PredictionResult.stats` we expose to callers.
 * A field is left undefined when the SDK result does not carry a value of the expected type.
 */
export interface ChatStreamStats {
  /** Generation throughput in tokens per second. */
  tokensPerSecond?: number;
  /** Time to first token, in seconds. */
  timeToFirstTokenSec?: number;
  /** Number of tokens generated. */
  predictedTokensCount?: number;
  /** Number of tokens in the prompt. */
  promptTokensCount?: number;
  /** Total prediction time, in seconds. */
  totalTimeSec?: number;
  /** Speculative decoding (only set when `draftModel` was used). Read from `usedDraftModelKey`. */
  draftModelKey?: string;
  /** Draft-token count, read from `totalDraftTokensCount`. */
  draftTokensCount?: number;
  /** Accepted draft-token count, read from `acceptedDraftTokensCount`. */
  acceptedDraftTokensCount?: number;
}
|
|
|
|
/**
 * One stream event. `token` carries generated text (possibly empty for the final event);
 * `stopReason` is set on the *last* event only and is the SDK's `stats.stopReason`
 * (e.g. `eosFound`, `maxPredictedTokensReached`, `contextLengthReached`, `userStopped`).
 * `stats` is also set on the *last* event when LM Studio reports prediction stats.
 */
export interface ChatStreamEvent {
  /** Generated text fragment; the empty string on the closing event. */
  token: string;
  /** Why generation ended. Undefined on token events and when the reason could not be read. */
  stopReason?: string;
  /** Prediction statistics, when available. */
  stats?: ChatStreamStats;
}
|
|
|
|
/** Contract for anything that can stream a chat completion token by token. */
export interface IChatStreamer {
  /** Token-level streaming for an LM Studio chat completion via the WebSocket SDK. */
  stream(req: ChatStreamRequest): AsyncIterable<ChatStreamEvent>;

  /**
   * Drop the SDK's cached handle for `modelName`. Callers invoke this when
   * the previous stream returned zero tokens with no error — a symptom of a
   * silently-disposed handle that needs a fresh WebSocket round-trip.
   *
   * Optional: implementations without a handle cache may omit it.
   */
  resetHandle?(modelName: string): Promise<void>;
}
|
|
|
|
/** `maxTokens` applied when a request does not set its own upper bound. */
const DEFAULT_MAX_PREDICTED_TOKENS = 4096;

/**
 * Error-message signatures treated as "the cached handle is bound to a dead WebSocket
 * binding". All of these resolve with the same fix (refresh the handle so the next lookup
 * mints a fresh one). Kept at module scope so the expressions are built once.
 */
const DEAD_HANDLE_ERROR_PATTERNS: readonly RegExp[] = [
  /\bdisposed\b/i,
  /lock\(\) request could not be registered/i,
  /channel\s+closed/i,
  /WebSocket\s+(?:is\s+not\s+open|closed|disconnected)/i,
  /Connection\s+(?:lost|reset|closed)/i,
  /\bECONNRESET\b/i,
  /socket\s+hang\s*up/i,
];

/** The parts of the object returned by `model.respond()` that this adapter relies on. */
interface LmStudioPredictionLike
  extends AsyncIterable<{ content?: string } | null | undefined>,
    PromiseLike<unknown> {
  cancel?: () => unknown;
}

/** True for a real, finite number (rejects `NaN` and `±Infinity`). */
function isFiniteSamplingNumber(value: unknown): value is number {
  return typeof value === 'number' && Number.isFinite(value);
}

/** Best-effort human-readable message for any thrown value. */
function describeStreamError(err: unknown): string {
  const message = (err as { message?: unknown } | null | undefined)?.message;
  return String(message ?? err);
}

/** True when the thrown value identifies itself as an `AbortError`. */
function isAbortError(err: unknown): boolean {
  return (err as { name?: unknown } | null | undefined)?.name === 'AbortError';
}

/**
 * Build the options object passed to `model.respond()`.
 *
 * Sampling overrides follow the same acceptance rules as the REST path
 * (`topP` / `minP` in (0, 1], `topK` > 0, `repeatPenalty` > 1, finite only) so an
 * out-of-range value falls back to the engine default on both paths.
 */
function buildRespondOptions(req: ChatStreamRequest): Record<string, unknown> {
  // Sampling defaults match the historical glitch-suppression preset for small /
  // quantized models (prevents broken Hangul tokens) but are overridable per call.
  const options: Record<string, unknown> = {
    temperature: req.temperature,
    maxTokens: req.maxTokens ?? DEFAULT_MAX_PREDICTED_TOKENS,
    // Safety net: if our own token budgeting still underestimated and the prompt
    // exceeds the model's context window, decide whether the SDK should fail
    // loudly (stopAtLimit — default) or silently drop content.
    contextOverflowPolicy: req.contextOverflowPolicy ?? 'stopAtLimit',
    signal: req.signal,
  };
  if (isFiniteSamplingNumber(req.topP) && req.topP > 0 && req.topP <= 1) options.topPSampling = req.topP;
  if (isFiniteSamplingNumber(req.topK) && req.topK > 0) options.topKSampling = req.topK;
  if (isFiniteSamplingNumber(req.minP) && req.minP > 0 && req.minP <= 1) options.minPSampling = req.minP;
  if (isFiniteSamplingNumber(req.repeatPenalty) && req.repeatPenalty > 1) options.repeatPenalty = req.repeatPenalty;

  // Speculative decoding — LM Studio loads the draft model lazily on first use if needed
  // (we also `preloadDraftModel` after main load to avoid that cold cost).
  const draftModel = req.draftModel?.trim();
  if (draftModel) options.draftModel = draftModel;
  return options;
}

/**
 * Pull the stop reason and the exposed stats subset out of an awaited prediction result.
 * Returns an empty object when the result carries no `stats` object.
 */
function readPredictionOutcome(result: unknown): { stopReason?: string; stats?: ChatStreamEvent['stats'] } {
  const rawStats = (result as { stats?: unknown } | null | undefined)?.stats;
  if (typeof rawStats !== 'object' || rawStats === null) return {};

  const s = rawStats as Record<string, unknown>;
  const asNumber = (value: unknown): number | undefined => (typeof value === 'number' ? value : undefined);
  const asString = (value: unknown): string | undefined => (typeof value === 'string' ? value : undefined);
  return {
    stopReason: asString(s.stopReason),
    stats: {
      tokensPerSecond: asNumber(s.tokensPerSecond),
      timeToFirstTokenSec: asNumber(s.timeToFirstTokenSec),
      predictedTokensCount: asNumber(s.predictedTokensCount),
      promptTokensCount: asNumber(s.promptTokensCount),
      totalTimeSec: asNumber(s.totalTimeSec),
      draftModelKey: asString(s.usedDraftModelKey),
      draftTokensCount: asNumber(s.totalDraftTokensCount),
      acceptedDraftTokensCount: asNumber(s.acceptedDraftTokensCount),
    },
  };
}

/**
 * Adapter that streams LM Studio chat completions via @lmstudio/sdk's `model.respond()`,
 * replacing the manual fetch + SSE parser path used for the OpenAI-compatible REST endpoint.
 *
 * Benefits over the REST path:
 *   - No SSE parsing (no `data: [DONE]` / partial-chunk fragility).
 *   - Reuses the same WebSocket the lifecycle manager already opened — handle lookup is cheap
 *     if the model is already loaded, and load-on-first-use is implicit when it isn't.
 *   - First-class `signal` support for user-cancel and abort propagation.
 */
export class LMStudioStreamer implements IChatStreamer {
  constructor(private readonly client: ILMStudioClient) {}

  /**
   * Stream one chat completion.
   *
   * Yields one event per non-empty text fragment, then a closing event with an empty
   * `token` plus `stopReason` / `stats` when they could be read. If the request is aborted
   * the generator ends silently without a closing event.
   *
   * One automatic retry: when the first attempt produced zero tokens — either a clean but
   * empty stream or a dead-handle error — the handle is refreshed and the request is run
   * once more. Nothing is retried after a token has been yielded, because restarting
   * would duplicate output.
   *
   * @param req Request to run; `req.modelName` must be non-blank.
   * @throws LMStudioLifecycleError when `req.modelName` is empty or whitespace.
   * @throws The original error when the stream fails and is not eligible for a retry.
   */
  async *stream(req: ChatStreamRequest): AsyncIterable<ChatStreamEvent> {
    const trimmedModel = (req.modelName || '').trim();
    if (!trimmedModel) {
      throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.');
    }

    // One automatic retry path: when the first attempt blows up with a
    // "Model is disposed!" / "lock() request could not be registered"
    // error before any tokens have been yielded, we drop the cached SDK
    // handle and try once more. These errors are caused by a previous
    // aborted prediction leaving the SDK's internal handle map pointing
    // at a dead WebSocket binding. We only retry when zero tokens have
    // streamed: if the consumer already saw partial output, restarting
    // would duplicate tokens.
    for (let attempt = 1; attempt <= 2; attempt++) {
      // Nothing to do for a request that was cancelled before it started; bail out
      // before a handle lookup that may implicitly load the model.
      if (req.signal?.aborted) return;

      const model = await this.client.getModelHandle(trimmedModel, attempt > 1 ? { refresh: true } : undefined);
      logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt });

      // The handle's concrete SDK type is not visible here, so the call goes through `any`
      // and the result is immediately pinned to the narrow shape this method uses.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const prediction: LmStudioPredictionLike = (model as any).respond(req.messages, buildRespondOptions(req));

      // Bridge AbortSignal → prediction.cancel(): without this, an
      // aborted request keeps generating on the LM Studio server. The
      // orphaned prediction holds locks on the model handle, which is
      // a known cause of "lock() request could not be registered" on
      // the very next request.
      let cancelRequested = false;
      const cancelPrediction = (): void => {
        if (cancelRequested) return;
        cancelRequested = true;
        try {
          // cancel() may be sync or return a promise; swallow both failure modes — best effort.
          void Promise.resolve(prediction.cancel?.()).catch(() => undefined);
        } catch {
          /* swallow — best effort */
        }
      };
      if (req.signal) {
        if (req.signal.aborted) cancelPrediction();
        else req.signal.addEventListener('abort', cancelPrediction, { once: true });
      }

      let yielded = 0;
      // Wrapped so that a thrown falsy value (`throw undefined`) still counts as a failure.
      let failure: { error: unknown } | undefined;
      // True once the fragment loop has finished on its own (drained or threw).
      let settled = false;
      try {
        for await (const fragment of prediction) {
          if (req.signal?.aborted) return;
          const token = fragment?.content ?? '';
          if (token) {
            yielded++;
            yield { token };
          }
        }
        settled = true;
      } catch (err: unknown) {
        settled = true;
        if (req.signal?.aborted) return;
        if (isAbortError(err)) return;
        failure = { error: err };
      } finally {
        req.signal?.removeEventListener?.('abort', cancelPrediction);
        // Reached with `settled === false` when we returned on abort or the consumer stopped
        // iterating early. In both cases the server-side prediction must not be left running.
        if (!settled) cancelPrediction();
      }

      if (!failure) {
        if (req.signal?.aborted) return;

        // The prediction object is also a Promise<PredictionResult>; awaiting it after
        // the stream drains gives us stats.stopReason so callers can tell a truncated
        // answer (maxPredictedTokensReached / contextLengthReached) from a normal one,
        // plus throughput numbers (tok/s, TTFT) we surface to the UI.
        let stopReason: string | undefined;
        let stats: ChatStreamEvent['stats'];
        try {
          ({ stopReason, stats } = readPredictionOutcome(await prediction));
          if (stopReason || stats) {
            logInfo('LM Studio SDK chat stream finished.', {
              model: trimmedModel, stopReason, tokensYielded: yielded,
              tokensPerSecond: stats?.tokensPerSecond, ttftSec: stats?.timeToFirstTokenSec,
            });
          }
        } catch {
          /* result unavailable on some SDK versions — non-fatal */
        }

        // Empty-but-clean stream is treated like a dead handle on attempt 1:
        // refresh the handle and try once more. Same root cause (handle bound to
        // a stale prediction) but no exception is thrown — just an empty stream.
        if (yielded === 0 && attempt === 1) {
          logInfo('Empty SDK stream with no error — retrying with a fresh SDK.', { model: trimmedModel });
          continue;
        }

        // Don't claim `eosFound` if we couldn't actually read the stop reason — leave it
        // undefined so the caller treats it as 'unknown' (and its mid-sentence heuristics kick in).
        yield { token: '', stopReason, stats };
        return;
      }

      const errMsg = describeStreamError(failure.error);
      const handleDead = DEAD_HANDLE_ERROR_PATTERNS.some((pattern) => pattern.test(errMsg));
      if (handleDead && yielded === 0 && attempt === 1) {
        logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg });
        continue;
      }

      logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: errMsg, attempt });
      throw failure.error;
    }
  }

  /**
   * Ask the client for a refreshed handle for `modelName`, dropping the cached one.
   * A blank name is a no-op. Failures are logged and swallowed.
   */
  async resetHandle(modelName: string): Promise<void> {
    const trimmed = (modelName || '').trim();
    if (!trimmed) return;
    try {
      await this.client.getModelHandle(trimmed, { refresh: true });
    } catch (err: unknown) {
      // Best effort — caller will see the next stream() attempt fail
      // with a normal error path if the refresh itself was broken.
      logError('LM Studio handle reset failed.', { model: trimmed, error: describeStreamError(err) });
    }
  }
}
|