chore: version up to 2.80.34 and package

2026-05-12 22:54:21 +09:00
parent 148bfb070b
commit 065e598cca
26 changed files with 2023 additions and 139 deletions
@@ -11,13 +11,26 @@ export interface ChatStreamRequest {
    modelName: string;
    messages: ChatStreamMessage[];
    temperature: number;
+    /** Upper bound on tokens to generate. Omit to fall back to a conservative default (`LMStudioStreamer` uses 4096). */
    maxTokens?: number;
+    /** LM Studio context-overflow safety net used only if the prompt still exceeds the window. */
+    contextOverflowPolicy?: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
    signal?: AbortSignal;
 }

+/**
+ * One stream event. `token` carries generated text (possibly empty for the final event).
+ * `stopReason` is set on the *last* event only and is the SDK's `stats.stopReason` (e.g. `eosFound`, `maxPredictedTokensReached`, `contextLengthReached`, `userStopped`).
+ * A stream cancelled via `signal` may end without a `stopReason` event; `LMStudioStreamer` falls back to `eosFound` when the SDK reports no stop reason.
+ */
+export interface ChatStreamEvent {
+    token: string;
+    stopReason?: string;
+}
+
 export interface IChatStreamer {
    /** Token-level streaming for an LM Studio chat completion via the WebSocket SDK. */
-    stream(req: ChatStreamRequest): AsyncIterable<{ token: string }>;
+    stream(req: ChatStreamRequest): AsyncIterable<ChatStreamEvent>;
    /**
     * Drop the SDK's cached handle for `modelName`. Callers invoke this when
     * the previous stream returned zero tokens with no error — a symptom of a
@@ -39,7 +52,7 @@ export interface IChatStreamer {
 export class LMStudioStreamer implements IChatStreamer {
    constructor(private readonly client: ILMStudioClient) {}

-    async *stream(req: ChatStreamRequest): AsyncIterable<{ token: string }> {
+    async *stream(req: ChatStreamRequest): AsyncIterable<ChatStreamEvent> {
        const trimmedModel = (req.modelName || '').trim();
        if (!trimmedModel) {
            throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.');
@@ -62,6 +75,10 @@ export class LMStudioStreamer implements IChatStreamer {
            const prediction = (model as any).respond(req.messages, {
                temperature: req.temperature,
                maxTokens: req.maxTokens ?? 4096,
+                // Safety net: if our own token budgeting still underestimated and the prompt
+                // exceeds the model's context window, decide whether the SDK should fail
+                // loudly (stopAtLimit — default) or silently drop content.
+                contextOverflowPolicy: req.contextOverflowPolicy ?? 'stopAtLimit',
                signal: req.signal,
            });

@@ -98,7 +115,22 @@ export class LMStudioStreamer implements IChatStreamer {
                req.signal?.removeEventListener?.('abort', onAbort);
            }

-            if (!caught) return;
+            if (!caught) {
+                if (req.signal?.aborted) return;
+                // The prediction object is also a Promise<PredictionResult>; awaiting it after
+                // the stream drains gives us stats.stopReason so callers can tell a truncated
+                // answer (maxPredictedTokensReached / contextLengthReached) from a normal one.
+                let stopReason: string | undefined;
+                try {
+                    const result: any = await prediction;
+                    stopReason = result?.stats?.stopReason;
+                    if (stopReason) {
+                        logInfo('LM Studio SDK chat stream finished.', { model: trimmedModel, stopReason, tokensYielded: yielded });
+                    }
+                } catch { /* result unavailable on some SDK versions — non-fatal */ }
+                yield { token: '', stopReason: stopReason ?? 'eosFound' };
+                return;
+            }

            const errMsg = String(caught?.message ?? caught);
            const handleDead = /\bdisposed\b/i.test(errMsg)