chore: bump version to 2.80.27 and update core features

2026-05-09 01:16:12 +09:00
parent 5ffb472d22
commit 3220a126fd
41 changed files with 4457 additions and 72 deletions
@@ -1,10 +1,14 @@
-import { LMStudioClient as SDKClient } from '@lmstudio/sdk';
+import { LMStudioClient as SDKClient, LLM } from '@lmstudio/sdk';
 import { logError, logInfo } from '../utils';

 export interface ILMStudioClient {
     load(modelKey: string, signal?: AbortSignal): Promise<void>;
     unload(modelKey: string): Promise<void>;
     listLoaded(): Promise<string[]>;
+    /**
+     * Like listLoaded(), but serves a cached copy for up to `ttlMs` milliseconds
+     * to avoid hammering the SDK. The LMStudioClient implementation defaults
+     * `ttlMs` to 5000, drops the cache after a successful load()/unload() and
+     * when the base URL changes, and resolves to an empty list (without
+     * caching it) when the lookup throws.
+     */
+    listLoadedCached(ttlMs?: number): Promise<string[]>;
+    /**
+     * Resolve a chat-ready handle for an already-loaded (or just-loaded) model.
+     * The LMStudioClient implementation rejects with LMStudioLifecycleError
+     * when the SDK cannot provide the handle.
+     */
+    getModelHandle(modelKey: string): Promise<LLM>;
     isReachable(): Promise<boolean>;
     setBaseUrl(httpBaseUrl: string): void;
 }
@@ -36,6 +40,8 @@ export function httpToWebSocketUrl(httpBaseUrl: string): string | undefined {
 export class LMStudioClient implements ILMStudioClient {
    private _sdk: SDKClient | undefined;
    private _wsUrl: string | undefined;
+    private _loadedCache: { value: string[]; expiresAt: number } | undefined;
+    private static readonly DEFAULT_LOADED_CACHE_TTL_MS = 5000;

    constructor(httpBaseUrl: string) {
        this.setBaseUrl(httpBaseUrl);
@@ -46,6 +52,7 @@ export class LMStudioClient implements ILMStudioClient {
        if (ws !== this._wsUrl) {
            this._wsUrl = ws;
            this._sdk = undefined;
+            this._loadedCache = undefined;
        }
    }

@@ -59,6 +66,7 @@ export class LMStudioClient implements ILMStudioClient {
    async load(modelKey: string, signal?: AbortSignal): Promise<void> {
        try {
            await this.getSdk().llm.load(modelKey, signal ? { signal } : undefined);
+            this._loadedCache = undefined;
            logInfo('LM Studio model loaded.', { modelKey });
        } catch (e: any) {
            const msg = e?.message ?? String(e);
@@ -69,6 +77,7 @@ export class LMStudioClient implements ILMStudioClient {
    async unload(modelKey: string): Promise<void> {
        try {
            await this.getSdk().llm.unload(modelKey);
+            this._loadedCache = undefined;
            logInfo('LM Studio model unloaded.', { modelKey });
        } catch (e: any) {
            const msg = e?.message ?? String(e);
@@ -88,6 +97,29 @@ export class LMStudioClient implements ILMStudioClient {
        }
    }

+    /**
+     * Returns the loaded-model list, served from an in-memory cache while the
+     * cached entry is still within its TTL.
+     *
+     * Callers always receive a copy, so mutating the returned array cannot
+     * corrupt the cache. If the lookup throws, an empty list is returned and
+     * nothing is cached, so the next call retries.
+     *
+     * @param ttlMs Cache lifetime in milliseconds, counted from the moment the
+     *              lookup started. Defaults to `DEFAULT_LOADED_CACHE_TTL_MS`.
+     * @returns The identifiers reported by `listLoaded()`, or `[]` on failure.
+     */
+    async listLoadedCached(ttlMs: number = LMStudioClient.DEFAULT_LOADED_CACHE_TTL_MS): Promise<string[]> {
+        const now = Date.now();
+        if (this._loadedCache && this._loadedCache.expiresAt > now) {
+            return this._loadedCache.value.slice();
+        }
+        // Remember which endpoint this lookup targets: setBaseUrl() may switch
+        // servers (and clear the cache) while the request is still in flight.
+        const wsUrlAtStart = this._wsUrl;
+        try {
+            const value = await this.listLoaded();
+            // Only cache when the endpoint is unchanged; otherwise the result
+            // describes the previous server and would poison the cleared cache.
+            // NOTE(review): a load()/unload() finishing mid-flight can still be
+            // overwritten here; closing that gap needs a generation counter field.
+            if (this._wsUrl === wsUrlAtStart) {
+                this._loadedCache = { value, expiresAt: now + ttlMs };
+            }
+            return value.slice();
+        } catch {
+            // Best-effort: report "nothing loaded" and leave the cache untouched.
+            return [];
+        }
+    }
+
+    /**
+     * Resolves an SDK handle for `modelKey` through `llm.model()`.
+     *
+     * NOTE(review): per the LM Studio SDK docs, `llm.model(key)` loads the model
+     * on demand when it is not loaded yet — confirm for the pinned SDK version.
+     * Because of that, a successful lookup drops the loaded-model cache unless
+     * the cache already lists `modelKey`, mirroring what load()/unload() do.
+     *
+     * @param modelKey Key of the model to resolve.
+     * @returns The SDK model handle.
+     * @throws LMStudioLifecycleError wrapping the SDK error when the handle
+     *         cannot be acquired.
+     */
+    async getModelHandle(modelKey: string): Promise<LLM> {
+        try {
+            const handle = await this.getSdk().llm.model(modelKey);
+            // The lookup may have loaded the model implicitly; a cached list
+            // that does not mention it can no longer be trusted.
+            if (this._loadedCache && !this._loadedCache.value.includes(modelKey)) {
+                this._loadedCache = undefined;
+            }
+            return handle;
+        } catch (e: any) {
+            const msg = e?.message ?? String(e);
+            throw new LMStudioLifecycleError(`Failed to acquire LM Studio model handle "${modelKey}": ${msg}`, e);
+        }
+    }
+
    async isReachable(): Promise<boolean> {
        try {
            await this.getSdk().llm.listLoaded();
@@ -1,6 +1,7 @@
 import type { ILMStudioClient } from './client';
 import type { IActivityTracker } from './activityTracker';
 import type { EngineKind } from '../utils';
+import type { ISystemSpecsProvider, IModelMemoryEstimator } from '../system/specs';
 import { logError, logInfo } from '../utils';

 export type LifecycleState = 'idle' | 'loading' | 'loaded' | 'streaming' | 'unloading';
@@ -19,6 +20,15 @@ export interface LifecycleManagerDeps {
    switchDebounceMs?: number;
    /** Initial engine. Default 'lmstudio'. */
    initialEngine?: EngineKind;
+    /**
+     * Optional pre-load memory budget check. When both are provided, a warn-only
+     * advisory is emitted via `notifyError` (and a structured log line) before
+     * attempting to load a model that the heuristic predicts will not fit.
+     * The load is **not** blocked — the user may have a quantization the
+     * estimator does not recognize.
+     */
+    systemSpecs?: ISystemSpecsProvider;
+    memoryEstimator?: IModelMemoryEstimator;
 }

 export class ModelLifecycleManager {
@@ -207,6 +217,38 @@ export class ModelLifecycleManager {
        }
    }

+    /**
+     * Advisory-only pre-load RAM check.
+     *
+     * Does nothing unless both `systemSpecs` and `memoryEstimator` were supplied
+     * in the deps. When the estimate for `modelKey` is above the safe budget, a
+     * structured log line is written and the advisory text is passed to
+     * `notifyError`. The load is never blocked: the heuristic can be wrong
+     * (unrecognized quantization, sparse / MoE models) and the user may have
+     * explicit intent. Any exception raised while checking is logged and
+     * swallowed for the same reason.
+     */
+    private checkMemoryBudget(modelKey: string): void {
+        const { systemSpecs, memoryEstimator } = this.deps;
+        if (!systemSpecs || !memoryEstimator) {
+            return;
+        }
+
+        try {
+            const specs = systemSpecs.get();
+            const requiredGB = memoryEstimator.estimate(modelKey);
+
+            // Negated `>` rather than `<=`, so a NaN estimate stays silent.
+            if (!(requiredGB > specs.safeModelBudgetGB)) {
+                return;
+            }
+
+            const advisory = [
+                `Model "${modelKey}" estimated at ~${requiredGB.toFixed(1)}GB `,
+                `exceeds your safe RAM budget of ${specs.safeModelBudgetGB}GB. `,
+                `If load fails, try a smaller quantization (q4 / q5).`,
+            ].join('');
+
+            const details = {
+                model: modelKey,
+                requiredGB: Number(requiredGB.toFixed(2)),
+                budgetGB: specs.safeModelBudgetGB,
+                totalRamGB: Number(specs.totalRamGB.toFixed(2)),
+            };
+            logInfo('LM Studio pre-load memory advisory.', details);
+
+            this.deps.notifyError?.(advisory);
+        } catch (e: any) {
+            // The check is diagnostic; a failure here must not stop the load.
+            logError('Memory budget check failed.', { error: e?.message ?? String(e) });
+        }
+    }
+
    private async doSwitch(modelKey: string): Promise<void> {
        if (this.disposed) return;
        if (this.engine !== 'lmstudio') return;
@@ -225,6 +267,8 @@ export class ModelLifecycleManager {
            this.currentModel = null;
        }

+        this.checkMemoryBudget(modelKey);
+
        this.state = 'loading';
        this.currentModel = modelKey;
        const ac = new AbortController();
@@ -0,0 +1,64 @@
+import type { ILMStudioClient } from './client';
+import { LMStudioLifecycleError } from './client';
+import { logError, logInfo } from '../utils';
+
+/** One conversation turn handed to a chat streamer. */
+export interface ChatStreamMessage {
+    /** Who authored this turn. */
+    role: 'system' | 'user' | 'assistant';
+    /** Text of the turn. */
+    content: string;
+}
+
+/** Everything a chat streamer needs to run one streamed completion. */
+export interface ChatStreamRequest {
+    /** Model to chat with. LMStudioStreamer trims it and throws when it is blank. */
+    modelName: string;
+    /** Conversation turns, passed to the model's `respond()` call as-is. */
+    messages: ChatStreamMessage[];
+    /** Sampling temperature, forwarded unchanged. */
+    temperature: number;
+    /** Upper bound on generated tokens. LMStudioStreamer uses 4096 when omitted. */
+    maxTokens?: number;
+    /** Cancels the stream; LMStudioStreamer then ends the iteration without throwing. */
+    signal?: AbortSignal;
+}
+
+/** Contract for anything that can stream a chat completion token by token. */
+export interface IChatStreamer {
+    /**
+     * Token-level streaming for an LM Studio chat completion via the WebSocket SDK.
+     *
+     * @param req Model, conversation and sampling settings for this completion.
+     * @returns An async sequence of `{ token }` chunks in generation order.
+     */
+    stream(req: ChatStreamRequest): AsyncIterable<{ token: string }>;
+}
+
+/**
+ * Adapter that streams LM Studio chat completions via @lmstudio/sdk's `model.respond()`,
+ * replacing the manual fetch + SSE parser path used for the OpenAI-compatible REST endpoint.
+ *
+ * Benefits over the REST path:
+ *  - No SSE parsing (no `data: [DONE]` / partial-chunk fragility).
+ *  - Reuses the same WebSocket the lifecycle manager already opened — handle lookup is cheap
+ *    if the model is already loaded, and load-on-first-use is implicit when it isn't.
+ *  - First-class `signal` support for user-cancel and abort propagation.
+ */
+export class LMStudioStreamer implements IChatStreamer {
+    /** Token cap applied when the request does not specify `maxTokens`. */
+    private static readonly DEFAULT_MAX_TOKENS = 4096;
+
+    constructor(private readonly client: ILMStudioClient) {}
+
+    /**
+     * Streams one chat completion as `{ token }` chunks.
+     *
+     * Cancellation is silent: if `req.signal` is aborted before, during or
+     * after the handle lookup, or the SDK raises an `AbortError`, the
+     * iteration simply ends. Empty fragments are skipped.
+     *
+     * @param req Model name, conversation and sampling settings.
+     * @throws LMStudioLifecycleError when `req.modelName` is blank.
+     * @throws Whatever `getModelHandle()` or the SDK prediction raises for
+     *         non-abort failures; prediction failures are logged first.
+     */
+    async *stream(req: ChatStreamRequest): AsyncIterable<{ token: string }> {
+        const trimmedModel = (req.modelName || '').trim();
+        if (!trimmedModel) {
+            throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.');
+        }
+
+        // A request cancelled before it starts must not trigger a handle
+        // lookup, which could implicitly load a model nobody wants anymore.
+        if (req.signal?.aborted) return;
+
+        const model = await this.client.getModelHandle(trimmedModel);
+        // The lookup can take a while; re-check before starting a prediction.
+        if (req.signal?.aborted) return;
+        logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length });
+
+        try {
+            // Started inside the try so a synchronous failure of respond() is
+            // logged through the same path as a mid-stream failure.
+            // NOTE(review): `as any` sidesteps the SDK typings for respond(); confirm
+            // whether the typed call compiles and drop the cast if so.
+            const prediction = (model as any).respond(req.messages, {
+                temperature: req.temperature,
+                maxTokens: req.maxTokens ?? LMStudioStreamer.DEFAULT_MAX_TOKENS,
+                signal: req.signal,
+            });
+
+            for await (const fragment of prediction as AsyncIterable<{ content: string }>) {
+                if (req.signal?.aborted) return;
+                const token = fragment?.content ?? '';
+                if (token) yield { token };
+            }
+        } catch (err: any) {
+            // Cancellation is an expected outcome, not an error to surface.
+            if (req.signal?.aborted) return;
+            if (err?.name === 'AbortError') return;
+            logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: err?.message ?? String(err) });
+            throw err;
+        }
+    }
+}