// connectai/src/lmstudio/client.ts

import { LMStudioClient as SDKClient, LLM, type LLMLoadModelConfig } from '@lmstudio/sdk';
import { logError, logInfo } from '../utils';

/**
 * Load-time options forwarded to LM Studio's `llm.load()`. Subset of `LLMLoadModelConfig`.
 *
 * Every field is optional. `LMStudioClient` only forwards a field when it holds
 * a value of the expected type, so anything left unset is simply omitted from
 * the load request.
 */
export interface LMStudioLoadConfig {
    /** Forwarded as `flashAttention` when it is a boolean. */
    flashAttention?: boolean;
    /** "max" | "off" | number 0-1. Forwarded wrapped as `gpu: { ratio }`. */
    gpuOffloadRatio?: 'max' | 'off' | number;
    /** Forwarded as `offloadKVCacheToGpu` when it is a boolean. */
    offloadKVCacheToGpu?: boolean;
    /** Forwarded as `keepModelInMemory` when it is a boolean. */
    keepModelInMemory?: boolean;
    /** Forwarded as `useFp16ForKVCache` when it is a boolean. */
    useFp16ForKVCache?: boolean;
    /** 0 / undefined = engine default (only values greater than 0 are forwarded). */
    evalBatchSize?: number;
}

/**
 * Contract for talking to an LM Studio instance: model lifecycle (load/unload),
 * model discovery (loaded/downloaded lists), handle acquisition and reachability.
 * `LMStudioClient` in this file is the SDK-backed implementation.
 */
export interface ILMStudioClient {
    /**
     * Load `modelKey`. `signal` (if given) is passed through so the load can be
     * aborted; `loadConfig` carries optional load-time tuning.
     */
    load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void>;
    /** Unload the model identified by `modelKey`. */
    unload(modelKey: string): Promise<void>;
    /** Identifiers of the models that are currently loaded. Not cached. */
    listLoaded(): Promise<string[]>;
    /** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */
    listLoadedCached(ttlMs?: number): Promise<string[]>;
    /**
     * List every LLM the user has downloaded into LM Studio, regardless of
     * whether it is currently loaded. Returns the SDK `modelKey` of each entry —
     * the exact identifier `llm.load()` accepts. Use this for the dropdown so
     * the list does not depend on LM Studio's JIT setting (REST `/v1/models`
     * only returns loaded models when JIT is off).
     */
    listDownloaded(): Promise<string[]>;
    /** Cached variant; the downloaded list only changes when the user installs/removes a model. */
    listDownloadedCached(ttlMs?: number): Promise<string[]>;
    /** Pre-warm a draft model for speculative decoding. Idempotent + best-effort. */
    preloadDraftModel?(draftModelKey: string): Promise<void>;
    /**
     * Resolve a chat-ready handle for an already-loaded (or just-loaded) model.
     *
     * `options.refresh: true` drops the SDK + WebSocket so any disposed handle
     * sitting in the SDK's internal handle map is discarded. Use this after a
     * "Model is disposed!" or "lock() request could not be registered" error.
     */
    getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM>;
    /**
     * The model's *actually-loaded* context window in tokens (LM Studio's
     * `llm.getContextLength()`), or `undefined` if it can't be determined.
     *
     * The user-facing `g1nation.contextLength` setting is only a budgeting
     * intent — the real ceiling is whatever window the model was loaded with.
     * Budgeting against the larger of the two silently overflows the server,
     * which then truncates the prompt or emits EOS as the first token (empty
     * answer). Cached per-key because it only changes on reload.
     */
    getModelContextLength(modelKey: string): Promise<number | undefined>;
    /**
     * Whether LM Studio can currently be reached. The `LMStudioClient`
     * implementation resolves `false` on failure instead of rejecting.
     */
    isReachable(): Promise<boolean>;
    /** Point the client at a (possibly different) LM Studio HTTP base URL. */
    setBaseUrl(httpBaseUrl: string): void;
}

/**
 * Error type used for failed LM Studio lifecycle operations (loading,
 * unloading, listing loaded models, acquiring a model handle).
 *
 * The original failure, when there is one, is kept on `cause` so callers can
 * inspect it.
 */
export class LMStudioLifecycleError extends Error {
    /**
     * @param message Human-readable description of what failed.
     * @param cause The underlying error or thrown value, if any.
     */
    constructor(
        message: string,
        public readonly cause?: unknown,
    ) {
        super(message);
        // Give the error a stable, recognizable name in logs and stack traces.
        this.name = 'LMStudioLifecycleError';
    }
}

/**
 * Convert an LM Studio HTTP(S) base URL into the WebSocket root URL for the SDK.
 *
 * - `http:` becomes `ws:` and `https:` becomes `wss:`; `ws:`/`wss:` pass through.
 * - Known REST path suffixes (`/api/v0`, `/api/v1`, `/v1`, `/api`) are removed
 *   from the end of the path, repeatedly, until none is left.
 * - Trailing slashes are ignored, so `http://host:1234/v1/` and
 *   `http://host:1234/v1` both yield `ws://host:1234`.
 *
 * @param httpBaseUrl Base URL as configured by the user; may be empty or padded with whitespace.
 * @returns The WebSocket URL without a trailing slash, or `undefined` when the
 *          input is empty, unparsable, or uses an unsupported protocol.
 */
export function httpToWebSocketUrl(httpBaseUrl: string): string | undefined {
    const trimmed = (httpBaseUrl || '').trim();
    if (!trimmed) return undefined;
    try {
        const url = new URL(trimmed);
        if (url.protocol === 'http:') url.protocol = 'ws:';
        else if (url.protocol === 'https:') url.protocol = 'wss:';
        else if (url.protocol !== 'ws:' && url.protocol !== 'wss:') return undefined;

        // Strip every REST-only path suffix LM Studio ships with so the SDK lands on the
        // WebSocket root. Trailing slashes are removed before each comparison; otherwise
        // a base URL such as ".../v1/" would never match "/v1" and keep its REST suffix.
        const REST_SUFFIXES = ['/api/v0', '/api/v1', '/v1', '/api'];
        const stripTrailingSlashes = (p: string): string => p.replace(/\/+$/, '');
        let path = stripTrailingSlashes(url.pathname);
        let changed = true;
        // Loop so stacked suffixes (e.g. ".../api/v1") fully unwind.
        while (changed) {
            changed = false;
            for (const suffix of REST_SUFFIXES) {
                if (path.endsWith(suffix)) {
                    path = stripTrailingSlashes(path.slice(0, -suffix.length));
                    changed = true;
                    break;
                }
            }
        }
        url.pathname = path;
        return url.toString().replace(/\/+$/, '');
    } catch {
        // `new URL` throws on unparsable input; treat that as "no usable URL".
        return undefined;
    }
}

/**
 * `ILMStudioClient` implementation backed by the `@lmstudio/sdk` client.
 *
 * The SDK client is created lazily on first use and re-created when the base
 * URL changes or when a handle refresh is requested. Loaded-model,
 * downloaded-model and context-length lookups are cached with short TTLs.
 */
export class LMStudioClient implements ILMStudioClient {
    /** Lazily-created SDK client; `undefined` until first use or after a reset. */
    private _sdk: SDKClient | undefined;
    /** WebSocket URL derived from the HTTP base URL; when `undefined` the SDK is built without a `baseUrl`. */
    private _wsUrl: string | undefined;
    private _loadedCache: { value: string[]; expiresAt: number } | undefined;
    private _downloadedCache: { value: string[]; expiresAt: number } | undefined;
    /** Context length per (trimmed) model key. */
    private _contextLengthCache = new Map<string, { value: number; expiresAt: number }>();
    private static readonly DEFAULT_LOADED_CACHE_TTL_MS = 5000;
    private static readonly DEFAULT_DOWNLOADED_CACHE_TTL_MS = 60_000;
    private static readonly DEFAULT_CONTEXT_LENGTH_CACHE_TTL_MS = 60_000;

    /**
     * @param httpBaseUrl LM Studio HTTP base URL; converted to a WebSocket URL for the SDK.
     */
    constructor(httpBaseUrl: string) {
        this.setBaseUrl(httpBaseUrl);
    }

    /**
     * Re-target the client. When the derived WebSocket URL actually changes, the
     * SDK client and every cache are dropped so nothing from the previous server
     * leaks into answers about the new one.
     */
    setBaseUrl(httpBaseUrl: string): void {
        const ws = httpToWebSocketUrl(httpBaseUrl);
        if (ws === this._wsUrl) return;
        this._wsUrl = ws;
        this._sdk = undefined;
        this.invalidateCaches();
    }

    /** Return the SDK client, creating it on first use. */
    private getSdk(): SDKClient {
        if (!this._sdk) {
            this._sdk = new SDKClient(this._wsUrl ? { baseUrl: this._wsUrl } : {});
        }
        return this._sdk;
    }

    /**
     * Extract a printable message from an arbitrary thrown value: the value's
     * `message` when it has one, otherwise its string form.
     */
    private static _errorMessage(e: unknown): string {
        const message = (e as { message?: unknown } | null | undefined)?.message;
        return message != null ? String(message) : String(e);
    }

    /**
     * Drop cached state that depends on which models are loaded and how.
     * The context-length cache is included because a (re)load can change the
     * window a model runs with.
     */
    private _invalidateLoadedState(): void {
        this._loadedCache = undefined;
        this._contextLengthCache.clear();
    }

    /**
     * Load a model through the SDK.
     *
     * @param modelKey Key of the model to load.
     * @param signal Optional abort signal, passed through to the SDK.
     * @param loadConfig Optional load-time tuning; only set fields are forwarded.
     * @throws LMStudioLifecycleError when the SDK call fails (original error on `cause`).
     */
    async load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void> {
        try {
            const opts: { signal?: AbortSignal; config?: LLMLoadModelConfig } = {};
            if (signal) opts.signal = signal;
            const config = this._buildLoadConfig(loadConfig);
            if (Object.keys(config).length > 0) opts.config = config;
            await this.getSdk().llm.load(modelKey, Object.keys(opts).length > 0 ? opts : undefined);
            // Loading does not change the downloaded-models set; leave _downloadedCache alone.
            this._invalidateLoadedState();
            logInfo('LM Studio model loaded.', { modelKey, configKeys: Object.keys(config) });
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            throw new LMStudioLifecycleError(`Failed to load LM Studio model "${modelKey}": ${msg}`, e);
        }
    }

    /** Translate our flat LMStudioLoadConfig into LM Studio's nested LLMLoadModelConfig shape. */
    private _buildLoadConfig(lc: LMStudioLoadConfig | undefined): LLMLoadModelConfig {
        const out: LLMLoadModelConfig = {};
        if (!lc) return out;
        if (typeof lc.flashAttention === 'boolean') out.flashAttention = lc.flashAttention;
        if (typeof lc.offloadKVCacheToGpu === 'boolean') out.offloadKVCacheToGpu = lc.offloadKVCacheToGpu;
        if (typeof lc.keepModelInMemory === 'boolean') out.keepModelInMemory = lc.keepModelInMemory;
        if (typeof lc.useFp16ForKVCache === 'boolean') out.useFp16ForKVCache = lc.useFp16ForKVCache;
        // 0 (or a negative value) means "engine default", so it is not forwarded.
        if (typeof lc.evalBatchSize === 'number' && lc.evalBatchSize > 0) out.evalBatchSize = lc.evalBatchSize;
        if (lc.gpuOffloadRatio !== undefined) {
            // GPUSetting is deprecated but still accepted — wraps a single `ratio`.
            out.gpu = { ratio: lc.gpuOffloadRatio as any };
        }
        return out;
    }

    /**
     * Pre-warm a draft model for speculative decoding. Best-effort: blank keys
     * are ignored, an SDK without `unstable_preloadDraftModel` is a no-op, and
     * failures are logged rather than thrown.
     */
    async preloadDraftModel(draftModelKey: string): Promise<void> {
        const key = (draftModelKey || '').trim();
        if (!key) return;
        try {
            // The method is unstable and may be missing, hence the untyped access.
            const llm: any = this.getSdk().llm;
            if (typeof llm.unstable_preloadDraftModel === 'function') {
                await llm.unstable_preloadDraftModel(key);
                logInfo('LM Studio draft model preloaded.', { draftModelKey: key });
            }
        } catch (e: unknown) {
            // Best-effort — the main model's respond({draftModel}) will still load it lazily.
            logError('LM Studio draft model preload failed.', { draftModelKey: key, error: LMStudioClient._errorMessage(e) });
        }
    }

    /**
     * Unload a model through the SDK.
     *
     * @throws LMStudioLifecycleError when the SDK call fails (original error on `cause`).
     */
    async unload(modelKey: string): Promise<void> {
        try {
            await this.getSdk().llm.unload(modelKey);
            this._invalidateLoadedState();
            logInfo('LM Studio model unloaded.', { modelKey });
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            throw new LMStudioLifecycleError(`Failed to unload LM Studio model "${modelKey}": ${msg}`, e);
        }
    }

    /** Force the next downloaded/loaded-models call to re-fetch (use after install / remove). */
    invalidateCaches(): void {
        this._loadedCache = undefined;
        this._downloadedCache = undefined;
        this._contextLengthCache.clear();
    }

    /**
     * Query the SDK for the currently loaded models.
     *
     * @returns One id per model: `identifier`, else `modelKey`, else `path`;
     *          entries without a non-empty string id are dropped.
     * @throws LMStudioLifecycleError when the SDK call fails.
     */
    async listLoaded(): Promise<string[]> {
        try {
            const items: any[] = await this.getSdk().llm.listLoaded();
            return items
                .map((m) => m?.identifier ?? m?.modelKey ?? m?.path ?? null)
                .filter((id): id is string => typeof id === 'string' && id.length > 0);
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            throw new LMStudioLifecycleError(`Failed to list loaded LM Studio models: ${msg}`, e);
        }
    }

    /**
     * Cached variant of `listLoaded()`. Never rejects: on failure the error is
     * logged and an empty list is returned (and nothing is cached, so the next
     * call retries). Callers get a copy and may mutate it freely.
     */
    async listLoadedCached(ttlMs: number = LMStudioClient.DEFAULT_LOADED_CACHE_TTL_MS): Promise<string[]> {
        const now = Date.now();
        if (this._loadedCache && this._loadedCache.expiresAt > now) {
            return this._loadedCache.value.slice();
        }
        try {
            const value = await this.listLoaded();
            this._loadedCache = { value, expiresAt: now + ttlMs };
            return value.slice();
        } catch (e: unknown) {
            logError('Failed to refresh loaded LM Studio model list; returning empty list.', {
                error: LMStudioClient._errorMessage(e),
            });
            return [];
        }
    }

    /**
     * List the `modelKey` of every downloaded LLM. Never rejects: failures are
     * logged and reported as an empty list.
     */
    async listDownloaded(): Promise<string[]> {
        try {
            const items: any[] = await this.getSdk().system.listDownloadedModels('llm');
            return items
                .map((m) => m?.modelKey ?? null)
                .filter((k): k is string => typeof k === 'string' && k.length > 0);
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            logError('Failed to list downloaded LM Studio models.', { error: msg });
            return [];
        }
    }

    /** Cached variant of `listDownloaded()`. Callers get a copy of the list. */
    async listDownloadedCached(ttlMs: number = LMStudioClient.DEFAULT_DOWNLOADED_CACHE_TTL_MS): Promise<string[]> {
        const now = Date.now();
        if (this._downloadedCache && this._downloadedCache.expiresAt > now) {
            return this._downloadedCache.value.slice();
        }
        const value = await this.listDownloaded();
        // Only cache non-empty results — an empty array often signals a transient SDK error,
        // and caching that for 60s would hide a freshly-started LM Studio process.
        if (value.length > 0) {
            this._downloadedCache = { value, expiresAt: now + ttlMs };
        }
        return value.slice();
    }

    /**
     * Resolve a model handle via the SDK's `llm.model()`.
     *
     * @param options `refresh: true` discards the current SDK client first so a fresh one is created.
     * @throws LMStudioLifecycleError when the handle cannot be acquired.
     */
    async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM> {
        try {
            if (options?.refresh) {
                // Recreate the SDK + WebSocket so the SDK's internal handle
                // cache is dropped. The next llm.model() call mints a fresh
                // handle instead of returning the disposed one from the
                // previous (aborted) prediction.
                // NOTE(review): the previous client is only dereferenced here, not
                // explicitly closed — confirm whether the SDK needs a dispose call.
                this._sdk = undefined;
                this._loadedCache = undefined;
                logInfo('LM Studio SDK handle refresh requested — dropped cached SDK client.', { modelKey });
            }
            return await this.getSdk().llm.model(modelKey);
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            throw new LMStudioLifecycleError(`Failed to acquire LM Studio model handle "${modelKey}": ${msg}`, e);
        }
    }

    /**
     * Context window (in tokens) the model is actually loaded with.
     *
     * @returns A positive finite number, or `undefined` when the key is blank,
     *          the handle has no `getContextLength`, the value is unusable, or
     *          the query fails (failures are logged, never thrown).
     */
    async getModelContextLength(modelKey: string): Promise<number | undefined> {
        const key = (modelKey || '').trim();
        if (!key) return undefined;
        const now = Date.now();
        const cached = this._contextLengthCache.get(key);
        if (cached && cached.expiresAt > now) return cached.value;
        try {
            // Reuses the same handle the stream will use. If the model isn't
            // loaded yet this forces a JIT load — acceptable since the very next
            // step streams from it anyway. Best-effort: any failure (incl. the
            // load-coalescing "Operation canceled" race) falls back to undefined
            // so the caller keeps the configured window.
            const handle: any = await this.getSdk().llm.model(key);
            const len = typeof handle?.getContextLength === 'function'
                ? await handle.getContextLength()
                : undefined;
            // Only usable values are cached, so a bad answer is retried next time.
            if (typeof len === 'number' && Number.isFinite(len) && len > 0) {
                this._contextLengthCache.set(key, {
                    value: len,
                    expiresAt: now + LMStudioClient.DEFAULT_CONTEXT_LENGTH_CACHE_TTL_MS,
                });
                return len;
            }
            return undefined;
        } catch (e: unknown) {
            logError('Failed to query LM Studio model context length.', { modelKey: key, error: LMStudioClient._errorMessage(e) });
            return undefined;
        }
    }

    /**
     * Probe LM Studio by listing loaded models through the SDK.
     *
     * @returns `true` when the call succeeds; `false` (after logging) when it throws.
     */
    async isReachable(): Promise<boolean> {
        try {
            await this.getSdk().llm.listLoaded();
            return true;
        } catch (e: unknown) {
            logError('LM Studio not reachable.', { error: LMStudioClient._errorMessage(e) });
            return false;
        }
    }
}