chore: bump version to 2.80.27 and update core features
This commit is contained in:
+33
-1
@@ -1,10 +1,14 @@
|
||||
import { LMStudioClient as SDKClient } from '@lmstudio/sdk';
|
||||
import { LMStudioClient as SDKClient, LLM } from '@lmstudio/sdk';
|
||||
import { logError, logInfo } from '../utils';
|
||||
|
||||
/**
 * Contract implemented by `LMStudioClient`: model load/unload, loaded-model
 * listing (direct and cached), handle lookup, reachability probing and
 * base-URL reconfiguration.
 */
export interface ILMStudioClient {
  /**
   * Load the model identified by `modelKey`.
   * @param signal Optional abort signal handed to the underlying SDK load call.
   */
  load(modelKey: string, signal?: AbortSignal): Promise<void>;

  /** Unload the model identified by `modelKey`. */
  unload(modelKey: string): Promise<void>;

  /** List the currently loaded models (presumably by model key — confirm against the implementation). */
  listLoaded(): Promise<string[]>;

  /** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */
  listLoadedCached(ttlMs?: number): Promise<string[]>;

  /** Resolve a chat-ready handle for an already-loaded (or just-loaded) model. */
  getModelHandle(modelKey: string): Promise<LLM>;

  /** Resolve to whether the LM Studio backend can currently be reached. */
  isReachable(): Promise<boolean>;

  /** Re-point the client at a different HTTP base URL. */
  setBaseUrl(httpBaseUrl: string): void;
}
|
||||
@@ -36,6 +40,8 @@ export function httpToWebSocketUrl(httpBaseUrl: string): string | undefined {
|
||||
export class LMStudioClient implements ILMStudioClient {
|
||||
private _sdk: SDKClient | undefined;
|
||||
private _wsUrl: string | undefined;
|
||||
private _loadedCache: { value: string[]; expiresAt: number } | undefined;
|
||||
private static readonly DEFAULT_LOADED_CACHE_TTL_MS = 5000;
|
||||
|
||||
constructor(httpBaseUrl: string) {
|
||||
this.setBaseUrl(httpBaseUrl);
|
||||
@@ -46,6 +52,7 @@ export class LMStudioClient implements ILMStudioClient {
|
||||
if (ws !== this._wsUrl) {
|
||||
this._wsUrl = ws;
|
||||
this._sdk = undefined;
|
||||
this._loadedCache = undefined;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,6 +66,7 @@ export class LMStudioClient implements ILMStudioClient {
|
||||
async load(modelKey: string, signal?: AbortSignal): Promise<void> {
|
||||
try {
|
||||
await this.getSdk().llm.load(modelKey, signal ? { signal } : undefined);
|
||||
this._loadedCache = undefined;
|
||||
logInfo('LM Studio model loaded.', { modelKey });
|
||||
} catch (e: any) {
|
||||
const msg = e?.message ?? String(e);
|
||||
@@ -69,6 +77,7 @@ export class LMStudioClient implements ILMStudioClient {
|
||||
async unload(modelKey: string): Promise<void> {
|
||||
try {
|
||||
await this.getSdk().llm.unload(modelKey);
|
||||
this._loadedCache = undefined;
|
||||
logInfo('LM Studio model unloaded.', { modelKey });
|
||||
} catch (e: any) {
|
||||
const msg = e?.message ?? String(e);
|
||||
@@ -88,6 +97,29 @@ export class LMStudioClient implements ILMStudioClient {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Cached variant of `listLoaded()`.
 *
 * Returns a copy of the cached list while it is still fresh; otherwise fetches
 * a new list, caches it for `ttlMs` (measured from the start of the call) and
 * returns a copy of it.
 *
 * Invalidation safety: `load()`, `unload()` and `setBaseUrl()` clear
 * `_loadedCache`. If one of them runs while a fetch started here is still in
 * flight, the fetched list predates the invalidation and must not be cached.
 * An already-expired marker entry is parked in `_loadedCache` for the duration
 * of the fetch; the result is only stored if that exact marker is still in
 * place afterwards.
 *
 * @param ttlMs Freshness window in milliseconds. Defaults to
 *   `LMStudioClient.DEFAULT_LOADED_CACHE_TTL_MS`.
 * @returns A fresh array the caller may mutate freely. Best-effort: resolves to
 *   `[]` (and caches nothing) when `listLoaded()` rejects.
 */
async listLoadedCached(ttlMs: number = LMStudioClient.DEFAULT_LOADED_CACHE_TTL_MS): Promise<string[]> {
  const now = Date.now();
  const cached = this._loadedCache;
  if (cached && cached.expiresAt > now) {
    return cached.value.slice();
  }

  // expiresAt = 0 is always in the past, so the marker can never be served as a hit.
  const pending: { value: string[]; expiresAt: number } = { value: [], expiresAt: 0 };
  this._loadedCache = pending;

  try {
    const value = await this.listLoaded();
    // Skip the write if the cache was invalidated (or claimed by a newer call)
    // while we were waiting; the snapshot may already be stale.
    if (this._loadedCache === pending) {
      this._loadedCache = { value, expiresAt: now + ttlMs };
    }
    return value.slice();
  } catch {
    // Deliberate best-effort: callers get an empty list instead of an error.
    if (this._loadedCache === pending) {
      this._loadedCache = undefined;
    }
    return [];
  }
}
|
||||
|
||||
/**
 * Look up the SDK handle for `modelKey` via `llm.model()`.
 *
 * @param modelKey Key of the model to resolve.
 * @returns The handle produced by the SDK.
 * @throws LMStudioLifecycleError wrapping whatever the SDK lookup threw; the
 *   original error is passed along as the second constructor argument.
 */
async getModelHandle(modelKey: string): Promise<LLM> {
  let handle: LLM;
  try {
    handle = await this.getSdk().llm.model(modelKey);
  } catch (cause: any) {
    // Prefer the error's own message; fall back to stringifying non-Error throwables.
    const detail = cause?.message ?? String(cause);
    throw new LMStudioLifecycleError(`Failed to acquire LM Studio model handle "${modelKey}": ${detail}`, cause);
  }
  return handle;
}
|
||||
|
||||
async isReachable(): Promise<boolean> {
|
||||
try {
|
||||
await this.getSdk().llm.listLoaded();
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import type { ILMStudioClient } from './client';
|
||||
import type { IActivityTracker } from './activityTracker';
|
||||
import type { EngineKind } from '../utils';
|
||||
import type { ISystemSpecsProvider, IModelMemoryEstimator } from '../system/specs';
|
||||
import { logError, logInfo } from '../utils';
|
||||
|
||||
export type LifecycleState = 'idle' | 'loading' | 'loaded' | 'streaming' | 'unloading';
|
||||
@@ -19,6 +20,15 @@ export interface LifecycleManagerDeps {
|
||||
switchDebounceMs?: number;
|
||||
/** Initial engine. Default 'lmstudio'. */
|
||||
initialEngine?: EngineKind;
|
||||
/**
|
||||
* Optional pre-load memory budget check. When both are provided, a warn-only
|
||||
* advisory is emitted via `notifyError` (and a structured log line) before
|
||||
* attempting to load a model that the heuristic predicts will not fit.
|
||||
* The load is **not** blocked — the user may have a quantization the
|
||||
* estimator does not recognize.
|
||||
*/
|
||||
systemSpecs?: ISystemSpecsProvider;
|
||||
memoryEstimator?: IModelMemoryEstimator;
|
||||
}
|
||||
|
||||
export class ModelLifecycleManager {
|
||||
@@ -207,6 +217,38 @@ export class ModelLifecycleManager {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Advisory-only memory check run before a model load.
 *
 * When both `systemSpecs` and `memoryEstimator` were supplied in `deps`, the
 * estimated footprint of `modelKey` is compared with the safe model budget.
 * If the estimate is larger, a structured log line is written and the message
 * is handed to `deps.notifyError` (when present). Nothing is returned and
 * nothing is thrown: the heuristic can be wrong (unrecognized quantization,
 * sparse / MoE models), so the caller always goes on to attempt the load.
 *
 * @param modelKey Key of the model about to be loaded.
 */
private checkMemoryBudget(modelKey: string): void {
  const { systemSpecs, memoryEstimator } = this.deps;
  if (!(systemSpecs && memoryEstimator)) return;

  try {
    const specs = systemSpecs.get();
    const requiredGB = memoryEstimator.estimate(modelKey);

    // Written as a negated `>` so a NaN estimate falls through silently,
    // exactly as a plain `if (requiredGB > budget)` would.
    if (!(requiredGB > specs.safeModelBudgetGB)) return;

    const msg = `Model "${modelKey}" estimated at ~${requiredGB.toFixed(1)}GB exceeds your safe RAM budget of ${specs.safeModelBudgetGB}GB. If load fails, try a smaller quantization (q4 / q5).`;

    const details = {
      model: modelKey,
      requiredGB: Number(requiredGB.toFixed(2)),
      budgetGB: specs.safeModelBudgetGB,
      totalRamGB: Number(specs.totalRamGB.toFixed(2)),
    };
    logInfo('LM Studio pre-load memory advisory.', details);

    this.deps.notifyError?.(msg);
  } catch (e: any) {
    // Diagnostic-only; a failing advisory must never stand in the way of a load.
    logError('Memory budget check failed.', { error: e?.message ?? String(e) });
  }
}
|
||||
|
||||
private async doSwitch(modelKey: string): Promise<void> {
|
||||
if (this.disposed) return;
|
||||
if (this.engine !== 'lmstudio') return;
|
||||
@@ -225,6 +267,8 @@ export class ModelLifecycleManager {
|
||||
this.currentModel = null;
|
||||
}
|
||||
|
||||
this.checkMemoryBudget(modelKey);
|
||||
|
||||
this.state = 'loading';
|
||||
this.currentModel = modelKey;
|
||||
const ac = new AbortController();
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
import type { ILMStudioClient } from './client';
|
||||
import { LMStudioLifecycleError } from './client';
|
||||
import { logError, logInfo } from '../utils';
|
||||
|
||||
/** One entry of the conversation passed to a chat stream. */
export interface ChatStreamMessage {
  /** Speaker of this message. */
  role: 'user' | 'assistant' | 'system';

  /** Text of the message. */
  content: string;
}
|
||||
|
||||
/** Everything a chat streamer needs to run one completion. */
export interface ChatStreamRequest {
  /** Name of the model to run the completion on. */
  modelName: string;

  /** Conversation to send to the model. */
  messages: ChatStreamMessage[];

  /** Sampling temperature for the completion. */
  temperature: number;

  /** Optional cap on generated tokens; the streamer supplies a fallback when omitted. */
  maxTokens?: number;

  /** Optional abort signal used to cancel the stream. */
  signal?: AbortSignal;
}
|
||||
|
||||
/** Abstraction over a backend that can stream a chat completion token by token. */
export interface IChatStreamer {
  /**
   * Token-level streaming for an LM Studio chat completion via the WebSocket SDK.
   *
   * @param req Model, conversation and sampling options for this completion.
   * @returns An async iterable producing one `{ token }` object per streamed piece of text.
   */
  stream(req: ChatStreamRequest): AsyncIterable<{ token: string }>;
}
|
||||
|
||||
/**
 * Adapter that streams LM Studio chat completions via @lmstudio/sdk's `model.respond()`,
 * replacing the manual fetch + SSE parser path used for the OpenAI-compatible REST endpoint.
 *
 * Benefits over the REST path:
 *  - No SSE parsing (no `data: [DONE]` / partial-chunk fragility).
 *  - Reuses the same WebSocket the lifecycle manager already opened — handle lookup is cheap
 *    if the model is already loaded, and load-on-first-use is implicit when it isn't.
 *  - First-class `signal` support for user-cancel and abort propagation.
 *
 * Cancellation: a request whose `signal` is already aborted ends the stream immediately,
 * without acquiring a model handle (which could implicitly load the model) and without
 * starting a prediction. Aborts that happen mid-stream end the iteration silently.
 */
export class LMStudioStreamer implements IChatStreamer {
  /**
   * @param client Client used to resolve model handles.
   * @param defaultMaxTokens Token cap applied when a request does not set `maxTokens`.
   */
  constructor(
    private readonly client: ILMStudioClient,
    private readonly defaultMaxTokens: number = 4096,
  ) {}

  /**
   * Stream one chat completion.
   *
   * @param req Model name, messages, temperature, optional token cap and abort signal.
   * @returns Async iterable of `{ token }` objects; empty fragments are skipped.
   * @throws LMStudioLifecycleError when `req.modelName` is empty or whitespace-only.
   *   Errors from handle acquisition propagate unchanged; errors raised while
   *   streaming are logged and re-thrown unless the request was aborted.
   */
  async *stream(req: ChatStreamRequest): AsyncIterable<{ token: string }> {
    const trimmedModel = (req.modelName || '').trim();
    if (!trimmedModel) {
      throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.');
    }

    // Already cancelled: do not touch the backend at all.
    if (req.signal?.aborted) return;

    const model = await this.client.getModelHandle(trimmedModel);

    // The caller may have cancelled while the handle was being resolved.
    if (req.signal?.aborted) return;

    logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length });

    // NOTE(review): the `any` cast bypasses the SDK's typings for `respond()`;
    // confirm whether `LLM.respond` accepts this message shape directly so it can be dropped.
    const prediction = (model as any).respond(req.messages, {
      temperature: req.temperature,
      maxTokens: req.maxTokens ?? this.defaultMaxTokens,
      signal: req.signal,
    });

    try {
      for await (const fragment of prediction as AsyncIterable<{ content: string }>) {
        if (req.signal?.aborted) return;
        const token = fragment?.content ?? '';
        if (token) yield { token };
      }
    } catch (err: any) {
      // Cancellation is an expected way for the stream to end, not a failure.
      if (req.signal?.aborted) return;
      if (err?.name === 'AbortError') return;
      logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: err?.message ?? String(err) });
      throw err;
    }
  }
}
|
||||
Reference in New Issue
Block a user