76d5fedfb5
큰 입력 시 "Failed to acquire LM Studio model handle … Operation canceled" 로 턴 전체가 죽던 문제를 3계층으로 해결. 일반 채팅(코어 경로)은 그동안 단일 예산 호출이라 약한 모델·큰 입력에서 무너졌다 — 그 갭을 메움. - 핸들 race 수정: getModelHandle 을 재시도 루프 안으로 이동. 취소/죽은-핸들 류 에러는 SDK 재생성 후 1회 자동 재시도(실제 사용자 취소는 존중). 라이프 사이클의 동시 로드가 abort 되며 SDK 가 coalesce 한 JIT 조회까지 죽던 것. - Phase 1 실제 창 정렬: llm.getContextLength()(캐시)로 실측 창에 예산 클램프. 설정값보다 작은 창으로 로드된 경우 서버 truncation/빈 답변 차단. 배지에 표시. - Phase 2 코어 Map-Reduce: 단일 입력이 (유효 창 × ratio) 초과 시 청크→질의 인지형 추출→통합. 부분/전체 폴백, 무관 시 정직 신호. 동시성 기본 2. - Phase 3 메타 노출: 진행/결과 배지 표시, [조각 k] 출처 옵트인. 신규 설정 5종. /meet·/review 전용 경로는 불변. 테스트 +25건, 전체 684 통과. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
301 lines
13 KiB
TypeScript
301 lines
13 KiB
TypeScript
import { LMStudioClient as SDKClient, LLM, type LLMLoadModelConfig } from '@lmstudio/sdk';
|
|
import { logError, logInfo } from '../utils';
|
|
|
|
/**
 * Load-time options forwarded to LM Studio's `llm.load()`. Subset of `LLMLoadModelConfig`.
 *
 * Every field is optional. `LMStudioClient` copies a field into the SDK config only
 * when it is actually set, so an empty object results in no `config` being sent.
 */
export interface LMStudioLoadConfig {
    /** Forwarded as-is when it is a boolean. */
    flashAttention?: boolean;
    /** "max" | "off" | number 0-1 */
    gpuOffloadRatio?: 'max' | 'off' | number;
    /** Forwarded as-is when it is a boolean. */
    offloadKVCacheToGpu?: boolean;
    /** Forwarded as-is when it is a boolean. */
    keepModelInMemory?: boolean;
    /** Forwarded as-is when it is a boolean. */
    useFp16ForKVCache?: boolean;
    /** 0 / undefined = engine default */
    evalBatchSize?: number;
}
|
|
|
|
/**
 * Contract for the LM Studio model-lifecycle client: loading and unloading
 * models, listing them, resolving chat handles and probing reachability.
 */
export interface ILMStudioClient {
    /**
     * Load `modelKey` into LM Studio.
     *
     * @param modelKey   Identifier accepted by `llm.load()`.
     * @param signal     Optional abort signal forwarded to the SDK load call.
     * @param loadConfig Optional load-time options; unset fields are not sent.
     * @throws LMStudioLifecycleError (in `LMStudioClient`) when the SDK load fails.
     */
    load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void>;

    /**
     * Unload `modelKey` from LM Studio.
     *
     * @throws LMStudioLifecycleError (in `LMStudioClient`) when the SDK unload fails.
     */
    unload(modelKey: string): Promise<void>;

    /**
     * Identifiers of the models that are currently loaded.
     *
     * @throws LMStudioLifecycleError (in `LMStudioClient`) when the SDK call fails.
     */
    listLoaded(): Promise<string[]>;

    /** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */
    listLoadedCached(ttlMs?: number): Promise<string[]>;

    /**
     * List every LLM the user has downloaded into LM Studio, regardless of
     * whether it is currently loaded. Returns the SDK `modelKey` of each entry —
     * the exact identifier `llm.load()` accepts. Use this for the dropdown so
     * the list does not depend on LM Studio's JIT setting (REST `/v1/models`
     * only returns loaded models when JIT is off).
     */
    listDownloaded(): Promise<string[]>;

    /** Cached variant; the downloaded list only changes when the user installs/removes a model. */
    listDownloadedCached(ttlMs?: number): Promise<string[]>;

    /** Pre-warm a draft model for speculative decoding. Idempotent + best-effort. */
    preloadDraftModel?(draftModelKey: string): Promise<void>;

    /**
     * Resolve a chat-ready handle for an already-loaded (or just-loaded) model.
     *
     * `options.refresh: true` drops the SDK + WebSocket so any disposed handle
     * sitting in the SDK's internal handle map is discarded. Use this after a
     * "Model is disposed!" or "lock() request could not be registered" error.
     */
    getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM>;

    /**
     * The model's *actually-loaded* context window in tokens (LM Studio's
     * `llm.getContextLength()`), or `undefined` if it can't be determined.
     *
     * The user-facing `g1nation.contextLength` setting is only a budgeting
     * intent — the real ceiling is whatever window the model was loaded with.
     * Budgeting against the larger of the two silently overflows the server,
     * which then truncates the prompt or emits EOS as the first token (empty
     * answer). Cached per-key because it only changes on reload.
     */
    getModelContextLength(modelKey: string): Promise<number | undefined>;

    /** Resolves to `true` when LM Studio answers a request, `false` otherwise; never rejects in `LMStudioClient`. */
    isReachable(): Promise<boolean>;

    /**
     * Point the client at a different LM Studio server.
     *
     * @param httpBaseUrl HTTP(S) (or WS(S)) base URL of the server.
     */
    setBaseUrl(httpBaseUrl: string): void;
}
|
|
|
|
/**
 * Error raised when an LM Studio lifecycle operation (load, unload, list,
 * handle acquisition) fails.
 *
 * `cause` carries the original value thrown by the SDK so callers can inspect
 * it; the human-readable summary lives in `message`.
 */
export class LMStudioLifecycleError extends Error {
    /**
     * @param message Description of the failed operation.
     * @param cause   The underlying thrown value, if any.
     */
    constructor(message: string, public readonly cause?: unknown) {
        super(message);
        // Override the default "Error" so logs and `name` checks identify this class.
        this.name = 'LMStudioLifecycleError';
    }
}
|
|
|
|
/**
 * Convert an LM Studio base URL into the WebSocket root URL the SDK connects to.
 *
 * - `http:` becomes `ws:` and `https:` becomes `wss:`; `ws:` / `wss:` pass
 *   through unchanged; any other scheme is rejected.
 * - REST-only path suffixes (`/api/v0`, `/api/v1`, `/v1`, `/api`) are stripped
 *   repeatedly until none remains.
 * - Trailing slashes are ignored while matching suffixes, so
 *   `http://host:1234/v1/` normalizes the same way as `http://host:1234/v1`.
 *
 * @param httpBaseUrl Base URL as configured by the user; surrounding whitespace is ignored.
 * @returns The normalized URL without a trailing slash, or `undefined` when the
 *          input is blank, cannot be parsed, or uses an unsupported scheme.
 */
export function httpToWebSocketUrl(httpBaseUrl: string): string | undefined {
    const trimmed = (httpBaseUrl || '').trim();
    if (!trimmed) return undefined;
    try {
        const url = new URL(trimmed);
        if (url.protocol === 'http:') url.protocol = 'ws:';
        else if (url.protocol === 'https:') url.protocol = 'wss:';
        else if (url.protocol !== 'ws:' && url.protocol !== 'wss:') return undefined;

        // Strip every REST-only path suffix LM Studio ships with so the SDK lands on the
        // WebSocket root. Loop because /api/v0 → /api → '' should fully unwind.
        const REST_SUFFIXES = ['/api/v0', '/api/v1', '/v1', '/api'];
        const stripTrailingSlashes = (p: string): string => p.replace(/\/+$/, '');

        // Match against the path without trailing slashes; otherwise "/v1/" would
        // never satisfy endsWith('/v1') and the REST suffix would leak into the result.
        let path = stripTrailingSlashes(url.pathname);
        let strippedAny = false;
        for (;;) {
            const suffix = REST_SUFFIXES.find((s) => path.endsWith(s));
            if (suffix === undefined) break;
            path = stripTrailingSlashes(path.slice(0, -suffix.length));
            strippedAny = true;
        }
        // Only rewrite the pathname when a suffix was actually removed, so URLs
        // without a REST suffix serialize exactly as before.
        if (strippedAny) url.pathname = path;

        return url.toString().replace(/\/+$/, '');
    } catch {
        // `new URL` throws on unparsable input.
        return undefined;
    }
}
|
|
|
|
/**
 * `ILMStudioClient` implementation backed by the `@lmstudio/sdk` client.
 *
 * The SDK client is created lazily and dropped whenever the target URL changes
 * or a handle refresh is requested. Loaded-model, downloaded-model and
 * context-length lookups are memoized with short TTLs.
 */
export class LMStudioClient implements ILMStudioClient {
    /** Lazily created SDK client; `undefined` until first use or after a reset. */
    private _sdk: SDKClient | undefined;
    /** WebSocket URL derived from the configured base URL; `undefined` lets the SDK pick its default. */
    private _wsUrl: string | undefined;
    private _loadedCache: { value: string[]; expiresAt: number } | undefined;
    private _downloadedCache: { value: string[]; expiresAt: number } | undefined;
    /** Context-window cache keyed by trimmed model key. */
    private _contextLengthCache = new Map<string, { value: number; expiresAt: number }>();
    private static readonly DEFAULT_LOADED_CACHE_TTL_MS = 5000;
    private static readonly DEFAULT_DOWNLOADED_CACHE_TTL_MS = 60_000;
    private static readonly DEFAULT_CONTEXT_LENGTH_CACHE_TTL_MS = 60_000;

    /**
     * @param httpBaseUrl Base URL of the LM Studio server (HTTP(S) or WS(S)).
     */
    constructor(httpBaseUrl: string) {
        this.setBaseUrl(httpBaseUrl);
    }

    /**
     * Point the client at a different server. When the derived WebSocket URL
     * changes, the SDK client and every cache are discarded; otherwise this is a no-op.
     */
    setBaseUrl(httpBaseUrl: string): void {
        const ws = httpToWebSocketUrl(httpBaseUrl);
        if (ws !== this._wsUrl) {
            this._wsUrl = ws;
            this._sdk = undefined;
            this._loadedCache = undefined;
            this._downloadedCache = undefined;
            this._contextLengthCache.clear();
        }
    }

    /** Return the SDK client, creating it on first use. */
    private getSdk(): SDKClient {
        if (!this._sdk) {
            this._sdk = new SDKClient(this._wsUrl ? { baseUrl: this._wsUrl } : {});
        }
        return this._sdk;
    }

    /** Extract a printable message from an arbitrary thrown value. */
    private static _errorMessage(e: unknown): string {
        const message = (e as { message?: unknown } | null | undefined)?.message;
        return message == null ? String(e) : String(message);
    }

    /** Drop the cached context window for `modelKey` (same trimmed key the cache uses). */
    private _forgetContextLength(modelKey: string): void {
        this._contextLengthCache.delete((modelKey || '').trim());
    }

    /**
     * Load `modelKey` through the SDK.
     *
     * On success the loaded-models cache and this model's cached context
     * window are invalidated, since a (re)load can change both.
     *
     * @param modelKey   Identifier accepted by `llm.load()`.
     * @param signal     Optional abort signal forwarded to the SDK.
     * @param loadConfig Optional load-time options; only set fields are sent.
     * @throws LMStudioLifecycleError wrapping whatever the SDK threw.
     */
    async load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void> {
        try {
            const opts: { signal?: AbortSignal; config?: LLMLoadModelConfig } = {};
            if (signal) opts.signal = signal;
            const config = this._buildLoadConfig(loadConfig);
            if (Object.keys(config).length > 0) opts.config = config;
            // Pass `undefined` rather than an empty options object when nothing was set.
            await this.getSdk().llm.load(modelKey, Object.keys(opts).length > 0 ? opts : undefined);
            this._loadedCache = undefined;
            // The context window is fixed at load time, so a stale entry must not survive a reload.
            this._forgetContextLength(modelKey);
            // Loading does not change the downloaded-models set; leave _downloadedCache alone.
            logInfo('LM Studio model loaded.', { modelKey, configKeys: Object.keys(config) });
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            throw new LMStudioLifecycleError(`Failed to load LM Studio model "${modelKey}": ${msg}`, e);
        }
    }

    /** Translate our flat LMStudioLoadConfig into LM Studio's nested LLMLoadModelConfig shape. */
    private _buildLoadConfig(lc: LMStudioLoadConfig | undefined): LLMLoadModelConfig {
        const out: LLMLoadModelConfig = {};
        if (!lc) return out;
        if (typeof lc.flashAttention === 'boolean') out.flashAttention = lc.flashAttention;
        if (typeof lc.offloadKVCacheToGpu === 'boolean') out.offloadKVCacheToGpu = lc.offloadKVCacheToGpu;
        if (typeof lc.keepModelInMemory === 'boolean') out.keepModelInMemory = lc.keepModelInMemory;
        if (typeof lc.useFp16ForKVCache === 'boolean') out.useFp16ForKVCache = lc.useFp16ForKVCache;
        // 0 (or a non-number) means "engine default", so the field is simply omitted.
        if (typeof lc.evalBatchSize === 'number' && lc.evalBatchSize > 0) out.evalBatchSize = lc.evalBatchSize;
        if (lc.gpuOffloadRatio !== undefined) {
            // GPUSetting is deprecated but still accepted — wraps a single `ratio`.
            out.gpu = { ratio: lc.gpuOffloadRatio as any };
        }
        return out;
    }

    /**
     * Pre-warm a draft model for speculative decoding.
     *
     * Best-effort: a blank key or an SDK without `unstable_preloadDraftModel`
     * is a silent no-op, and failures are logged rather than thrown.
     */
    async preloadDraftModel(draftModelKey: string): Promise<void> {
        const key = (draftModelKey || '').trim();
        if (!key) return;
        try {
            // The preload API is unstable, so feature-detect it instead of relying on the typings.
            const llm: any = this.getSdk().llm;
            if (typeof llm.unstable_preloadDraftModel === 'function') {
                await llm.unstable_preloadDraftModel(key);
                logInfo('LM Studio draft model preloaded.', { draftModelKey: key });
            }
        } catch (e: unknown) {
            // Best-effort — the main model's respond({draftModel}) will still load it lazily.
            logError('LM Studio draft model preload failed.', { draftModelKey: key, error: LMStudioClient._errorMessage(e) });
        }
    }

    /**
     * Unload `modelKey` through the SDK.
     *
     * On success the loaded-models cache and this model's cached context
     * window are invalidated.
     *
     * @throws LMStudioLifecycleError wrapping whatever the SDK threw.
     */
    async unload(modelKey: string): Promise<void> {
        try {
            await this.getSdk().llm.unload(modelKey);
            this._loadedCache = undefined;
            // A later reload may use a different window; do not serve the old value.
            this._forgetContextLength(modelKey);
            logInfo('LM Studio model unloaded.', { modelKey });
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            throw new LMStudioLifecycleError(`Failed to unload LM Studio model "${modelKey}": ${msg}`, e);
        }
    }

    /** Force the next downloaded/loaded-models call to re-fetch (use after install / remove). */
    invalidateCaches(): void {
        this._loadedCache = undefined;
        this._downloadedCache = undefined;
        this._contextLengthCache.clear();
    }

    /**
     * List the currently loaded models, uncached.
     *
     * Each entry is reduced to the first available of `identifier`, `modelKey`
     * or `path`; entries without a non-empty string id are dropped.
     *
     * @throws LMStudioLifecycleError when the SDK call fails.
     */
    async listLoaded(): Promise<string[]> {
        try {
            const items: any[] = await this.getSdk().llm.listLoaded();
            return items
                .map((m) => m?.identifier ?? m?.modelKey ?? m?.path ?? null)
                .filter((id): id is string => typeof id === 'string' && id.length > 0);
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            throw new LMStudioLifecycleError(`Failed to list loaded LM Studio models: ${msg}`, e);
        }
    }

    /**
     * Cached variant of `listLoaded()`.
     *
     * Returns a copy of the cached list while it is fresh. On a miss the list
     * is re-fetched and cached for `ttlMs`. Failures deliberately resolve to
     * `[]` without caching, so the next call retries.
     *
     * @param ttlMs Cache lifetime in milliseconds (default 5000).
     */
    async listLoadedCached(ttlMs: number = LMStudioClient.DEFAULT_LOADED_CACHE_TTL_MS): Promise<string[]> {
        const now = Date.now();
        if (this._loadedCache && this._loadedCache.expiresAt > now) {
            // Hand out a copy so callers cannot mutate the cached array.
            return this._loadedCache.value.slice();
        }
        try {
            const value = await this.listLoaded();
            this._loadedCache = { value, expiresAt: now + ttlMs };
            return value.slice();
        } catch {
            // Intentional best-effort: callers treat "unknown" as "nothing loaded".
            return [];
        }
    }

    /**
     * List the `modelKey` of every downloaded LLM, uncached.
     *
     * Unlike `listLoaded()`, failures are logged and reported as an empty list
     * instead of being thrown.
     */
    async listDownloaded(): Promise<string[]> {
        try {
            const items: any[] = await this.getSdk().system.listDownloadedModels('llm');
            return items
                .map((m) => m?.modelKey ?? null)
                .filter((k): k is string => typeof k === 'string' && k.length > 0);
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            logError('Failed to list downloaded LM Studio models.', { error: msg });
            return [];
        }
    }

    /**
     * Cached variant of `listDownloaded()`.
     *
     * @param ttlMs Cache lifetime in milliseconds (default 60000).
     * @returns A copy of the downloaded-model keys.
     */
    async listDownloadedCached(ttlMs: number = LMStudioClient.DEFAULT_DOWNLOADED_CACHE_TTL_MS): Promise<string[]> {
        const now = Date.now();
        if (this._downloadedCache && this._downloadedCache.expiresAt > now) {
            return this._downloadedCache.value.slice();
        }
        const value = await this.listDownloaded();
        // Only cache non-empty results — an empty array often signals a transient SDK error,
        // and caching that for 60s would hide a freshly-started LM Studio process.
        if (value.length > 0) {
            this._downloadedCache = { value, expiresAt: now + ttlMs };
        }
        return value.slice();
    }

    /**
     * Resolve a chat-ready handle for `modelKey` via `llm.model()`.
     *
     * @param modelKey Model to resolve.
     * @param options  `refresh: true` discards the SDK client (and the
     *                 loaded-models cache) before resolving.
     * @throws LMStudioLifecycleError when the SDK cannot provide a handle.
     */
    async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM> {
        try {
            if (options?.refresh) {
                // Recreate the SDK + WebSocket so the SDK's internal handle
                // cache is dropped. The next llm.model() call mints a fresh
                // handle instead of returning the disposed one from the
                // previous (aborted) prediction.
                this._sdk = undefined;
                this._loadedCache = undefined;
                logInfo('LM Studio SDK handle refresh requested — dropped cached SDK client.', { modelKey });
            }
            return await this.getSdk().llm.model(modelKey);
        } catch (e: unknown) {
            const msg = LMStudioClient._errorMessage(e);
            throw new LMStudioLifecycleError(`Failed to acquire LM Studio model handle "${modelKey}": ${msg}`, e);
        }
    }

    /**
     * Context window (in tokens) the model is actually loaded with.
     *
     * Served from a 60-second per-key cache when possible. Only positive,
     * finite numbers are cached and returned.
     *
     * @param modelKey Model to query; blank keys resolve to `undefined`.
     * @returns The context length, or `undefined` when it cannot be determined
     *          (blank key, handle without `getContextLength`, invalid value, or
     *          any SDK error — which is logged, not thrown).
     */
    async getModelContextLength(modelKey: string): Promise<number | undefined> {
        const key = (modelKey || '').trim();
        if (!key) return undefined;
        const now = Date.now();
        const cached = this._contextLengthCache.get(key);
        if (cached && cached.expiresAt > now) return cached.value;
        try {
            // Reuses the same handle the stream will use. If the model isn't
            // loaded yet this forces a JIT load — acceptable since the very next
            // step streams from it anyway. Best-effort: any failure (incl. the
            // load-coalescing "Operation canceled" race) falls back to undefined
            // so the caller keeps the configured window.
            const handle: any = await this.getSdk().llm.model(key);
            const len = typeof handle?.getContextLength === 'function'
                ? await handle.getContextLength()
                : undefined;
            if (typeof len === 'number' && Number.isFinite(len) && len > 0) {
                this._contextLengthCache.set(key, {
                    value: len,
                    expiresAt: now + LMStudioClient.DEFAULT_CONTEXT_LENGTH_CACHE_TTL_MS,
                });
                return len;
            }
            return undefined;
        } catch (e: unknown) {
            logError('Failed to query LM Studio model context length.', { modelKey: key, error: LMStudioClient._errorMessage(e) });
            return undefined;
        }
    }

    /**
     * Probe the server by issuing a `listLoaded()` call directly on the SDK.
     *
     * @returns `true` when the call succeeds; `false` (after logging) when it throws.
     */
    async isReachable(): Promise<boolean> {
        try {
            await this.getSdk().llm.listLoaded();
            return true;
        } catch (e: unknown) {
            logError('LM Studio not reachable.', { error: LMStudioClient._errorMessage(e) });
            return false;
        }
    }
}
|