import type { ILMStudioClient, LMStudioLoadConfig } from './client';
import type { IActivityTracker } from './activityTracker';
import type { EngineKind } from '../utils';
import type { ISystemSpecsProvider, IModelMemoryEstimator } from '../system/specs';
import { logError, logInfo } from '../utils';

/** Phases the lifecycle manager moves through while tracking a model. */
export type LifecycleState = 'idle' | 'loading' | 'loaded' | 'streaming' | 'unloading';

export interface LifecycleConfig {
  /** Idle period before a loaded model is auto-ejected. A non-finite or non-positive value disables the idle timer. */
  idleTimeoutMs: number;
  /** When false, `onModelSelected` does nothing. */
  autoLoadOnSelect: boolean;
  /** Forwarded to `llm.load()` config field. Omit to use engine defaults. */
  loadConfig?: LMStudioLoadConfig;
  /** When set, the lifecycle manager pre-warms this draft model after every successful load. */
  draftModel?: string;
}

export interface LifecycleManagerDeps {
  client: ILMStudioClient;
  activity: IActivityTracker;
  getConfig: () => LifecycleConfig;
  notifyError?: (msg: string) => void;
  /** Debounce window for rapid model switches. Default 300ms. Use 0 in tests for synchronous behavior. */
  switchDebounceMs?: number;
  /** Initial engine. Default 'lmstudio'. */
  initialEngine?: EngineKind;
  /**
   * Optional pre-load memory budget check. When both are provided, a warn-only
   * advisory is emitted via `notifyError` (and a structured log line) before
   * attempting to load a model that the heuristic predicts will not fit.
   * The load is **not** blocked — the user may have a quantization the
   * estimator does not recognize.
   */
  systemSpecs?: ISystemSpecsProvider;
  memoryEstimator?: IModelMemoryEstimator;
}

/** Loaded models whose key matches this pattern are never unloaded by a switch. */
const EMBEDDING_MODEL_PATTERN = /embed/i;

/**
 * Extracts a printable message from an arbitrary thrown value: the value's
 * `message` property when it has one, otherwise its string conversion.
 */
function describeError(e: unknown): string {
  const message = (e as { message?: unknown } | null | undefined)?.message;
  return message != null ? String(message) : String(e);
}

/**
 * Tracks which model the extension has asked LM Studio to load and drives
 * load / unload / idle-eject transitions in response to model selection,
 * stream start/end, activity events and engine changes.
 *
 * Every public entry point is a no-op unless the active engine is 'lmstudio'
 * and the manager has not been disposed.
 */
export class ModelLifecycleManager {
  private state: LifecycleState = 'idle';
  private currentModel: string | null = null;
  private pendingModel: string | null = null;
  private engine: EngineKind;
  private idleTimer: ReturnType<typeof setTimeout> | undefined;
  private switchDebounce: ReturnType<typeof setTimeout> | undefined;
  private loadAbort: AbortController | undefined;
  /**
   * Incremented whenever a switch starts or in-flight work is invalidated
   * (engine change, dispose). Async continuations compare their captured value
   * against it to detect that they have been superseded.
   */
  private switchSeq = 0;
  private readonly activitySub: { dispose(): void };
  private disposed = false;

  constructor(private readonly deps: LifecycleManagerDeps) {
    this.engine = deps.initialEngine ?? 'lmstudio';
    this.activitySub = deps.activity.onActivity(() => this.onActivity());
  }

  /**
   * Records the active engine. Leaving 'lmstudio' cancels timers and any
   * in-flight load and resets tracking to idle; no unload request is issued.
   */
  setEngine(engine: EngineKind): void {
    if (engine === this.engine) return;
    const wasLmStudio = this.engine === 'lmstudio';
    this.engine = engine;
    if (wasLmStudio && engine !== 'lmstudio') {
      // Invalidate a switch that is still awaiting its unload phase, even if
      // the engine is flipped back to 'lmstudio' before it resumes.
      this.switchSeq++;
      this.clearIdleTimer();
      this.cancelPendingSwitch();
      this.cancelLoad();
      this.state = 'idle';
      this.currentModel = null;
      this.pendingModel = null;
    }
  }

  /**
   * Reacts to the user selecting a model. Selections made mid-stream are
   * queued and applied on `onStreamEnd`; other selections are debounced by
   * `switchDebounceMs` (default 300) before the switch starts.
   */
  onModelSelected(modelKey: string): void {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;
    if (!this.deps.getConfig().autoLoadOnSelect) return;
    const trimmed = (modelKey || '').trim();
    if (!trimmed) return;

    // Mid-stream: queue the latest selection, apply on streamEnd.
    if (this.state === 'streaming') {
      this.pendingModel = trimmed;
      return;
    }

    // Same model already in flight or active — keep timer fresh, no reload.
    if ((this.state === 'loaded' || this.state === 'loading') && this.currentModel === trimmed) {
      if (this.state === 'loaded') this.resetIdleTimer();
      return;
    }

    this.cancelPendingSwitch();
    const delay = this.deps.switchDebounceMs ?? 300;
    if (delay <= 0) {
      void this.doSwitch(trimmed);
      return;
    }
    this.switchDebounce = setTimeout(() => {
      this.switchDebounce = undefined;
      void this.doSwitch(trimmed);
    }, delay);
  }

  /** Suspends the idle timer and, if a model is loaded, marks it as streaming. */
  onStreamStart(): void {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;
    this.clearIdleTimer();
    if (this.state === 'loaded') this.state = 'streaming';
  }

  /**
   * Ends the streaming phase. Applies a selection queued during the stream if
   * it differs from the current model; otherwise restarts the idle timer.
   */
  onStreamEnd(): void {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;
    if (this.state === 'streaming') {
      this.state = 'loaded';
      if (this.pendingModel && this.pendingModel !== this.currentModel) {
        const next = this.pendingModel;
        this.pendingModel = null;
        void this.doSwitch(next);
      } else {
        this.pendingModel = null;
        this.resetIdleTimer();
      }
    }
  }

  /**
   * Best-effort eject before extension shutdown. Bounded by timeoutMs.
   *
   * @param timeoutMs Maximum time to wait for the unload call. Default 2000.
   * @returns Resolves once tracking has been reset; unload failures and
   *   timeouts are logged, never thrown.
   */
  async disposeAndUnload(timeoutMs: number = 2000): Promise<void> {
    if (this.disposed) return;
    this.disposed = true;
    this.switchSeq++; // invalidate any switch still in its unload phase
    this.clearIdleTimer();
    this.cancelPendingSwitch();
    this.cancelLoad();
    this.activitySub.dispose();

    const shouldUnload =
      this.engine === 'lmstudio' &&
      (this.state === 'loaded' || this.state === 'streaming') &&
      this.currentModel !== null;
    if (!shouldUnload) {
      this.state = 'idle';
      this.currentModel = null;
      return;
    }

    const target = this.currentModel as string;
    this.state = 'unloading';
    let timer: ReturnType<typeof setTimeout> | undefined;
    try {
      await Promise.race([
        this.deps.client.unload(target),
        new Promise<never>((_, reject) => {
          timer = setTimeout(
            () => reject(new Error(`unload timed out after ${timeoutMs}ms`)),
            timeoutMs
          );
        }),
      ]);
    } catch (e: unknown) {
      logError('LM Studio unload during dispose failed.', { model: target, error: describeError(e) });
    } finally {
      // Without this the timeout stays pending for up to timeoutMs after the
      // unload has already settled.
      if (timer !== undefined) clearTimeout(timer);
    }
    this.state = 'idle';
    this.currentModel = null;
  }

  /** vscode.Disposable shape — fire and forget. */
  dispose(): void {
    void this.disposeAndUnload();
  }

  // Test/inspection helpers
  public _getState(): LifecycleState {
    return this.state;
  }

  public _getCurrentModel(): string | null {
    return this.currentModel;
  }

  public _hasIdleTimer(): boolean {
    return this.idleTimer !== undefined;
  }

  // ---------- internals ----------

  /** Activity callback: restarts the idle countdown while a model is loaded. */
  private onActivity(): void {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;
    if (this.state !== 'loaded') return;
    this.resetIdleTimer();
  }

  private clearIdleTimer(): void {
    if (this.idleTimer) {
      clearTimeout(this.idleTimer);
      this.idleTimer = undefined;
    }
  }

  private cancelPendingSwitch(): void {
    if (this.switchDebounce) {
      clearTimeout(this.switchDebounce);
      this.switchDebounce = undefined;
    }
  }

  /** Restarts the idle countdown; a non-finite or non-positive timeout leaves it disabled. */
  private resetIdleTimer(): void {
    this.clearIdleTimer();
    const ms = this.deps.getConfig().idleTimeoutMs;
    if (!Number.isFinite(ms) || ms <= 0) return;
    this.idleTimer = setTimeout(() => {
      this.idleTimer = undefined;
      void this.doIdleEject();
    }, ms);
  }

  /** Unloads the current model after the idle timeout elapsed. */
  private async doIdleEject(): Promise<void> {
    if (this.state !== 'loaded' || !this.currentModel) return;
    const target = this.currentModel;
    const seq = this.switchSeq;
    this.state = 'unloading';
    try {
      await this.deps.client.unload(target);
      logInfo('LM Studio model auto-ejected after idle.', { model: target });
    } catch (e: unknown) {
      logError('LM Studio auto-eject failed.', { model: target, error: describeError(e) });
      this.deps.notifyError?.(`LM Studio auto-eject failed: ${describeError(e)}`);
    }
    // A switch, engine change or dispose that happened while we were awaiting
    // owns the state now; overwriting it here would leave a loading/loaded
    // model with currentModel === null.
    if (seq !== this.switchSeq) return;
    this.state = 'idle';
    this.currentModel = null;
  }

  private cancelLoad(): void {
    if (this.loadAbort) {
      try {
        this.loadAbort.abort();
      } catch {
        /* noop */
      }
      this.loadAbort = undefined;
    }
  }

  /**
   * Warn-only RAM budget check. If the heuristic estimator says the model is
   * unlikely to fit, surface a non-blocking advisory and log it. The load
   * still proceeds — the heuristic can be wrong (unrecognized quantization,
   * sparse / MoE models) and the user may have explicit intent.
   */
  private checkMemoryBudget(modelKey: string): void {
    const specsProvider = this.deps.systemSpecs;
    const estimator = this.deps.memoryEstimator;
    if (!specsProvider || !estimator) return;
    try {
      const specs = specsProvider.get();
      const requiredGB = estimator.estimate(modelKey);
      if (requiredGB > specs.safeModelBudgetGB) {
        const msg =
          `Model "${modelKey}" estimated at ~${requiredGB.toFixed(1)}GB ` +
          `exceeds your safe RAM budget of ${specs.safeModelBudgetGB}GB. ` +
          `If load fails, try a smaller quantization (q4 / q5).`;
        logInfo('LM Studio pre-load memory advisory.', {
          model: modelKey,
          requiredGB: Number(requiredGB.toFixed(2)),
          budgetGB: specs.safeModelBudgetGB,
          totalRamGB: Number(specs.totalRamGB.toFixed(2)),
        });
        this.deps.notifyError?.(msg);
      }
    } catch (e: unknown) {
      // Diagnostic-only; never block a load on advisory failures.
      logError('Memory budget check failed.', { error: describeError(e) });
    }
  }

  /** True while the switch identified by `seq` is still the latest one and the manager is live. */
  private isSwitchCurrent(seq: number): boolean {
    return !this.disposed && this.engine === 'lmstudio' && seq === this.switchSeq;
  }

  /**
   * Fire-and-forget pre-warm of the draft model. Failures are logged and never
   * affect the state of the main model, which has already loaded.
   */
  private prewarmDraftModel(draftModel: string | undefined): void {
    const client = this.deps.client;
    if (!draftModel || !client.preloadDraftModel) return;
    try {
      void Promise.resolve(client.preloadDraftModel(draftModel)).catch((e: unknown) => {
        logError('LM Studio draft model pre-warm failed.', { model: draftModel, error: describeError(e) });
      });
    } catch (e: unknown) {
      logError('LM Studio draft model pre-warm failed.', { model: draftModel, error: describeError(e) });
    }
  }

  /**
   * Switches to `modelKey`: unloads other loaded models, then loads the target.
   * Bails out without touching state as soon as it detects that a newer
   * switch, an engine change or a dispose has superseded it.
   */
  private async doSwitch(modelKey: string): Promise<void> {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;

    const seq = ++this.switchSeq;
    this.cancelLoad();
    this.clearIdleTimer();

    // ── 1) Unload every loaded LLM other than the target (reclaim VRAM) ────
    // Not only the currentModel tracked here, but also models that are up in
    // LM Studio from a manual load, JIT, or a previous session. (Example: with
    // a 26b model loaded, switching to 12b must unload 26b to free VRAM so
    // that 12b can load.)
    // Protected from unloading: (1) the target model, (2) the configured draft
    // model (speculative decoding), (3) embedding models (search depends on
    // them).
    // The load proceeds even on failure — one failed unload must not block
    // the whole switch.
    this.state = 'unloading';
    const cfg0 = this.deps.getConfig();
    const keep = new Set([modelKey, cfg0.draftModel].filter((m): m is string => !!m));
    try {
      const loaded = await this.deps.client.listLoaded();
      if (!this.isSwitchCurrent(seq)) return;
      for (const m of loaded) {
        if (keep.has(m)) continue;
        if (EMBEDDING_MODEL_PATTERN.test(m)) continue; // protect embedding models
        try {
          await this.deps.client.unload(m);
          logInfo('LM Studio: 전환 전 다른 모델 언로드 (VRAM 회수).', { unloaded: m, target: modelKey });
        } catch (e: unknown) {
          logError('LM Studio unload before switch failed — 계속 진행.', { model: m, error: describeError(e) });
        }
        if (!this.isSwitchCurrent(seq)) return;
      }
    } catch (e: unknown) {
      // listLoaded failed: at least unload the tracked currentModel (previous behavior).
      logError('listLoaded failed before switch — tracked currentModel 만 언로드 시도.', { error: describeError(e) });
      if (!this.isSwitchCurrent(seq)) return;
      const tracked = this.currentModel;
      if (tracked && tracked !== modelKey) {
        try {
          await this.deps.client.unload(tracked);
        } catch (inner: unknown) {
          logError('LM Studio fallback unload of tracked model failed.', {
            model: tracked,
            error: describeError(inner),
          });
        }
        if (!this.isSwitchCurrent(seq)) return;
      }
    }
    this.currentModel = null;

    this.checkMemoryBudget(modelKey);

    // ── 2) Load the new model ──────────────────────────────────────────────
    this.state = 'loading';
    this.currentModel = modelKey;
    const ac = new AbortController();
    this.loadAbort = ac;
    try {
      const cfg = this.deps.getConfig();
      await this.deps.client.load(modelKey, ac.signal, cfg.loadConfig);
      if (this.loadAbort !== ac) return; // superseded by a newer switch
      this.loadAbort = undefined;
      this.state = 'loaded';
      this.resetIdleTimer();
      // Pre-warm the draft model so the first speculative prediction doesn't pay a cold-load cost.
      this.prewarmDraftModel(cfg.draftModel);
    } catch (e: unknown) {
      if (ac.signal.aborted) {
        // Whoever aborted us (a newer switch, engine change or dispose)
        // decides the state from here on. Leave it untouched.
        return;
      }
      logError('LM Studio model load failed.', { model: modelKey, error: describeError(e) });
      this.deps.notifyError?.(`LM Studio load failed: ${describeError(e)}`);
      if (this.loadAbort === ac) this.loadAbort = undefined;
      // Load failed → return to a clean state with no model tracked, and clear
      // currentModel so the next call can retry the same model.
      this.state = 'idle';
      this.currentModel = null;
    }
  }
}