Files
connectai/src/lmstudio/lifecycleManager.ts
T
koriweb 6d06311d60 fix(lmstudio): 모델 전환 시 다른 모델 전부 자동 언로드 (v2.2.210)
VRAM 부족으로 12b 등 다른 모델 로드 실패하던 문제 강화.
- lifecycleManager.doSwitch: 추적 중인 currentModel 만이 아니라 listLoaded()
  기반으로 *로드된 모든 LLM* 을 타깃 전 언로드(VRAM 회수). draft 모델·임베딩
  모델은 보호. listLoaded 실패 시 기존 동작(tracked unload)으로 폴백.
- extension.ts: defaultModel 설정 변경(설정 패널/settings.json 포함) 시
  lifecycle.onModelSelected 호출 → 설정 패널 전환도 unload→load 발동.
- 테스트 FakeLMStudioClient 가 실제 로드 상태를 추적하도록 갱신.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-09 11:27:43 +09:00

328 lines
13 KiB
TypeScript

import type { ILMStudioClient, LMStudioLoadConfig } from './client';
import type { IActivityTracker } from './activityTracker';
import type { EngineKind } from '../utils';
import type { ISystemSpecsProvider, IModelMemoryEstimator } from '../system/specs';
import { logError, logInfo } from '../utils';
/**
 * Phases tracked by `ModelLifecycleManager` for the currently managed model.
 * `streaming` is entered from `loaded` while a response is in flight;
 * `unloading` covers both the idle auto-eject and the pre-switch unload phase.
 */
export type LifecycleState =
  | 'idle'
  | 'loading'
  | 'loaded'
  | 'streaming'
  | 'unloading';
/**
 * User-tunable settings, re-read through `LifecycleManagerDeps.getConfig()`
 * each time the manager needs them.
 */
export interface LifecycleConfig {
  /**
   * Idle period (ms) after which the loaded model is auto-ejected.
   * A non-finite or non-positive value disables the idle timer.
   */
  idleTimeoutMs: number;

  /** When false, `onModelSelected` is a no-op and no load is triggered. */
  autoLoadOnSelect: boolean;

  /** Forwarded to `llm.load()` config field. Omit to use engine defaults. */
  loadConfig?: LMStudioLoadConfig;

  /**
   * When set, the lifecycle manager pre-warms this draft model after every
   * successful load, and never unloads it while switching models.
   */
  draftModel?: string;
}
/** Collaborators injected into `ModelLifecycleManager`. */
export interface LifecycleManagerDeps {
  /** LM Studio client used to list, load and unload models. */
  client: ILMStudioClient;

  /** Source of activity events; each event refreshes the idle timer while a model is loaded. */
  activity: IActivityTracker;

  /** Returns the current settings; called on demand rather than cached. */
  getConfig: () => LifecycleConfig;

  /** Optional sink for user-facing messages (load / auto-eject failures, memory advisories). */
  notifyError?: (msg: string) => void;

  /** Debounce window for rapid model switches. Default 300ms. Use 0 in tests for synchronous behavior. */
  switchDebounceMs?: number;

  /** Initial engine. Default 'lmstudio'. */
  initialEngine?: EngineKind;

  /**
   * Optional pre-load memory budget check. When both `systemSpecs` and
   * `memoryEstimator` are provided, a warn-only advisory is emitted via
   * `notifyError` (and a structured log line) before attempting to load a
   * model that the heuristic predicts will not fit.
   * The load is **not** blocked — the user may have a quantization the
   * estimator does not recognize.
   */
  systemSpecs?: ISystemSpecsProvider;

  /** Estimator paired with `systemSpecs`; see the note on `systemSpecs`. */
  memoryEstimator?: IModelMemoryEstimator;
}
/**
 * Keeps the model loaded in LM Studio in sync with the user's selection.
 *
 * Responsibilities visible in this class:
 *  - debounce rapid selections and switch models (unload others, then load);
 *  - defer a switch requested mid-stream until the stream ends;
 *  - auto-eject the loaded model after a configurable idle period;
 *  - best-effort unload on dispose.
 *
 * Several operations await the client while the user can keep interacting.
 * To keep those overlapping operations from corrupting each other's state,
 * every operation that takes ownership of `state` / `currentModel` bumps
 * `epoch`; an async continuation that finds a different `epoch` after an
 * `await` knows it was superseded and must not touch shared state.
 */
export class ModelLifecycleManager {
  /** Model keys matching this are treated as embedding models and are never auto-unloaded. */
  private static readonly EMBEDDING_MODEL_PATTERN = /embed/i;

  private state: LifecycleState = 'idle';
  private currentModel: string | null = null;
  /** Latest selection made while streaming; applied in `onStreamEnd`. */
  private pendingModel: string | null = null;
  private engine: EngineKind;
  private idleTimer: ReturnType<typeof setTimeout> | undefined;
  private switchDebounce: ReturnType<typeof setTimeout> | undefined;
  private loadAbort: AbortController | undefined;
  /** Ownership token for async transitions; see the class comment. */
  private epoch = 0;
  private readonly activitySub: { dispose(): void };
  private disposed = false;

  constructor(private readonly deps: LifecycleManagerDeps) {
    this.engine = deps.initialEngine ?? 'lmstudio';
    this.activitySub = deps.activity.onActivity(() => this.onActivity());
  }

  /**
   * Records the active engine. Leaving 'lmstudio' cancels every timer and
   * in-flight load and resets tracking to idle; it does not unload anything.
   */
  setEngine(engine: EngineKind): void {
    if (engine === this.engine) return;
    const wasLmStudio = this.engine === 'lmstudio';
    this.engine = engine;
    if (wasLmStudio && engine !== 'lmstudio') {
      this.clearIdleTimer();
      this.cancelPendingSwitch();
      this.cancelLoad();
      // Invalidate any switch / eject that is currently awaiting the client.
      this.epoch++;
      this.state = 'idle';
      this.currentModel = null;
      this.pendingModel = null;
    }
  }

  /**
   * Reacts to the user picking a model. No-op when disposed, when the engine
   * is not 'lmstudio', when `autoLoadOnSelect` is off, or for a blank key.
   */
  onModelSelected(modelKey: string): void {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;
    if (!this.deps.getConfig().autoLoadOnSelect) return;
    const trimmed = (modelKey || '').trim();
    if (!trimmed) return;

    // Mid-stream: queue the latest selection, apply on streamEnd. A debounce
    // timer armed before the stream began would otherwise overwrite it.
    if (this.state === 'streaming') {
      this.cancelPendingSwitch();
      this.pendingModel = trimmed;
      return;
    }

    // Same model already in flight or active — keep timer fresh, no reload.
    // The user's latest choice is the current model, so a still-pending
    // debounced switch to some other model must not fire.
    if ((this.state === 'loaded' || this.state === 'loading') && this.currentModel === trimmed) {
      this.cancelPendingSwitch();
      if (this.state === 'loaded') this.resetIdleTimer();
      return;
    }

    this.cancelPendingSwitch();
    const delay = this.deps.switchDebounceMs ?? 300;
    if (delay <= 0) {
      void this.doSwitch(trimmed);
      return;
    }
    this.switchDebounce = setTimeout(() => {
      this.switchDebounce = undefined;
      // A stream may have started during the debounce window; switching now
      // would unload the model that is serving it, so queue instead.
      if (this.state === 'streaming') {
        this.pendingModel = trimmed;
        return;
      }
      void this.doSwitch(trimmed);
    }, delay);
  }

  /** Marks the loaded model as busy and suspends the idle timer. */
  onStreamStart(): void {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;
    this.clearIdleTimer();
    if (this.state === 'loaded') this.state = 'streaming';
  }

  /** Ends the busy phase; applies a queued selection or restarts the idle timer. */
  onStreamEnd(): void {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;
    if (this.state !== 'streaming') return;

    this.state = 'loaded';
    if (this.pendingModel && this.pendingModel !== this.currentModel) {
      const next = this.pendingModel;
      this.pendingModel = null;
      void this.doSwitch(next);
    } else {
      this.pendingModel = null;
      this.resetIdleTimer();
    }
  }

  /**
   * Best-effort eject before extension shutdown. Bounded by timeoutMs.
   * Never rejects because of an unload failure or timeout; those are logged.
   *
   * @param timeoutMs Maximum time to wait for the unload call. Default 2000.
   */
  async disposeAndUnload(timeoutMs: number = 2000): Promise<void> {
    if (this.disposed) return;
    this.disposed = true;
    this.clearIdleTimer();
    this.cancelPendingSwitch();
    this.cancelLoad();
    this.activitySub.dispose();

    const shouldUnload =
      this.engine === 'lmstudio' &&
      (this.state === 'loaded' || this.state === 'streaming') &&
      this.currentModel !== null;
    if (!shouldUnload) {
      this.state = 'idle';
      this.currentModel = null;
      return;
    }

    const target = this.currentModel as string;
    this.state = 'unloading';
    let timer: ReturnType<typeof setTimeout> | undefined;
    try {
      await Promise.race([
        this.deps.client.unload(target),
        new Promise<void>((_, reject) => {
          timer = setTimeout(() => reject(new Error(`unload timed out after ${timeoutMs}ms`)), timeoutMs);
        }),
      ]);
    } catch (e: unknown) {
      logError('LM Studio unload during dispose failed.', {
        model: target,
        error: ModelLifecycleManager.describeError(e),
      });
    } finally {
      // Do not leave the timeout armed once the race has settled.
      clearTimeout(timer);
    }
    this.state = 'idle';
    this.currentModel = null;
  }

  /** vscode.Disposable shape — fire and forget. */
  dispose(): void {
    void this.disposeAndUnload();
  }

  // Test/inspection helpers
  public _getState(): LifecycleState { return this.state; }
  public _getCurrentModel(): string | null { return this.currentModel; }
  public _hasIdleTimer(): boolean { return this.idleTimer !== undefined; }

  // ---------- internals ----------

  /** Extracts a printable message from an arbitrary thrown value. */
  private static describeError(e: unknown): string {
    if (typeof e === 'object' && e !== null && 'message' in e) {
      const message = (e as { message?: unknown }).message;
      if (message !== undefined && message !== null) return String(message);
    }
    return String(e);
  }

  /** Activity callback: keeps a loaded model alive by restarting the idle timer. */
  private onActivity(): void {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;
    if (this.state !== 'loaded') return;
    this.resetIdleTimer();
  }

  private clearIdleTimer(): void {
    if (this.idleTimer) {
      clearTimeout(this.idleTimer);
      this.idleTimer = undefined;
    }
  }

  /** Cancels a debounced switch that has not fired yet. */
  private cancelPendingSwitch(): void {
    if (this.switchDebounce) {
      clearTimeout(this.switchDebounce);
      this.switchDebounce = undefined;
    }
  }

  /** (Re)arms the idle timer; a non-finite or non-positive timeout leaves it disarmed. */
  private resetIdleTimer(): void {
    this.clearIdleTimer();
    const ms = this.deps.getConfig().idleTimeoutMs;
    if (!Number.isFinite(ms) || ms <= 0) return;
    this.idleTimer = setTimeout(() => {
      this.idleTimer = undefined;
      void this.doIdleEject();
    }, ms);
  }

  /** Unloads the tracked model after the idle timeout elapsed. */
  private async doIdleEject(): Promise<void> {
    if (this.state !== 'loaded' || !this.currentModel) return;
    const target = this.currentModel;
    const epoch = ++this.epoch;
    this.state = 'unloading';
    try {
      await this.deps.client.unload(target);
      logInfo('LM Studio model auto-ejected after idle.', { model: target });
    } catch (e: unknown) {
      const message = ModelLifecycleManager.describeError(e);
      logError('LM Studio auto-eject failed.', { model: target, error: message });
      this.deps.notifyError?.(`LM Studio auto-eject failed: ${message}`);
    }
    // A switch (or engine change) that started while we were awaiting now
    // owns the state; resetting it here would erase that operation's model.
    if (epoch !== this.epoch) return;
    this.state = 'idle';
    this.currentModel = null;
  }

  /** Aborts the in-flight load, if any. */
  private cancelLoad(): void {
    if (this.loadAbort) {
      try { this.loadAbort.abort(); } catch { /* noop */ }
      this.loadAbort = undefined;
    }
  }

  /**
   * Warn-only RAM budget check. If the heuristic estimator says the model is
   * unlikely to fit, surface a non-blocking advisory and log it. The load
   * still proceeds — the heuristic can be wrong (unrecognized quantization,
   * sparse / MoE models) and the user may have explicit intent.
   */
  private checkMemoryBudget(modelKey: string): void {
    const specsProvider = this.deps.systemSpecs;
    const estimator = this.deps.memoryEstimator;
    if (!specsProvider || !estimator) return;
    try {
      const specs = specsProvider.get();
      const requiredGB = estimator.estimate(modelKey);
      if (requiredGB > specs.safeModelBudgetGB) {
        const msg =
          `Model "${modelKey}" estimated at ~${requiredGB.toFixed(1)}GB ` +
          `exceeds your safe RAM budget of ${specs.safeModelBudgetGB}GB. ` +
          `If load fails, try a smaller quantization (q4 / q5).`;
        logInfo('LM Studio pre-load memory advisory.', {
          model: modelKey,
          requiredGB: Number(requiredGB.toFixed(2)),
          budgetGB: specs.safeModelBudgetGB,
          totalRamGB: Number(specs.totalRamGB.toFixed(2)),
        });
        this.deps.notifyError?.(msg);
      }
    } catch (e: unknown) {
      // Diagnostic-only; never block a load on advisory failures.
      logError('Memory budget check failed.', { error: ModelLifecycleManager.describeError(e) });
    }
  }

  /**
   * Unloads every loaded model other than the switch target to reclaim VRAM.
   *
   * This covers not only the tracked `currentModel` but anything LM Studio
   * reports as loaded (manual loads, JIT loads, leftovers from a previous
   * session). Protected from unloading: the target itself, the configured
   * draft model (speculative decoding) and anything that looks like an
   * embedding model. A failure to unload one model is logged and skipped so
   * that it cannot block the switch. If listing fails, falls back to
   * unloading only the tracked `currentModel`.
   *
   * @param modelKey The model about to be loaded.
   * @param isStale  Returns true once the calling switch has been superseded;
   *                 checked before each unload so a stale switch stops early.
   */
  private async unloadOtherModels(modelKey: string, isStale: () => boolean): Promise<void> {
    const { draftModel } = this.deps.getConfig();
    const keep = new Set<string>([modelKey]);
    if (draftModel) keep.add(draftModel);

    try {
      const loaded = await this.deps.client.listLoaded();
      for (const m of loaded) {
        if (isStale()) return;
        if (keep.has(m)) continue;
        if (ModelLifecycleManager.EMBEDDING_MODEL_PATTERN.test(m)) continue; // protect embedding models
        try {
          await this.deps.client.unload(m);
          logInfo('LM Studio: 전환 전 다른 모델 언로드 (VRAM 회수).', { unloaded: m, target: modelKey });
        } catch (e: unknown) {
          logError('LM Studio unload before switch failed — 계속 진행.', {
            model: m,
            error: ModelLifecycleManager.describeError(e),
          });
        }
      }
    } catch (e: unknown) {
      // listLoaded failed: at least unload the tracked currentModel (legacy behavior).
      logError('listLoaded failed before switch — tracked currentModel 만 언로드 시도.', {
        error: ModelLifecycleManager.describeError(e),
      });
      const tracked = this.currentModel;
      if (isStale() || !tracked || tracked === modelKey) return;
      try {
        await this.deps.client.unload(tracked);
      } catch (inner: unknown) {
        logError('LM Studio unload before switch failed — 계속 진행.', {
          model: tracked,
          error: ModelLifecycleManager.describeError(inner),
        });
      }
    }
  }

  /**
   * Switches to `modelKey`: unloads other models, then loads the target.
   *
   * The call may be superseded while it awaits the client (newer switch,
   * engine change, dispose). In that case it returns without loading and
   * without touching `state` / `currentModel`, which now belong to whoever
   * superseded it.
   */
  private async doSwitch(modelKey: string): Promise<void> {
    if (this.disposed) return;
    if (this.engine !== 'lmstudio') return;
    this.cancelLoad();
    this.clearIdleTimer();

    const epoch = ++this.epoch;
    const isStale = (): boolean =>
      this.disposed || this.engine !== 'lmstudio' || epoch !== this.epoch;

    // ── 1) Unload everything except the target (reclaim VRAM) ─────────────
    this.state = 'unloading';
    await this.unloadOtherModels(modelKey, isStale);
    if (isStale()) return;

    this.currentModel = null;
    this.checkMemoryBudget(modelKey);

    // ── 2) Load the new model ─────────────────────────────────────────────
    this.state = 'loading';
    this.currentModel = modelKey;
    const ac = new AbortController();
    this.loadAbort = ac;
    try {
      const cfg = this.deps.getConfig();
      await this.deps.client.load(modelKey, ac.signal, cfg.loadConfig);
      if (this.loadAbort !== ac) return; // superseded by a newer switch
      this.loadAbort = undefined;
      this.state = 'loaded';
      this.resetIdleTimer();
      // Pre-warm the draft model so the first speculative prediction doesn't pay a cold-load cost.
      if (cfg.draftModel && this.deps.client.preloadDraftModel) {
        const draft = cfg.draftModel;
        // Fire and forget, but never let a rejection go unhandled.
        void Promise.resolve(this.deps.client.preloadDraftModel(draft)).catch((e: unknown) => {
          logError('LM Studio draft model preload failed.', {
            model: draft,
            error: ModelLifecycleManager.describeError(e),
          });
        });
      }
    } catch (e: unknown) {
      if (ac.signal.aborted) {
        // A newer switch (or dispose / engine change) aborted us; it decides
        // the new state, so leave everything untouched.
        return;
      }
      const message = ModelLifecycleManager.describeError(e);
      logError('LM Studio model load failed.', { model: modelKey, error: message });
      this.deps.notifyError?.(`LM Studio load failed: ${message}`);
      if (this.loadAbort === ac) this.loadAbort = undefined;
      // Load failed → return to a clean "nothing loaded" state. Clearing
      // currentModel lets the next selection retry the same model.
      this.state = 'idle';
      this.currentModel = null;
    }
  }
}