chore: v2.2.73 — ASTRA-DEBUG 로그 레벨 + webview CSP font-src 보강

- ASTRA-DEBUG 정상 흐름 로그를 console.error → logInfo/console.log 로 강등 (chatHandlers, extension, slashRouter): DevTools에 ERR로 찍히던 오탐 제거
- sidebar webview에 명시적 CSP meta 추가 + font-src에 data: 허용 (sidebar.html, sidebarProvider._getHtml): VS Code outer iframe이 codicon.ttf를 data:font/ttf 로 inject하면서 기본 CSP에 막혀 매 prompt 마다 violation 경고가 찍히던 문제 해소
- 누적된 LM Studio / agent / 컨텍스트 매니저 / 테스트 갱신 동반

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 15:52:19 +09:00
parent 36db170844
commit 0712014fcb
43 changed files with 2417 additions and 977 deletions
@@ -1,8 +1,20 @@
-import { LMStudioClient as SDKClient, LLM } from '@lmstudio/sdk';
+import { LMStudioClient as SDKClient, LLM, type LLMLoadModelConfig } from '@lmstudio/sdk';
 import { logError, logInfo } from '../utils';

+/** Flat, optional load-time knobs for LM Studio's `llm.load()`; mapped onto the SDK's `LLMLoadModelConfig` before use. */
+export interface LMStudioLoadConfig {
+    /** GPU offload: `'max'`, `'off'`, or a number in the 0-1 range. */
+    gpuOffloadRatio?: number | 'off' | 'max';
+    /** 0 or undefined leaves the engine default in place. */
+    evalBatchSize?: number;
+    flashAttention?: boolean;
+    keepModelInMemory?: boolean;
+    offloadKVCacheToGpu?: boolean;
+    useFp16ForKVCache?: boolean;
+}
+
 export interface ILMStudioClient {
-    load(modelKey: string, signal?: AbortSignal): Promise<void>;
+    load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void>;
    unload(modelKey: string): Promise<void>;
    listLoaded(): Promise<string[]>;
    /** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */
@@ -15,6 +27,10 @@ export interface ILMStudioClient {
     * only returns loaded models when JIT is off).
     */
    listDownloaded(): Promise<string[]>;
+    /** Cached variant; the downloaded list only changes when the user installs/removes a model. */
+    listDownloadedCached(ttlMs?: number): Promise<string[]>;
+    /** Pre-warm a draft model for speculative decoding. Idempotent + best-effort. */
+    preloadDraftModel?(draftModelKey: string): Promise<void>;
    /**
     * Resolve a chat-ready handle for an already-loaded (or just-loaded) model.
     *
@@ -42,8 +58,20 @@ export function httpToWebSocketUrl(httpBaseUrl: string): string | undefined {
        if (url.protocol === 'http:') url.protocol = 'ws:';
        else if (url.protocol === 'https:') url.protocol = 'wss:';
        else if (url.protocol !== 'ws:' && url.protocol !== 'wss:') return undefined;
-        if (url.pathname.endsWith('/v1')) url.pathname = url.pathname.slice(0, -3);
-        if (url.pathname.endsWith('/api')) url.pathname = url.pathname.slice(0, -4);
+        // Strip every REST-only path suffix LM Studio ships with so the SDK lands on the
+        // WebSocket root. Loop because /api/v0 → /api → '' should fully unwind.
+        const REST_SUFFIXES = ['/api/v0', '/api/v1', '/v1', '/api'];
+        let changed = true;
+        while (changed) {
+            changed = false;
+            for (const suffix of REST_SUFFIXES) {
+                if (url.pathname.endsWith(suffix)) {
+                    url.pathname = url.pathname.slice(0, -suffix.length);
+                    changed = true;
+                    break;
+                }
+            }
+        }
        const out = url.toString().replace(/\/+$/, '');
        return out;
    } catch {
@@ -55,7 +83,9 @@ export class LMStudioClient implements ILMStudioClient {
    private _sdk: SDKClient | undefined;
    private _wsUrl: string | undefined;
    private _loadedCache: { value: string[]; expiresAt: number } | undefined;
+    private _downloadedCache: { value: string[]; expiresAt: number } | undefined;
    private static readonly DEFAULT_LOADED_CACHE_TTL_MS = 5000;
+    private static readonly DEFAULT_DOWNLOADED_CACHE_TTL_MS = 60_000;

    constructor(httpBaseUrl: string) {
        this.setBaseUrl(httpBaseUrl);
@@ -67,6 +97,7 @@ export class LMStudioClient implements ILMStudioClient {
            this._wsUrl = ws;
            this._sdk = undefined;
            this._loadedCache = undefined;
+            this._downloadedCache = undefined;
        }
    }

@@ -77,17 +108,53 @@ export class LMStudioClient implements ILMStudioClient {
        return this._sdk;
    }

-    async load(modelKey: string, signal?: AbortSignal): Promise<void> {
+    async load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void> {
        try {
-            await this.getSdk().llm.load(modelKey, signal ? { signal } : undefined);
+            const opts: { signal?: AbortSignal; config?: LLMLoadModelConfig } = {};
+            if (signal) opts.signal = signal;
+            const config = this._buildLoadConfig(loadConfig);
+            if (Object.keys(config).length > 0) opts.config = config; // an empty config object is never sent
+            await this.getSdk().llm.load(modelKey, Object.keys(opts).length > 0 ? opts : undefined); // nothing to pass -> undefined, matching the previous call shape
            this._loadedCache = undefined;
-            logInfo('LM Studio model loaded.', { modelKey });
+            // The loaded-models cache was just reset above; loading does not change the downloaded-models set, so _downloadedCache is left alone.
+            logInfo('LM Studio model loaded.', { modelKey, configKeys: Object.keys(config) }); // logs only the names of the forwarded config keys
        } catch (e: any) {
            const msg = e?.message ?? String(e);
            throw new LMStudioLifecycleError(`Failed to load LM Studio model "${modelKey}": ${msg}`, e);
        }
    }

+    /** Map the flat LMStudioLoadConfig onto the SDK's nested LLMLoadModelConfig; unset or unusable values are dropped. */
+    private _buildLoadConfig(lc: LMStudioLoadConfig | undefined): LLMLoadModelConfig {
+        const out: LLMLoadModelConfig = {};
+        if (!lc) return out;
+        if (typeof lc.flashAttention === 'boolean') out.flashAttention = lc.flashAttention;
+        if (typeof lc.offloadKVCacheToGpu === 'boolean') out.offloadKVCacheToGpu = lc.offloadKVCacheToGpu;
+        if (typeof lc.keepModelInMemory === 'boolean') out.keepModelInMemory = lc.keepModelInMemory;
+        if (typeof lc.useFp16ForKVCache === 'boolean') out.useFp16ForKVCache = lc.useFp16ForKVCache;
+        if (typeof lc.evalBatchSize === 'number' && Number.isFinite(lc.evalBatchSize) && lc.evalBatchSize > 0) out.evalBatchSize = lc.evalBatchSize; // 0 / NaN / Infinity -> engine default
+        // GPUSetting is deprecated but still accepted — wraps a single `ratio`. Only 'max', 'off' or a finite number (clamped to 0-1) is forwarded.
+        const ratio = lc.gpuOffloadRatio;
+        if (ratio === 'max' || ratio === 'off') out.gpu = { ratio };
+        else if (typeof ratio === 'number' && Number.isFinite(ratio)) out.gpu = { ratio: Math.min(1, Math.max(0, ratio)) };
+        return out;
+    }
+
+    /** Best-effort warm-up of a speculative-decoding draft model; blank keys are ignored and failures are only logged. */
+    async preloadDraftModel(draftModelKey: string): Promise<void> {
+        const trimmedKey = (draftModelKey || '').trim();
+        if (trimmedKey.length === 0) return;
+        try {
+            const llmNamespace: any = this.getSdk().llm;
+            if (typeof llmNamespace.unstable_preloadDraftModel !== 'function') return;
+            await llmNamespace.unstable_preloadDraftModel(trimmedKey);
+            logInfo('LM Studio draft model preloaded.', { draftModelKey: trimmedKey });
+        } catch (err: any) {
+            // Swallowed on purpose: the main model's respond({draftModel}) will still load it lazily.
+            logError('LM Studio draft model preload failed.', { draftModelKey: trimmedKey, error: err?.message ?? String(err) });
+        }
+    }
+
    async unload(modelKey: string): Promise<void> {
        try {
            await this.getSdk().llm.unload(modelKey);
@@ -99,6 +166,12 @@ export class LMStudioClient implements ILMStudioClient {
        }
    }

+    /** Drop both cached model lists so the next cached lookup has to re-fetch (call after a model is installed or removed). */
+    invalidateCaches(): void {
+        this._downloadedCache = undefined;
+        this._loadedCache = undefined;
+    }
+
    async listLoaded(): Promise<string[]> {
        try {
            const items: any[] = await this.getSdk().llm.listLoaded();
@@ -138,6 +211,20 @@ export class LMStudioClient implements ILMStudioClient {
        }
    }

+    /** Cached variant of listDownloaded(): serves a copy of the last non-empty result until `ttlMs` has elapsed. */
+    async listDownloadedCached(ttlMs: number = LMStudioClient.DEFAULT_DOWNLOADED_CACHE_TTL_MS): Promise<string[]> {
+        const startedAt = Date.now();
+        const cached = this._downloadedCache;
+        if (cached !== undefined && startedAt < cached.expiresAt) return cached.value.slice();
+        const fresh = await this.listDownloaded();
+        // An empty list often signals a transient SDK error; remembering it for the whole TTL
+        // would hide a freshly-started LM Studio process, so only non-empty results are stored.
+        const worthCaching = fresh.length > 0;
+        if (worthCaching) this._downloadedCache = { value: fresh, expiresAt: startedAt + ttlMs };
+        // Callers always receive their own copy, never the cached array itself.
+        return fresh.slice();
+    }
+
    async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM> {
        try {
            if (options?.refresh) {