chore: v2.2.73 — ASTRA-DEBUG 로그 레벨 + webview CSP font-src 보강

- ASTRA-DEBUG 정상 흐름 로그를 console.error → logInfo/console.log 로 강등
  (chatHandlers, extension, slashRouter): DevTools에 ERR로 찍히던 오탐 제거
- sidebar webview에 명시적 CSP meta 추가 + font-src에 data: 허용
  (sidebar.html, sidebarProvider._getHtml): VS Code outer iframe이 codicon.ttf를
  data:font/ttf 로 inject하면서 기본 CSP에 막혀 매 prompt 마다 violation
  경고가 찍히던 문제 해소
- 누적된 LM Studio / agent / 컨텍스트 매니저 / 테스트 갱신 동반

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
g1nation
2026-05-23 15:52:19 +09:00
parent 36db170844
commit 0712014fcb
43 changed files with 2417 additions and 977 deletions
+94 -7
View File
@@ -1,8 +1,20 @@
import { LMStudioClient as SDKClient, LLM } from '@lmstudio/sdk';
import { LMStudioClient as SDKClient, LLM, type LLMLoadModelConfig } from '@lmstudio/sdk';
import { logError, logInfo } from '../utils';
/**
 * Flat set of load-time knobs that callers may pass to `load()`.
 * Every field is optional; `_buildLoadConfig` copies only the fields that carry a usable value
 * into the SDK's `LLMLoadModelConfig` before calling `llm.load()`.
 */
export interface LMStudioLoadConfig {
    /** Copied through as-is when it is a boolean. */
    flashAttention?: boolean;
    /** Either the literal `'max'` / `'off'` or a number in the 0-1 range; sent to the SDK as `gpu.ratio`. */
    gpuOffloadRatio?: 'max' | 'off' | number;
    /** Copied through as-is when it is a boolean. */
    offloadKVCacheToGpu?: boolean;
    /** Copied through as-is when it is a boolean. */
    keepModelInMemory?: boolean;
    /** Copied through as-is when it is a boolean. */
    useFp16ForKVCache?: boolean;
    /** Only positive values are forwarded; `0` or `undefined` leaves the engine default in place. */
    evalBatchSize?: number;
}
export interface ILMStudioClient {
load(modelKey: string, signal?: AbortSignal): Promise<void>;
load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void>;
unload(modelKey: string): Promise<void>;
listLoaded(): Promise<string[]>;
/** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */
@@ -15,6 +27,10 @@ export interface ILMStudioClient {
* only returns loaded models when JIT is off).
*/
listDownloaded(): Promise<string[]>;
/** Cached variant; the downloaded list only changes when the user installs/removes a model. */
listDownloadedCached(ttlMs?: number): Promise<string[]>;
/** Pre-warm a draft model for speculative decoding. Idempotent + best-effort. */
preloadDraftModel?(draftModelKey: string): Promise<void>;
/**
* Resolve a chat-ready handle for an already-loaded (or just-loaded) model.
*
@@ -42,8 +58,20 @@ export function httpToWebSocketUrl(httpBaseUrl: string): string | undefined {
if (url.protocol === 'http:') url.protocol = 'ws:';
else if (url.protocol === 'https:') url.protocol = 'wss:';
else if (url.protocol !== 'ws:' && url.protocol !== 'wss:') return undefined;
if (url.pathname.endsWith('/v1')) url.pathname = url.pathname.slice(0, -3);
if (url.pathname.endsWith('/api')) url.pathname = url.pathname.slice(0, -4);
// Strip every REST-only path suffix LM Studio ships with so the SDK lands on the
// WebSocket root. Loop because /api/v0 → /api → '' should fully unwind.
const REST_SUFFIXES = ['/api/v0', '/api/v1', '/v1', '/api'];
let changed = true;
while (changed) {
changed = false;
for (const suffix of REST_SUFFIXES) {
if (url.pathname.endsWith(suffix)) {
url.pathname = url.pathname.slice(0, -suffix.length);
changed = true;
break;
}
}
}
const out = url.toString().replace(/\/+$/, '');
return out;
} catch {
@@ -55,7 +83,9 @@ export class LMStudioClient implements ILMStudioClient {
private _sdk: SDKClient | undefined;
private _wsUrl: string | undefined;
private _loadedCache: { value: string[]; expiresAt: number } | undefined;
private _downloadedCache: { value: string[]; expiresAt: number } | undefined;
private static readonly DEFAULT_LOADED_CACHE_TTL_MS = 5000;
private static readonly DEFAULT_DOWNLOADED_CACHE_TTL_MS = 60_000;
constructor(httpBaseUrl: string) {
this.setBaseUrl(httpBaseUrl);
@@ -67,6 +97,7 @@ export class LMStudioClient implements ILMStudioClient {
this._wsUrl = ws;
this._sdk = undefined;
this._loadedCache = undefined;
this._downloadedCache = undefined;
}
}
@@ -77,17 +108,53 @@ export class LMStudioClient implements ILMStudioClient {
return this._sdk;
}
async load(modelKey: string, signal?: AbortSignal): Promise<void> {
async load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void> {
try {
await this.getSdk().llm.load(modelKey, signal ? { signal } : undefined);
const opts: { signal?: AbortSignal; config?: LLMLoadModelConfig } = {};
if (signal) opts.signal = signal;
const config = this._buildLoadConfig(loadConfig);
if (Object.keys(config).length > 0) opts.config = config;
await this.getSdk().llm.load(modelKey, Object.keys(opts).length > 0 ? opts : undefined);
this._loadedCache = undefined;
logInfo('LM Studio model loaded.', { modelKey });
// Loading does not change the downloaded-models set; leave _downloadedCache alone.
logInfo('LM Studio model loaded.', { modelKey, configKeys: Object.keys(config) });
} catch (e: any) {
const msg = e?.message ?? String(e);
throw new LMStudioLifecycleError(`Failed to load LM Studio model "${modelKey}": ${msg}`, e);
}
}
/**
 * Convert the flat `LMStudioLoadConfig` into the SDK's `LLMLoadModelConfig`.
 *
 * Only fields holding a usable value are copied, so the result is an empty object when
 * nothing was configured (the caller uses that to decide whether to send `config` at all).
 *
 * @param lc Caller-supplied load options; may be `undefined`.
 * @returns A fresh config object containing just the recognised, valid fields.
 */
private _buildLoadConfig(lc: LMStudioLoadConfig | undefined): LLMLoadModelConfig {
    const sdkConfig: LLMLoadModelConfig = {};
    if (!lc) return sdkConfig;

    // Boolean switches share the same name on both sides; copy them in a fixed order so the
    // resulting key order stays stable.
    const booleanFlags = ['flashAttention', 'offloadKVCacheToGpu', 'keepModelInMemory', 'useFp16ForKVCache'] as const;
    for (const flag of booleanFlags) {
        const flagValue = lc[flag];
        if (typeof flagValue === 'boolean') sdkConfig[flag] = flagValue;
    }

    // A non-positive batch size means "use the engine default", so it is left out.
    const batchSize = lc.evalBatchSize;
    if (typeof batchSize === 'number' && batchSize > 0) {
        sdkConfig.evalBatchSize = batchSize;
    }

    // GPUSetting is deprecated but still accepted — wraps a single `ratio`.
    const ratio = lc.gpuOffloadRatio;
    if (ratio !== undefined) {
        sdkConfig.gpu = { ratio: ratio as any };
    }
    return sdkConfig;
}
/**
 * Ask LM Studio to pre-load a draft model for speculative decoding.
 *
 * Blank or whitespace-only keys are ignored. The call is skipped when the SDK's `llm`
 * namespace has no `unstable_preloadDraftModel` function. Failures are logged and never
 * rethrown, so this method always resolves.
 *
 * @param draftModelKey Model key of the draft model; surrounding whitespace is trimmed.
 */
async preloadDraftModel(draftModelKey: string): Promise<void> {
    const trimmedKey = (draftModelKey || '').trim();
    if (!trimmedKey) return;
    try {
        // Typed as `any` because the preload entry point is looked up dynamically.
        const llmNamespace: any = this.getSdk().llm;
        if (typeof llmNamespace.unstable_preloadDraftModel !== 'function') return;
        await llmNamespace.unstable_preloadDraftModel(trimmedKey);
        logInfo('LM Studio draft model preloaded.', { draftModelKey: trimmedKey });
    } catch (e: any) {
        // Best-effort — the main model's respond({draftModel}) will still load it lazily.
        const reason = e?.message ?? String(e);
        logError('LM Studio draft model preload failed.', { draftModelKey: trimmedKey, error: reason });
    }
}
async unload(modelKey: string): Promise<void> {
try {
await this.getSdk().llm.unload(modelKey);
@@ -99,6 +166,12 @@ export class LMStudioClient implements ILMStudioClient {
}
}
/**
 * Clear both cached model lists (loaded and downloaded).
 * `listDownloadedCached` will fetch again on its next call; intended for use after a model
 * is installed or removed.
 */
invalidateCaches(): void {
    this._downloadedCache = undefined;
    this._loadedCache = undefined;
}
async listLoaded(): Promise<string[]> {
try {
const items: any[] = await this.getSdk().llm.listLoaded();
@@ -138,6 +211,20 @@ export class LMStudioClient implements ILMStudioClient {
}
}
/**
 * Return the downloaded-model keys, reusing the previous answer while it is still fresh.
 *
 * @param ttlMs How long a fetched list stays valid, measured from the moment this call started.
 *              Defaults to `DEFAULT_DOWNLOADED_CACHE_TTL_MS`.
 * @returns A copy of the list, so callers can mutate it without touching the cache.
 */
async listDownloadedCached(ttlMs: number = LMStudioClient.DEFAULT_DOWNLOADED_CACHE_TTL_MS): Promise<string[]> {
    const startedAt = Date.now();
    const cached = this._downloadedCache;
    if (cached !== undefined && cached.expiresAt > startedAt) {
        return cached.value.slice();
    }

    const fresh = await this.listDownloaded();
    // An empty list is deliberately not cached: it often signals a transient SDK error, and
    // remembering it for the whole TTL would hide a freshly-started LM Studio process.
    const worthCaching = fresh.length > 0;
    if (worthCaching) {
        this._downloadedCache = { value: fresh, expiresAt: startedAt + ttlMs };
    }
    return fresh.slice();
}
async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM> {
try {
if (options?.refresh) {
+11 -2
View File
@@ -1,4 +1,4 @@
import type { ILMStudioClient } from './client';
import type { ILMStudioClient, LMStudioLoadConfig } from './client';
import type { IActivityTracker } from './activityTracker';
import type { EngineKind } from '../utils';
import type { ISystemSpecsProvider, IModelMemoryEstimator } from '../system/specs';
@@ -9,6 +9,10 @@ export type LifecycleState = 'idle' | 'loading' | 'loaded' | 'streaming' | 'unlo
/** Settings the lifecycle manager reads (via `deps.getConfig()`) when it loads a model. */
export interface LifecycleConfig {
    /** Idle timeout in milliseconds. NOTE(review): presumably consumed by the idle timer — confirm against `resetIdleTimer`. */
    idleTimeoutMs: number;
    /** NOTE(review): presumably controls whether selecting a model triggers a load — usage is not visible here. */
    autoLoadOnSelect: boolean;
    /** Passed as the third argument of `client.load()`. Leave unset to use engine defaults. */
    loadConfig?: LMStudioLoadConfig;
    /** When non-empty, `client.preloadDraftModel()` is invoked with this key after each successful load. */
    draftModel?: string;
}
export interface LifecycleManagerDeps {
@@ -274,11 +278,16 @@ export class ModelLifecycleManager {
const ac = new AbortController();
this.loadAbort = ac;
try {
await this.deps.client.load(modelKey, ac.signal);
const cfg = this.deps.getConfig();
await this.deps.client.load(modelKey, ac.signal, cfg.loadConfig);
if (this.loadAbort !== ac) return; // superseded by a newer switch
this.loadAbort = undefined;
this.state = 'loaded';
this.resetIdleTimer();
// Pre-warm the draft model so the first speculative prediction doesn't pay a cold-load cost.
if (cfg.draftModel && this.deps.client.preloadDraftModel) {
void this.deps.client.preloadDraftModel(cfg.draftModel);
}
} catch (e: any) {
if (ac.signal.aborted) return; // superseded — newer switch owns state
logError('LM Studio model load failed.', { model: modelKey, error: e?.message ?? String(e) });
+98 -17
View File
@@ -7,6 +7,30 @@ export interface ChatStreamMessage {
content: string;
}
/**
 * Sampling parameters shared by the SDK and REST request paths — keep the two in sync.
 * All fields are optional; an omitted field leaves the engine default untouched.
 */
export interface LmStudioSampling {
    /** Nucleus sampling threshold; sent as `top_p` on the REST path. */
    topP?: number;
    /** Top-k cutoff; sent as `top_k` on the REST path. */
    topK?: number;
    /** Minimum-probability floor; sent as `min_p` on the REST path. */
    minP?: number;
    /** Repetition penalty; sent as `repeat_penalty` on the REST path. */
    repeatPenalty?: number;
}
/**
 * Translate the sampling block into the OpenAI-compatible REST body extension that LM Studio
 * understands. Ollama uses the same field names inside `options`. The returned object can be
 * spread into either body.
 *
 * A field is emitted only when its value is usable:
 *  - `topP` / `minP` must lie in (0, 1];
 *  - `topK` must be a finite number greater than 0;
 *  - `repeatPenalty` must be a finite number greater than 1.
 * Anything else (including `NaN` and `±Infinity`) is dropped so the engine default applies.
 * Non-finite numbers are rejected explicitly because `JSON.stringify` would serialise them
 * as `null` in the request body.
 *
 * @param s Sampling settings, or `undefined` when none were supplied.
 * @returns A fresh object holding only the accepted `top_p` / `top_k` / `min_p` / `repeat_penalty` keys.
 */
export function samplingToRestBody(s: LmStudioSampling | undefined): Record<string, number> {
    const out: Record<string, number> = {};
    if (!s) return out;

    // Type guard shared by all four fields: a real, finite number.
    const isFiniteNumber = (v: unknown): v is number => typeof v === 'number' && Number.isFinite(v);

    if (isFiniteNumber(s.topP) && s.topP > 0 && s.topP <= 1) out.top_p = s.topP;
    if (isFiniteNumber(s.topK) && s.topK > 0) out.top_k = s.topK;
    if (isFiniteNumber(s.minP) && s.minP > 0 && s.minP <= 1) out.min_p = s.minP;
    if (isFiniteNumber(s.repeatPenalty) && s.repeatPenalty > 1) out.repeat_penalty = s.repeatPenalty;
    return out;
}
export interface ChatStreamRequest {
modelName: string;
messages: ChatStreamMessage[];
@@ -15,17 +39,39 @@ export interface ChatStreamRequest {
maxTokens?: number;
/** LM Studio context-overflow safety net used only if the prompt still exceeds the window. */
contextOverflowPolicy?: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
/** Sampling — defaults match small-model glitch-suppression presets. Each is omitted from the SDK call when undefined. */
topP?: number;
topK?: number;
minP?: number;
repeatPenalty?: number;
/** Draft model key for speculative decoding. Empty/undefined disables. */
draftModel?: string;
signal?: AbortSignal;
}
/**
 * The slice of LM Studio's `PredictionResult.stats` that is surfaced to callers.
 * Each field is populated only when the SDK reported a value of the expected type.
 */
export interface ChatStreamStats {
    /** Copied from `stats.tokensPerSecond`. */
    tokensPerSecond?: number;
    /** Copied from `stats.timeToFirstTokenSec`. */
    timeToFirstTokenSec?: number;
    /** Copied from `stats.predictedTokensCount`. */
    predictedTokensCount?: number;
    /** Copied from `stats.promptTokensCount`. */
    promptTokensCount?: number;
    /** Copied from `stats.totalTimeSec`. */
    totalTimeSec?: number;

    // Speculative-decoding figures — only set when `draftModel` was used.
    /** Copied from `stats.usedDraftModelKey`. */
    draftModelKey?: string;
    /** Copied from `stats.totalDraftTokensCount`. */
    draftTokensCount?: number;
    /** Copied from `stats.acceptedDraftTokensCount`. */
    acceptedDraftTokensCount?: number;
}
/**
 * A single item produced by the chat stream.
 *
 * Ordinary events carry generated text in `token`. The closing event has an empty `token`
 * and is the only one that may carry `stopReason` and `stats`.
 */
export interface ChatStreamEvent {
    /** Generated text; may be the empty string on the final event. */
    token: string;
    /**
     * The SDK's `stats.stopReason` (e.g. `eosFound`, `maxPredictedTokensReached`,
     * `contextLengthReached`, `userStopped`). Left `undefined` when it could not be read.
     */
    stopReason?: string;
    /** Prediction statistics, present on the final event when LM Studio reported them. */
    stats?: ChatStreamStats;
}
export interface IChatStreamer {
@@ -72,24 +118,25 @@ export class LMStudioStreamer implements IChatStreamer {
const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined);
logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt });
const prediction = (model as any).respond(req.messages, {
// Sampling defaults match the historical glitch-suppression preset for small /
// quantized models (한글 토큰 깨짐 방지) but are now overridable per-call.
const respondOpts: any = {
temperature: req.temperature,
maxTokens: req.maxTokens ?? 4096,
// Glitch suppression: a small / quantized model samples wrong
// neighbour tokens (Korean syllable corruption like 붕괴→붕점,
// 핵심→핵점) when the distribution is left wide. A tight nucleus
// + top-k and a min-p floor cut the low-probability tail;
// repeatPenalty curbs stutter (것입니다서입니다).
topPSampling: 0.9,
topKSampling: 20,
minPSampling: 0.05,
repeatPenalty: 1.1,
// Safety net: if our own token budgeting still underestimated and the prompt
// exceeds the model's context window, decide whether the SDK should fail
// loudly (stopAtLimit — default) or silently drop content.
contextOverflowPolicy: req.contextOverflowPolicy ?? 'stopAtLimit',
signal: req.signal,
});
};
if (typeof req.topP === 'number') respondOpts.topPSampling = req.topP;
if (typeof req.topK === 'number' && req.topK > 0) respondOpts.topKSampling = req.topK;
if (typeof req.minP === 'number' && req.minP > 0) respondOpts.minPSampling = req.minP;
if (typeof req.repeatPenalty === 'number' && req.repeatPenalty > 1) respondOpts.repeatPenalty = req.repeatPenalty;
// Speculative decoding — LM Studio loads the draft model lazily on first use if needed
// (we also `preloadDraftModel` after main load to avoid that cold cost).
if (req.draftModel && req.draftModel.trim()) respondOpts.draftModel = req.draftModel.trim();
const prediction = (model as any).respond(req.messages, respondOpts);
// Bridge AbortSignal → prediction.cancel(): without this, an
// aborted request keeps generating on the LM Studio server. The
@@ -128,24 +175,58 @@ export class LMStudioStreamer implements IChatStreamer {
if (req.signal?.aborted) return;
// The prediction object is also a Promise<PredictionResult>; awaiting it after
// the stream drains gives us stats.stopReason so callers can tell a truncated
// answer (maxPredictedTokensReached / contextLengthReached) from a normal one.
// answer (maxPredictedTokensReached / contextLengthReached) from a normal one,
// plus throughput numbers (tok/s, TTFT) we surface to the UI.
let stopReason: string | undefined;
let stats: ChatStreamEvent['stats'];
try {
const result: any = await prediction;
stopReason = result?.stats?.stopReason;
if (stopReason) {
logInfo('LM Studio SDK chat stream finished.', { model: trimmedModel, stopReason, tokensYielded: yielded });
const s = result?.stats;
if (s) {
stats = {
tokensPerSecond: typeof s.tokensPerSecond === 'number' ? s.tokensPerSecond : undefined,
timeToFirstTokenSec: typeof s.timeToFirstTokenSec === 'number' ? s.timeToFirstTokenSec : undefined,
predictedTokensCount: typeof s.predictedTokensCount === 'number' ? s.predictedTokensCount : undefined,
promptTokensCount: typeof s.promptTokensCount === 'number' ? s.promptTokensCount : undefined,
totalTimeSec: typeof s.totalTimeSec === 'number' ? s.totalTimeSec : undefined,
draftModelKey: typeof s.usedDraftModelKey === 'string' ? s.usedDraftModelKey : undefined,
draftTokensCount: typeof s.totalDraftTokensCount === 'number' ? s.totalDraftTokensCount : undefined,
acceptedDraftTokensCount: typeof s.acceptedDraftTokensCount === 'number' ? s.acceptedDraftTokensCount : undefined,
};
}
if (stopReason || stats) {
logInfo('LM Studio SDK chat stream finished.', {
model: trimmedModel, stopReason, tokensYielded: yielded,
tokensPerSecond: stats?.tokensPerSecond, ttftSec: stats?.timeToFirstTokenSec,
});
}
} catch { /* result unavailable on some SDK versions — non-fatal */ }
// Empty-but-clean stream is treated like a dead handle on attempt 1:
// recreate the SDK and try once more. Same root cause (handle bound to
// a stale prediction) but no exception is thrown — just an empty stream.
if (yielded === 0 && attempt === 1) {
logInfo('Empty SDK stream with no error — retrying with a fresh SDK.', { model: trimmedModel });
continue;
}
// Don't claim `eosFound` if we couldn't actually read the stop reason — leave it
// undefined so the caller treats it as 'unknown' (and its mid-sentence heuristics kick in).
yield { token: '', stopReason };
yield { token: '', stopReason, stats };
return;
}
const errMsg = String(caught?.message ?? caught);
const handleDead = /\bdisposed\b/i.test(errMsg)
|| /lock\(\) request could not be registered/i.test(errMsg);
// Broaden the "handle is bound to a dead WebSocket binding" detection. All of
// these resolve with the same fix (recreate the SDK client so the next
// llm.model() lookup mints a fresh handle).
const handleDead =
/\bdisposed\b/i.test(errMsg)
|| /lock\(\) request could not be registered/i.test(errMsg)
|| /channel\s+closed/i.test(errMsg)
|| /WebSocket\s+(?:is\s+not\s+open|closed|disconnected)/i.test(errMsg)
|| /Connection\s+(?:lost|reset|closed)/i.test(errMsg)
|| /\bECONNRESET\b/i.test(errMsg)
|| /socket\s+hang\s*up/i.test(errMsg);
if (handleDead && yielded === 0 && attempt === 1) {
logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg });