chore: v2.2.73 — ASTRA-DEBUG 로그 레벨 + webview CSP font-src 보강

- ASTRA-DEBUG 정상 흐름 로그를 console.error → logInfo/console.log 로 강등 (chatHandlers, extension, slashRouter): DevTools에 ERR로 찍히던 오탐 제거
- sidebar webview에 명시적 CSP meta 추가 + font-src에 data: 허용 (sidebar.html, sidebarProvider._getHtml): VS Code outer iframe이 codicon.ttf를 data:font/ttf 로 inject하면서 기본 CSP에 막혀 매 prompt 마다 violation 경고가 찍히던 문제 해소
- 누적된 LM Studio / agent / 컨텍스트 매니저 / 테스트 갱신 동반

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 15:52:19 +09:00
parent 36db170844
commit 0712014fcb
43 changed files with 2417 additions and 977 deletions
@@ -7,6 +7,30 @@ export interface ChatStreamMessage {
    content: string;
 }

+/**
+ * Shared sampling block. SDK and REST paths both read this — keep them in sync.
+ *
+ * Every field is optional. `samplingToRestBody` forwards a field to the REST body only
+ * when it is a number inside the accepted range, and otherwise leaves it out:
+ *  - `topP`          -> `top_p`, forwarded when 0 < value <= 1.
+ *  - `topK`          -> `top_k`, forwarded when value > 0.
+ *  - `minP`          -> `min_p`, forwarded when 0 < value <= 1.
+ *  - `repeatPenalty` -> `repeat_penalty`, forwarded when value > 1.
+ */
+export interface LmStudioSampling {
+    topP?: number;
+    topK?: number;
+    minP?: number;
+    repeatPenalty?: number;
+}
+
+/**
+ * Translate the sampling block into the OpenAI-compatible REST body extension that LM Studio
+ * understands. Ollama uses the same field names inside `options`. Returns an object you can
+ * spread into either body.
+ *
+ * Only in-range numeric values are copied; everything else is omitted so the engine falls
+ * back to its defaults instead of effectively disabling sampling:
+ *  - `topP` / `minP` must satisfy 0 < value <= 1 (emitted as `top_p` / `min_p`).
+ *  - `topK` must be > 0 (emitted as `top_k`).
+ *  - `repeatPenalty` must be > 1 (emitted as `repeat_penalty`).
+ * `NaN` fails every comparison and is therefore dropped as well.
+ *
+ * @param s Sampling settings, or `undefined` when none are configured.
+ * @returns A fresh object holding only the accepted snake_case keys; empty when `s` is
+ *          `undefined` or no field passes its check.
+ */
+export function samplingToRestBody(s: LmStudioSampling | undefined): Record<string, number> {
+    const out: Record<string, number> = {};
+    if (!s) return out;
+    if (typeof s.topP === 'number' && s.topP > 0 && s.topP <= 1) out.top_p = s.topP;
+    if (typeof s.topK === 'number' && s.topK > 0) out.top_k = s.topK;
+    if (typeof s.minP === 'number' && s.minP > 0 && s.minP <= 1) out.min_p = s.minP;
+    if (typeof s.repeatPenalty === 'number' && s.repeatPenalty > 1) out.repeat_penalty = s.repeatPenalty;
+    return out;
+}
+
 export interface ChatStreamRequest {
    modelName: string;
    messages: ChatStreamMessage[];
@@ -15,17 +39,39 @@ export interface ChatStreamRequest {
    maxTokens?: number;
    /** LM Studio context-overflow safety net used only if the prompt still exceeds the window. */
    contextOverflowPolicy?: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
+    /** Sampling — defaults match small-model glitch-suppression presets. Each is omitted from the SDK call when undefined. */
+    topP?: number;
+    topK?: number;
+    minP?: number;
+    repeatPenalty?: number;
+    /** Draft model key for speculative decoding. Empty/undefined disables. */
+    draftModel?: string;
    signal?: AbortSignal;
 }

+/**
+ * Subset of LM Studio's `PredictionResult.stats` we expose to callers.
+ *
+ * Every field is optional: `LMStudioStreamer` copies a value only when the SDK result
+ * reports it with the expected primitive type, and leaves it `undefined` otherwise.
+ * The draft-model fields are read from the result's `usedDraftModelKey`,
+ * `totalDraftTokensCount` and `acceptedDraftTokensCount` respectively.
+ */
+export interface ChatStreamStats {
+    tokensPerSecond?: number;
+    timeToFirstTokenSec?: number;
+    predictedTokensCount?: number;
+    promptTokensCount?: number;
+    totalTimeSec?: number;
+    /** Speculative decoding (only set when `draftModel` was used). */
+    draftModelKey?: string;
+    draftTokensCount?: number;
+    acceptedDraftTokensCount?: number;
+}
+
 /**
 * One stream event. `token` carries generated text (possibly empty for the final event);
 * `stopReason` is set on the *last* event only and is the SDK's `stats.stopReason`
 * (e.g. `eosFound`, `maxPredictedTokensReached`, `contextLengthReached`, `userStopped`).
+ * `stats` is also set on the *last* event when LM Studio reports prediction stats.
+ * On the final event `stopReason` may still be `undefined` when the prediction result
+ * could not be read; callers should treat that as an unknown stop reason.
 */
 export interface ChatStreamEvent {
    token: string;
    stopReason?: string;
+    stats?: ChatStreamStats;
 }

 export interface IChatStreamer {
@@ -72,24 +118,25 @@ export class LMStudioStreamer implements IChatStreamer {
            const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined);
            logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt });

-            const prediction = (model as any).respond(req.messages, {
+            // Sampling defaults match the historical glitch-suppression preset for small /
+            // quantized models (한글 토큰 깨짐 방지) but are now overridable per-call.
+            const respondOpts: any = {
                temperature: req.temperature,
                maxTokens: req.maxTokens ?? 4096,
-                // Glitch suppression: a small / quantized model samples wrong
-                // neighbour tokens (Korean syllable corruption like 붕괴→붕점,
-                // 핵심→핵점) when the distribution is left wide. A tight nucleus
-                // + top-k and a min-p floor cut the low-probability tail;
-                // repeatPenalty curbs stutter (것입니다서입니다).
-                topPSampling: 0.9,
-                topKSampling: 20,
-                minPSampling: 0.05,
-                repeatPenalty: 1.1,
                // Safety net: if our own token budgeting still underestimated and the prompt
                // exceeds the model's context window, decide whether the SDK should fail
                // loudly (stopAtLimit — default) or silently drop content.
                contextOverflowPolicy: req.contextOverflowPolicy ?? 'stopAtLimit',
                signal: req.signal,
-            });
+            };
+            if (typeof req.topP === 'number') respondOpts.topPSampling = req.topP;
+            if (typeof req.topK === 'number' && req.topK > 0) respondOpts.topKSampling = req.topK;
+            if (typeof req.minP === 'number' && req.minP > 0) respondOpts.minPSampling = req.minP;
+            if (typeof req.repeatPenalty === 'number' && req.repeatPenalty > 1) respondOpts.repeatPenalty = req.repeatPenalty;
+            // Speculative decoding — LM Studio loads the draft model lazily on first use if needed
+            // (we also `preloadDraftModel` after main load to avoid that cold cost).
+            if (req.draftModel && req.draftModel.trim()) respondOpts.draftModel = req.draftModel.trim();
+            const prediction = (model as any).respond(req.messages, respondOpts);

            // Bridge AbortSignal → prediction.cancel(): without this, an
            // aborted request keeps generating on the LM Studio server. The
@@ -128,24 +175,58 @@ export class LMStudioStreamer implements IChatStreamer {
                if (req.signal?.aborted) return;
                // The prediction object is also a Promise<PredictionResult>; awaiting it after
                // the stream drains gives us stats.stopReason so callers can tell a truncated
-                // answer (maxPredictedTokensReached / contextLengthReached) from a normal one.
+                // answer (maxPredictedTokensReached / contextLengthReached) from a normal one,
+                // plus throughput numbers (tok/s, TTFT) we surface to the UI.
                let stopReason: string | undefined;
+                let stats: ChatStreamEvent['stats'];
                try {
                    const result: any = await prediction;
                    stopReason = result?.stats?.stopReason;
-                    if (stopReason) {
-                        logInfo('LM Studio SDK chat stream finished.', { model: trimmedModel, stopReason, tokensYielded: yielded });
+                    const s = result?.stats;
+                    if (s) {
+                        stats = {
+                            tokensPerSecond: typeof s.tokensPerSecond === 'number' ? s.tokensPerSecond : undefined,
+                            timeToFirstTokenSec: typeof s.timeToFirstTokenSec === 'number' ? s.timeToFirstTokenSec : undefined,
+                            predictedTokensCount: typeof s.predictedTokensCount === 'number' ? s.predictedTokensCount : undefined,
+                            promptTokensCount: typeof s.promptTokensCount === 'number' ? s.promptTokensCount : undefined,
+                            totalTimeSec: typeof s.totalTimeSec === 'number' ? s.totalTimeSec : undefined,
+                            draftModelKey: typeof s.usedDraftModelKey === 'string' ? s.usedDraftModelKey : undefined,
+                            draftTokensCount: typeof s.totalDraftTokensCount === 'number' ? s.totalDraftTokensCount : undefined,
+                            acceptedDraftTokensCount: typeof s.acceptedDraftTokensCount === 'number' ? s.acceptedDraftTokensCount : undefined,
+                        };
+                    }
+                    if (stopReason || stats) {
+                        logInfo('LM Studio SDK chat stream finished.', {
+                            model: trimmedModel, stopReason, tokensYielded: yielded,
+                            tokensPerSecond: stats?.tokensPerSecond, ttftSec: stats?.timeToFirstTokenSec,
+                        });
                    }
                } catch { /* result unavailable on some SDK versions — non-fatal */ }
+                // Empty-but-clean stream is treated like a dead handle on attempt 1:
+                // recreate the SDK and try once more. Same root cause (handle bound to
+                // a stale prediction) but no exception is thrown — just an empty stream.
+                if (yielded === 0 && attempt === 1) {
+                    logInfo('Empty SDK stream with no error — retrying with a fresh SDK.', { model: trimmedModel });
+                    continue;
+                }
                // Don't claim `eosFound` if we couldn't actually read the stop reason — leave it
                // undefined so the caller treats it as 'unknown' (and its mid-sentence heuristics kick in).
-                yield { token: '', stopReason };
+                yield { token: '', stopReason, stats };
                return;
            }

            const errMsg = String(caught?.message ?? caught);
-            const handleDead = /\bdisposed\b/i.test(errMsg)
-                || /lock\(\) request could not be registered/i.test(errMsg);
+            // Broaden the "handle is bound to a dead WebSocket binding" detection. All of
+            // these resolve with the same fix (recreate the SDK client so the next
+            // llm.model() lookup mints a fresh handle).
+            const handleDead =
+                /\bdisposed\b/i.test(errMsg)
+                || /lock\(\) request could not be registered/i.test(errMsg)
+                || /channel\s+closed/i.test(errMsg)
+                || /WebSocket\s+(?:is\s+not\s+open|closed|disconnected)/i.test(errMsg)
+                || /Connection\s+(?:lost|reset|closed)/i.test(errMsg)
+                || /\bECONNRESET\b/i.test(errMsg)
+                || /socket\s+hang\s*up/i.test(errMsg);

            if (handleDead && yielded === 0 && attempt === 1) {
                logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg });