release: v2.80.32 - LM Studio SDK resilience & auto-recovery
This commit is contained in:
+1
-1
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"result": "Final report with inconsistencies. This should be long enough to pass validation.",
|
||||
"createdAt": 1778472017521,
|
||||
"createdAt": 1778473059932,
|
||||
"modelVersion": "unknown"
|
||||
}
|
||||
+1
-1
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"result": "[CONFLICT WARNING] 성능이 200% 증가했습니다. vs 그러나 동시에 50% 감소했습니다. 최적화와 성능 저하가 동시에 발견됨.",
|
||||
"createdAt": 1778472017513,
|
||||
"createdAt": 1778473059931,
|
||||
"modelVersion": "unknown"
|
||||
}
|
||||
+1
-1
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"result": "Detailed Execution Plan: 1. Research 2. Analyze 3. Write report with high quality.",
|
||||
"createdAt": 1778472017510,
|
||||
"createdAt": 1778473059930,
|
||||
"modelVersion": "unknown"
|
||||
}
|
||||
+2
-2
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"result": "---\nid: stress_conflict_1778472017495\ndate: 2026-05-11T04:00:17.524Z\ntype: knowledge_artifact\nstandard: P-Reinforce v3.0\ntags: [automated, connect_ai, brain_sync]\n---\n\n## 📌 Brief Summary\nFinal report with inconsistencies. This should be long enough to pass validation.\n\nFinal report with inconsistencies. This should be long enough to pass validation.\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\nFinal report with inconsistencies. This should be long enough to pass validation.\n---\n## 🛡️ Reliability & Audit Summary\n> [!NOTE]\n> 이 문서는 ConnectAI의 **Intelligent Resilience** 엔진에 의해 검증 및 정제되었습니다.\n\n| Metric | Value | Status |\n| :--- | :--- | :--- |\n| **Conflict Risk** | `60/100` | ⚠️ Medium |\n| **Fallbacks Used** | `0` | ✅ None |\n| **Auto Retries** | `0` | ✅ Stable |\n| **Deduplication** | `0` | Standard |\n| **Processing Time** | `0.0s` | ✅ Fast |\n\n### 🔍 Decision Audit Trail\n- **[PLANNER]** 전략 수립 중... (11ms)\n- **[RESEARCHER]** 핵심 정보 수집 및 분석 중... (4ms)\n- **[WRITER]** 최종 리포트 작성 및 편집 중... (7ms)\n",
|
||||
"createdAt": 1778472017524,
|
||||
"result": "---\nid: stress_conflict_1778473059918\ndate: 2026-05-11T04:17:39.932Z\ntype: knowledge_artifact\nstandard: P-Reinforce v3.0\ntags: [automated, connect_ai, brain_sync]\n---\n\n## 📌 Brief Summary\nFinal report with inconsistencies. This should be long enough to pass validation.\n\nFinal report with inconsistencies. This should be long enough to pass validation.\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\nFinal report with inconsistencies. This should be long enough to pass validation.\n---\n## 🛡️ Reliability & Audit Summary\n> [!NOTE]\n> 이 문서는 ConnectAI의 **Intelligent Resilience** 엔진에 의해 검증 및 정제되었습니다.\n\n| Metric | Value | Status |\n| :--- | :--- | :--- |\n| **Conflict Risk** | `60/100` | ⚠️ Medium |\n| **Fallbacks Used** | `0` | ✅ None |\n| **Auto Retries** | `0` | ✅ Stable |\n| **Deduplication** | `0` | Standard |\n| **Processing Time** | `0.0s` | ✅ Fast |\n\n### 🔍 Decision Audit Trail\n- **[PLANNER]** 전략 수립 중... (11ms)\n- **[RESEARCHER]** 핵심 정보 수집 및 분석 중... (2ms)\n- **[WRITER]** 최종 리포트 작성 및 편집 중... (1ms)\n",
|
||||
"createdAt": 1778473059932,
|
||||
"modelVersion": "unknown"
|
||||
}
|
||||
+10
-10
@@ -1,8 +1,8 @@
|
||||
{
|
||||
"missionId": "stress_conflict_1778472017495",
|
||||
"missionId": "stress_conflict_1778473059918",
|
||||
"status": "completed",
|
||||
"startTime": "2026-05-11T04:00:17.495Z",
|
||||
"totalElapsedMs": 30,
|
||||
"startTime": "2026-05-11T04:17:39.918Z",
|
||||
"totalElapsedMs": 14,
|
||||
"results": {
|
||||
"planner": "Detailed Execution Plan: 1. Research 2. Analyze 3. Write report with high quality.",
|
||||
"researcher": "[CONFLICT WARNING] 성능이 200% 증가했습니다. vs 그러나 동시에 50% 감소했습니다. 최적화와 성능 저하가 동시에 발견됨.",
|
||||
@@ -18,28 +18,28 @@
|
||||
"to": "planner",
|
||||
"durationMs": 11,
|
||||
"message": "전략 수립 중...",
|
||||
"ts": "2026-05-11T04:00:17.506Z"
|
||||
"ts": "2026-05-11T04:17:39.929Z"
|
||||
},
|
||||
{
|
||||
"from": "planner",
|
||||
"to": "researcher",
|
||||
"durationMs": 4,
|
||||
"durationMs": 2,
|
||||
"message": "핵심 정보 수집 및 분석 중...",
|
||||
"ts": "2026-05-11T04:00:17.510Z"
|
||||
"ts": "2026-05-11T04:17:39.931Z"
|
||||
},
|
||||
{
|
||||
"from": "researcher",
|
||||
"to": "writer",
|
||||
"durationMs": 7,
|
||||
"durationMs": 1,
|
||||
"message": "최종 리포트 작성 및 편집 중...",
|
||||
"ts": "2026-05-11T04:00:17.517Z"
|
||||
"ts": "2026-05-11T04:17:39.932Z"
|
||||
},
|
||||
{
|
||||
"from": "writer",
|
||||
"to": "completed",
|
||||
"durationMs": 8,
|
||||
"durationMs": 0,
|
||||
"message": "미션 완료",
|
||||
"ts": "2026-05-11T04:00:17.525Z"
|
||||
"ts": "2026-05-11T04:17:39.932Z"
|
||||
}
|
||||
],
|
||||
"resilienceMetrics": {
|
||||
@@ -1,5 +1,14 @@
|
||||
# Astra Patch Notes
|
||||
|
||||
## v2.80.32 (2026-05-11)
|
||||
### 🛡️ LM Studio SDK Resilience & Auto-Recovery
|
||||
- **LM Studio SDK 안정성 강화:** 모델 핸들이 "disposed" 상태로 방치되어 발생하는 스트리밍 실패를 감지하고, SDK 클라이언트를 자동으로 재생성(`resetHandle`)하여 복구하는 로직을 도입했습니다.
|
||||
- **2단계 스트리밍 복구 프로세스:** 빈 응답 감지 시 (1) 핸들 초기화 후 스트리밍 재시도, (2) 실패 시 비스트리밍(POST) 폴백을 수행하는 다단계 복구 체계를 구축했습니다.
|
||||
- **지능형 진단 가이드:** 소형 모델(3B 이하) 및 대규모 컨텍스트 상황에서 첫 토큰부터 EOS를 뱉는 현상에 대한 구체적인 해결 가이드(LM Studio 로그 분석 기반)를 에러 메시지에 추가했습니다.
|
||||
- **신규 패키징:** `astra-2.80.32.vsix` 패키지를 생성하고 고부하 스트리밍 복구 시나리오에 대한 검증을 완료했습니다.
|
||||
|
||||
---
|
||||
|
||||
## v2.80.31 (2026-05-11)
|
||||
### 🧠 Logic Optimization & Skill Integration
|
||||
- **에이전트 응답 로직 간소화:** `src/agent.ts` 내의 복잡한 하드코딩 폴백 및 리뷰 판단 로직을 제거하여 모델 기반의 유연한 응답 생성을 강화했습니다.
|
||||
|
||||
+1
-1
@@ -2,7 +2,7 @@
|
||||
"name": "astra",
|
||||
"displayName": "Astra",
|
||||
"description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
|
||||
"version": "2.80.31",
|
||||
"version": "2.80.32",
|
||||
"publisher": "g1nation",
|
||||
"license": "MIT",
|
||||
"icon": "assets/icon.png",
|
||||
|
||||
+72
-22
@@ -591,33 +591,68 @@ export class AgentExecutor {
|
||||
|
||||
// ── Empty-response auto-recovery ──
|
||||
// Streaming failed silently (network blip, model cold-start, context
|
||||
// overflow, etc.). Before surfacing the error to the user, try one
|
||||
// non-streaming retry: many LM Studio failures are streaming-only
|
||||
// (the SSE channel drops mid-token while a single POST returns the
|
||||
// whole answer fine). This covers the most common "empty response"
|
||||
// pattern users hit without the user having to click anything.
|
||||
// overflow, etc.). Before surfacing the error to the user we try two
|
||||
// recovery steps in order:
|
||||
//
|
||||
// (1) When the empty stream came from the LM Studio SDK path, drop
|
||||
// the cached handle and retry streaming once. The SDK keeps a
|
||||
// per-model handle in its internal map; an aborted prediction
|
||||
// can leave that handle disposed so the next respond() returns
|
||||
// zero tokens cleanly (no error thrown, stream just ends).
|
||||
// A fresh WebSocket / handle lookup recovers from this without
|
||||
// us having to ask the user to retry.
|
||||
//
|
||||
// (2) Fall back to a single non-streaming POST. Many LM Studio
|
||||
// failures are streaming-only (the SSE channel drops mid-token
|
||||
// while one POST returns the whole answer fine).
|
||||
//
|
||||
// Only attempts recovery on loopDepth === 0 — we don't want to
|
||||
// ping-pong inside the autonomous action loop.
|
||||
if (!aiResponseText.trim() && !this.abortController?.signal.aborted && loopDepth === 0) {
|
||||
try {
|
||||
logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
|
||||
const fallback = await this.callNonStreaming({
|
||||
baseUrl: ollamaUrl,
|
||||
modelName: actualModel,
|
||||
engine,
|
||||
messages: messagesForRequest,
|
||||
temperature,
|
||||
signal: this.abortController?.signal,
|
||||
});
|
||||
if (fallback && fallback.trim()) {
|
||||
aiResponseText = fallback;
|
||||
logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.length });
|
||||
if (useLmStudioSdk && this.options.lmStudioStreamer?.resetHandle) {
|
||||
try {
|
||||
logInfo('Empty SDK stream — resetting LM Studio handle and retrying streaming once.', { model: actualModel });
|
||||
await this.options.lmStudioStreamer.resetHandle(actualModel);
|
||||
const retryStream = this.options.lmStudioStreamer.stream({
|
||||
modelName: actualModel,
|
||||
messages: messagesForRequest.map((m) => ({ role: m.role, content: m.content })),
|
||||
temperature,
|
||||
signal: this.abortController.signal,
|
||||
});
|
||||
let retryText = '';
|
||||
for await (const { token } of retryStream) {
|
||||
if (this.isStaleRun(runId)) return;
|
||||
if (token) retryText += token;
|
||||
}
|
||||
if (retryText.trim()) {
|
||||
aiResponseText = retryText;
|
||||
logInfo('Handle-reset retry recovered the answer.', { model: actualModel, length: retryText.length });
|
||||
}
|
||||
} catch (retryErr: any) {
|
||||
logError('Handle-reset retry failed.', { model: actualModel, error: retryErr?.message ?? String(retryErr) });
|
||||
}
|
||||
}
|
||||
|
||||
if (!aiResponseText.trim() && !this.abortController?.signal.aborted) {
|
||||
try {
|
||||
logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
|
||||
const fallback = await this.callNonStreaming({
|
||||
baseUrl: ollamaUrl,
|
||||
modelName: actualModel,
|
||||
engine,
|
||||
messages: messagesForRequest,
|
||||
temperature,
|
||||
signal: this.abortController?.signal,
|
||||
});
|
||||
if (fallback && fallback.trim()) {
|
||||
aiResponseText = fallback;
|
||||
logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.length });
|
||||
}
|
||||
} catch (recoverErr: any) {
|
||||
logError('Non-streaming fallback also failed.', {
|
||||
engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
|
||||
});
|
||||
}
|
||||
} catch (recoverErr: any) {
|
||||
logError('Non-streaming fallback also failed.', {
|
||||
engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -668,6 +703,20 @@ export class AgentExecutor {
|
||||
promptCharCount, messageCount: messagesForRequest.length,
|
||||
fallbackTried: loopDepth === 0 ? 'yes' : 'no',
|
||||
});
|
||||
// Cheap heuristic: parse a parameter-count hint out of the
|
||||
// model identifier (e.g. "google/gemma-4-e2b", "qwen2-1.5b").
|
||||
// Anything <= 3B is small enough that long-context generation
|
||||
// commonly fails by emitting EOS as the first token even though
|
||||
// the server log shows prompt-eval succeeded with truncated=0.
|
||||
const smallModelMatch = actualModel.match(/(?<![0-9.])((?:[0-9]+\.)?[0-9]+)\s*[bB](?![a-zA-Z0-9])|[-_/]e?([0-9]+)b\b/i);
|
||||
const paramB = smallModelMatch
|
||||
? Number(smallModelMatch[1] ?? smallModelMatch[2])
|
||||
: Number.NaN;
|
||||
const looksSmall = Number.isFinite(paramB) && paramB <= 3;
|
||||
const promptIsLarge = promptCharCount > 60000; // ~15k tokens of English/code
|
||||
const contextLimitHint =
|
||||
'LM Studio 로그에 `n_tokens = N, truncated = 0` 인데 `eval time` 이 0ms 라면 모델이 첫 토큰부터 EOS 를 뱉은 것입니다. 보통 컨텍스트 한계 초과 또는 모델 용량 부족입니다. 더 큰 모델(7B+)로 교체하거나 컨텍스트를 줄여 보세요.';
|
||||
|
||||
this.webview.postMessage({
|
||||
type: 'error',
|
||||
value: [
|
||||
@@ -682,6 +731,7 @@ export class AgentExecutor {
|
||||
? ' • 프롬프트가 너무 큽니다 (16k chars 초과). Skill/Brain 컨텍스트를 좁혀 보세요.'
|
||||
: ' • 다른 모델로 전환하거나 LM Studio 서버를 재시작',
|
||||
' • Settings에서 maxContextSize 또는 memoryLongTermFiles 줄이기',
|
||||
...(looksSmall || promptIsLarge ? [' • ' + contextLimitHint] : []),
|
||||
].join('\n')
|
||||
});
|
||||
return;
|
||||
|
||||
+18
-3
@@ -7,8 +7,14 @@ export interface ILMStudioClient {
|
||||
listLoaded(): Promise<string[]>;
|
||||
/** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */
|
||||
listLoadedCached(ttlMs?: number): Promise<string[]>;
|
||||
/** Resolve a chat-ready handle for an already-loaded (or just-loaded) model. */
|
||||
getModelHandle(modelKey: string): Promise<LLM>;
|
||||
/**
|
||||
* Resolve a chat-ready handle for an already-loaded (or just-loaded) model.
|
||||
*
|
||||
* `options.refresh: true` drops the SDK + WebSocket so any disposed handle
|
||||
* sitting in the SDK's internal handle map is discarded. Use this after a
|
||||
* "Model is disposed!" or "lock() request could not be registered" error.
|
||||
*/
|
||||
getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM>;
|
||||
isReachable(): Promise<boolean>;
|
||||
setBaseUrl(httpBaseUrl: string): void;
|
||||
}
|
||||
@@ -111,8 +117,17 @@ export class LMStudioClient implements ILMStudioClient {
|
||||
}
|
||||
}
|
||||
|
||||
async getModelHandle(modelKey: string): Promise<LLM> {
|
||||
async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM> {
|
||||
try {
|
||||
if (options?.refresh) {
|
||||
// Recreate the SDK + WebSocket so the SDK's internal handle
|
||||
// cache is dropped. The next llm.model() call mints a fresh
|
||||
// handle instead of returning the disposed one from the
|
||||
// previous (aborted) prediction.
|
||||
this._sdk = undefined;
|
||||
this._loadedCache = undefined;
|
||||
logInfo('LM Studio SDK handle refresh requested — dropped cached SDK client.', { modelKey });
|
||||
}
|
||||
return await this.getSdk().llm.model(modelKey);
|
||||
} catch (e: any) {
|
||||
const msg = e?.message ?? String(e);
|
||||
|
||||
+80
-31
@@ -18,6 +18,12 @@ export interface ChatStreamRequest {
|
||||
/**
 * Contract for a component that streams chat completions token by token.
 */
export interface IChatStreamer {
  /**
   * Run one LM Studio chat completion over the WebSocket SDK and expose the
   * output incrementally.
   *
   * @param req - The chat request to execute.
   * @returns An async iterable producing one `{ token }` object per streamed piece of text.
   */
  stream(req: ChatStreamRequest): AsyncIterable<{ token: string }>;

  /**
   * Discard the handle the SDK has cached for `modelName`.
   *
   * Intended for the case where the preceding stream finished with zero
   * tokens and raised no error: that is the signature of a handle that was
   * silently disposed and needs a fresh WebSocket round-trip.
   *
   * Optional — implementations may omit it, so callers must check for its
   * presence before invoking it.
   *
   * @param modelName - Identifier of the model whose cached handle should be dropped.
   */
  resetHandle?(modelName: string): Promise<void>;
}
|
||||
|
||||
/**
|
||||
@@ -39,41 +45,84 @@ export class LMStudioStreamer implements IChatStreamer {
|
||||
throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.');
|
||||
}
|
||||
|
||||
const model = await this.client.getModelHandle(trimmedModel);
|
||||
logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length });
|
||||
// One automatic retry path: when the first attempt blows up with a
|
||||
// "Model is disposed!" / "lock() request could not be registered"
|
||||
// error before any tokens have been yielded, we drop the cached SDK
|
||||
// handle and try once more. These errors are caused by a previous
|
||||
// aborted prediction leaving the SDK's internal handle map pointing
|
||||
// at a dead WebSocket binding — a fresh client.model() lookup minted
|
||||
// from a recreated SDK fixes it. We only retry when zero tokens have
|
||||
// streamed: if the consumer already saw partial output, restarting
|
||||
// would duplicate tokens.
|
||||
for (let attempt = 1; attempt <= 2; attempt++) {
|
||||
const refresh = attempt > 1;
|
||||
const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined);
|
||||
logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt });
|
||||
|
||||
const prediction = (model as any).respond(req.messages, {
|
||||
temperature: req.temperature,
|
||||
maxTokens: req.maxTokens ?? 4096,
|
||||
signal: req.signal,
|
||||
});
|
||||
const prediction = (model as any).respond(req.messages, {
|
||||
temperature: req.temperature,
|
||||
maxTokens: req.maxTokens ?? 4096,
|
||||
signal: req.signal,
|
||||
});
|
||||
|
||||
// Bridge AbortSignal → prediction.cancel(): without this, an aborted
|
||||
// request keeps generating on the LM Studio server. The orphaned
|
||||
// prediction holds locks on the model handle, which is a known cause
|
||||
// of "lock() request could not be registered" on the very next
|
||||
// request — the reused handle is still bound to a dead prediction.
|
||||
const onAbort = () => {
|
||||
try { (prediction as any)?.cancel?.(); } catch { /* swallow — best effort */ }
|
||||
};
|
||||
if (req.signal) {
|
||||
if (req.signal.aborted) onAbort();
|
||||
else req.signal.addEventListener('abort', onAbort, { once: true });
|
||||
}
|
||||
|
||||
try {
|
||||
for await (const fragment of prediction as AsyncIterable<{ content: string }>) {
|
||||
if (req.signal?.aborted) return;
|
||||
const token = fragment?.content ?? '';
|
||||
if (token) yield { token };
|
||||
// Bridge AbortSignal → prediction.cancel(): without this, an
|
||||
// aborted request keeps generating on the LM Studio server. The
|
||||
// orphaned prediction holds locks on the model handle, which is
|
||||
// a known cause of "lock() request could not be registered" on
|
||||
// the very next request — the reused handle is still bound to a
|
||||
// dead prediction.
|
||||
const onAbort = () => {
|
||||
try { (prediction as any)?.cancel?.(); } catch { /* swallow — best effort */ }
|
||||
};
|
||||
if (req.signal) {
|
||||
if (req.signal.aborted) onAbort();
|
||||
else req.signal.addEventListener('abort', onAbort, { once: true });
|
||||
}
|
||||
|
||||
let yielded = 0;
|
||||
let caught: any = null;
|
||||
try {
|
||||
for await (const fragment of prediction as AsyncIterable<{ content: string }>) {
|
||||
if (req.signal?.aborted) return;
|
||||
const token = fragment?.content ?? '';
|
||||
if (token) {
|
||||
yielded++;
|
||||
yield { token };
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
if (req.signal?.aborted) return;
|
||||
if (err?.name === 'AbortError') return;
|
||||
caught = err;
|
||||
} finally {
|
||||
req.signal?.removeEventListener?.('abort', onAbort);
|
||||
}
|
||||
|
||||
if (!caught) return;
|
||||
|
||||
const errMsg = String(caught?.message ?? caught);
|
||||
const handleDead = /\bdisposed\b/i.test(errMsg)
|
||||
|| /lock\(\) request could not be registered/i.test(errMsg);
|
||||
|
||||
if (handleDead && yielded === 0 && attempt === 1) {
|
||||
logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg });
|
||||
continue;
|
||||
}
|
||||
|
||||
logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: errMsg, attempt });
|
||||
throw caught;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Best-effort reset of the LM Studio handle for `modelName`.
 *
 * Asks the client for the model handle with `refresh: true`, which makes the
 * client drop its cached SDK instance before resolving a new handle. The
 * returned handle is intentionally discarded: the purpose is only to force
 * the refresh so that the next `stream()` call starts from a clean client.
 *
 * This method never throws. A blank or whitespace-only `modelName` is a
 * no-op, and a failed refresh is logged and swallowed.
 *
 * @param modelName - Identifier of the model whose handle should be reset.
 */
async resetHandle(modelName: string): Promise<void> {
  const trimmed = (modelName || '').trim();
  if (!trimmed) return;

  try {
    await this.client.getModelHandle(trimmed, { refresh: true });
  } catch (err: any) {
    // Best effort — caller will see the next stream() attempt fail
    // with a normal error path if the refresh itself was broken.
    // Nothing is rethrown here, and there is no abort listener to clean up:
    // this method takes no request and registers no signal handlers.
    logError('LM Studio handle reset failed.', { model: trimmed, error: err?.message ?? String(err) });
  }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user