diff --git a/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json b/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json index e907708..ea784bd 100644 --- a/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json +++ b/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json @@ -1,5 +1,5 @@ { "result": "Final report with inconsistencies. This should be long enough to pass validation.", - "createdAt": 1778472017521, + "createdAt": 1778473059932, "modelVersion": "unknown" } \ No newline at end of file diff --git a/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json b/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json index c650970..e2e93c5 100644 --- a/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json +++ b/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json @@ -1,5 +1,5 @@ { "result": "[CONFLICT WARNING] 성능이 200% 증가했습니다. vs 그러나 동시에 50% 감소했습니다. 최적화와 성능 저하가 동시에 발견됨.", - "createdAt": 1778472017513, + "createdAt": 1778473059931, "modelVersion": "unknown" } \ No newline at end of file diff --git a/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json b/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json index 7cfd235..9295d2f 100644 --- a/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json +++ b/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json @@ -1,5 +1,5 @@ { "result": "Detailed Execution Plan: 1. Research 2. Analyze 3. Write report with high quality.", - "createdAt": 1778472017510, + "createdAt": 1778473059930, "modelVersion": "unknown" } \ No newline at end of file diff --git a/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json b/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json index 0851eef..2d67301 100644 --- a/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json +++ b/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json @@ -1,5 +1,5 @@ { - "result": "---\nid: stress_conflict_1778472017495\ndate: 2026-05-11T04:00:17.524Z\ntype: knowledge_artifact\nstandard: P-Reinforce v3.0\ntags: [automated, connect_ai, brain_sync]\n---\n\n## 📌 Brief Summary\nFinal report with inconsistencies. This should be long enough to pass validation.\n\nFinal report with inconsistencies. This should be long enough to pass validation.\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\nFinal report with inconsistencies. This should be long enough to pass validation.\n---\n## 🛡️ Reliability & Audit Summary\n> [!NOTE]\n> 이 문서는 ConnectAI의 **Intelligent Resilience** 엔진에 의해 검증 및 정제되었습니다.\n\n| Metric | Value | Status |\n| :--- | :--- | :--- |\n| **Conflict Risk** | `60/100` | ⚠️ Medium |\n| **Fallbacks Used** | `0` | ✅ None |\n| **Auto Retries** | `0` | ✅ Stable |\n| **Deduplication** | `0` | Standard |\n| **Processing Time** | `0.0s` | ✅ Fast |\n\n### 🔍 Decision Audit Trail\n- **[PLANNER]** 전략 수립 중... (11ms)\n- **[RESEARCHER]** 핵심 정보 수집 및 분석 중... (4ms)\n- **[WRITER]** 최종 리포트 작성 및 편집 중... (7ms)\n", - "createdAt": 1778472017524, + "result": "---\nid: stress_conflict_1778473059918\ndate: 2026-05-11T04:17:39.932Z\ntype: knowledge_artifact\nstandard: P-Reinforce v3.0\ntags: [automated, connect_ai, brain_sync]\n---\n\n## 📌 Brief Summary\nFinal report with inconsistencies. This should be long enough to pass validation.\n\nFinal report with inconsistencies. This should be long enough to pass validation.\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\nFinal report with inconsistencies. This should be long enough to pass validation.\n---\n## 🛡️ Reliability & Audit Summary\n> [!NOTE]\n> 이 문서는 ConnectAI의 **Intelligent Resilience** 엔진에 의해 검증 및 정제되었습니다.\n\n| Metric | Value | Status |\n| :--- | :--- | :--- |\n| **Conflict Risk** | `60/100` | ⚠️ Medium |\n| **Fallbacks Used** | `0` | ✅ None |\n| **Auto Retries** | `0` | ✅ Stable |\n| **Deduplication** | `0` | Standard |\n| **Processing Time** | `0.0s` | ✅ Fast |\n\n### 🔍 Decision Audit Trail\n- **[PLANNER]** 전략 수립 중... (11ms)\n- **[RESEARCHER]** 핵심 정보 수집 및 분석 중... (2ms)\n- **[WRITER]** 최종 리포트 작성 및 편집 중... (1ms)\n", + "createdAt": 1778473059932, "modelVersion": "unknown" } \ No newline at end of file diff --git a/.astra/tests/stress/.astra/missions/stress_conflict_1778472017495.json b/.astra/tests/stress/.astra/missions/stress_conflict_1778473059918.json similarity index 80% rename from .astra/tests/stress/.astra/missions/stress_conflict_1778472017495.json rename to .astra/tests/stress/.astra/missions/stress_conflict_1778473059918.json index 344c6b1..18a1a99 100644 --- a/.astra/tests/stress/.astra/missions/stress_conflict_1778472017495.json +++ b/.astra/tests/stress/.astra/missions/stress_conflict_1778473059918.json @@ -1,8 +1,8 @@ { - "missionId": "stress_conflict_1778472017495", + "missionId": "stress_conflict_1778473059918", "status": "completed", - "startTime": "2026-05-11T04:00:17.495Z", - "totalElapsedMs": 30, + "startTime": "2026-05-11T04:17:39.918Z", + "totalElapsedMs": 14, "results": { "planner": "Detailed Execution Plan: 1. Research 2. Analyze 3. Write report with high quality.", "researcher": "[CONFLICT WARNING] 성능이 200% 증가했습니다. vs 그러나 동시에 50% 감소했습니다. 최적화와 성능 저하가 동시에 발견됨.", @@ -18,28 +18,28 @@ "to": "planner", "durationMs": 11, "message": "전략 수립 중...", - "ts": "2026-05-11T04:00:17.506Z" + "ts": "2026-05-11T04:17:39.929Z" }, { "from": "planner", "to": "researcher", - "durationMs": 4, + "durationMs": 2, "message": "핵심 정보 수집 및 분석 중...", - "ts": "2026-05-11T04:00:17.510Z" + "ts": "2026-05-11T04:17:39.931Z" }, { "from": "researcher", "to": "writer", - "durationMs": 7, + "durationMs": 1, "message": "최종 리포트 작성 및 편집 중...", - "ts": "2026-05-11T04:00:17.517Z" + "ts": "2026-05-11T04:17:39.932Z" }, { "from": "writer", "to": "completed", - "durationMs": 8, + "durationMs": 0, "message": "미션 완료", - "ts": "2026-05-11T04:00:17.525Z" + "ts": "2026-05-11T04:17:39.932Z" } ], "resilienceMetrics": { diff --git a/PATCHNOTES.md b/PATCHNOTES.md index 284efc4..5a775cc 100644 --- a/PATCHNOTES.md +++ b/PATCHNOTES.md @@ -1,5 +1,14 @@ # Astra Patch Notes +## v2.80.32 (2026-05-11) +### 🛡️ LM Studio SDK Resilience & Auto-Recovery +- **LM Studio SDK 안정성 강화:** 모델 핸들이 "disposed" 상태로 방치되어 발생하는 스트리밍 실패를 감지하고, SDK 클라이언트를 자동으로 재생성(`resetHandle`)하여 복구하는 로직을 도입했습니다. +- **2단계 스트리밍 복구 프로세스:** 빈 응답 감지 시 (1) 핸들 초기화 후 스트리밍 재시도, (2) 실패 시 비스트리밍(POST) 폴백을 수행하는 다단계 복구 체계를 구축했습니다. +- **지능형 진단 가이드:** 소형 모델(3B 이하) 및 대규모 컨텍스트 상황에서 첫 토큰부터 EOS를 뱉는 현상에 대한 구체적인 해결 가이드(LM Studio 로그 분석 기반)를 에러 메시지에 추가했습니다. +- **신규 패키징:** `astra-2.80.32.vsix` 패키지를 생성하고 고부하 스트리밍 복구 시나리오에 대한 검증을 완료했습니다. + +--- + ## v2.80.31 (2026-05-11) ### 🧠 Logic Optimization & Skill Integration - **에이전트 응답 로직 간소화:** `src/agent.ts` 내의 복잡한 하드코딩 폴백 및 리뷰 판단 로직을 제거하여 모델 기반의 유연한 응답 생성을 강화했습니다. diff --git a/package.json b/package.json index 8a47ced..b289a34 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "astra", "displayName": "Astra", "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.", - "version": "2.80.31", + "version": "2.80.32", "publisher": "g1nation", "license": "MIT", "icon": "assets/icon.png", diff --git a/src/agent.ts b/src/agent.ts index 22d6d70..8663911 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -591,33 +591,68 @@ export class AgentExecutor { // ── Empty-response auto-recovery ── // Streaming failed silently (network blip, model cold-start, context - // overflow, etc.). Before surfacing the error to the user, try one - // non-streaming retry: many LM Studio failures are streaming-only - // (the SSE channel drops mid-token while a single POST returns the - // whole answer fine). This covers the most common "empty response" - // pattern users hit without the user having to click anything. + // overflow, etc.). Before surfacing the error to the user we try two + // recovery steps in order: + // + // (1) When the empty stream came from the LM Studio SDK path, drop + // the cached handle and retry streaming once. The SDK keeps a + // per-model handle in its internal map; an aborted prediction + // can leave that handle disposed so the next respond() returns + // zero tokens cleanly (no error thrown, stream just ends). + // A fresh WebSocket / handle lookup recovers from this without + // us having to ask the user to retry. + // + // (2) Fall back to a single non-streaming POST. Many LM Studio + // failures are streaming-only (the SSE channel drops mid-token + // while one POST returns the whole answer fine). // // Only attempts recovery on loopDepth === 0 — we don't want to // ping-pong inside the autonomous action loop. if (!aiResponseText.trim() && !this.abortController?.signal.aborted && loopDepth === 0) { - try { - logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl }); - const fallback = await this.callNonStreaming({ - baseUrl: ollamaUrl, - modelName: actualModel, - engine, - messages: messagesForRequest, - temperature, - signal: this.abortController?.signal, - }); - if (fallback && fallback.trim()) { - aiResponseText = fallback; - logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.length }); + if (useLmStudioSdk && this.options.lmStudioStreamer?.resetHandle) { + try { + logInfo('Empty SDK stream — resetting LM Studio handle and retrying streaming once.', { model: actualModel }); + await this.options.lmStudioStreamer.resetHandle(actualModel); + const retryStream = this.options.lmStudioStreamer.stream({ + modelName: actualModel, + messages: messagesForRequest.map((m) => ({ role: m.role, content: m.content })), + temperature, + signal: this.abortController.signal, + }); + let retryText = ''; + for await (const { token } of retryStream) { + if (this.isStaleRun(runId)) return; + if (token) retryText += token; + } + if (retryText.trim()) { + aiResponseText = retryText; + logInfo('Handle-reset retry recovered the answer.', { model: actualModel, length: retryText.length }); + } + } catch (retryErr: any) { + logError('Handle-reset retry failed.', { model: actualModel, error: retryErr?.message ?? String(retryErr) }); + } + } + + if (!aiResponseText.trim() && !this.abortController?.signal.aborted) { + try { + logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl }); + const fallback = await this.callNonStreaming({ + baseUrl: ollamaUrl, + modelName: actualModel, + engine, + messages: messagesForRequest, + temperature, + signal: this.abortController?.signal, + }); + if (fallback && fallback.trim()) { + aiResponseText = fallback; + logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.length }); + } + } catch (recoverErr: any) { + logError('Non-streaming fallback also failed.', { + engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr), + }); } - } catch (recoverErr: any) { - logError('Non-streaming fallback also failed.', { - engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr), - }); } } @@ -668,6 +703,20 @@ export class AgentExecutor { promptCharCount, messageCount: messagesForRequest.length, fallbackTried: loopDepth === 0 ? 'yes' : 'no', }); + // Cheap heuristic: parse a parameter-count hint out of the + // model identifier (e.g. "google/gemma-4-e2b", "qwen2-1.5b"). + // Anything <= 3B is small enough that long-context generation + // commonly fails by emitting EOS as the first token even though + // the server log shows prompt-eval succeeded with truncated=0. + const smallModelMatch = actualModel.match(/(? 60000; // ~15k tokens of English/code + const contextLimitHint = + 'LM Studio 로그에 `n_tokens = N, truncated = 0` 인데 `eval time` 이 0ms 라면 모델이 첫 토큰부터 EOS 를 뱉은 것입니다. 보통 컨텍스트 한계 초과 또는 모델 용량 부족입니다. 더 큰 모델(7B+)로 교체하거나 컨텍스트를 줄여 보세요.'; + this.webview.postMessage({ type: 'error', value: [ @@ -682,6 +731,7 @@ export class AgentExecutor { ? ' • 프롬프트가 너무 큽니다 (16k chars 초과). Skill/Brain 컨텍스트를 좁혀 보세요.' : ' • 다른 모델로 전환하거나 LM Studio 서버를 재시작', ' • Settings에서 maxContextSize 또는 memoryLongTermFiles 줄이기', + ...(looksSmall || promptIsLarge ? [' • ' + contextLimitHint] : []), ].join('\n') }); return; diff --git a/src/lmstudio/client.ts b/src/lmstudio/client.ts index 45e576a..17bfc24 100644 --- a/src/lmstudio/client.ts +++ b/src/lmstudio/client.ts @@ -7,8 +7,14 @@ export interface ILMStudioClient { listLoaded(): Promise; /** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */ listLoadedCached(ttlMs?: number): Promise; - /** Resolve a chat-ready handle for an already-loaded (or just-loaded) model. */ - getModelHandle(modelKey: string): Promise; + /** + * Resolve a chat-ready handle for an already-loaded (or just-loaded) model. + * + * `options.refresh: true` drops the SDK + WebSocket so any disposed handle + * sitting in the SDK's internal handle map is discarded. Use this after a + * "Model is disposed!" or "lock() request could not be registered" error. + */ + getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise; isReachable(): Promise; setBaseUrl(httpBaseUrl: string): void; } @@ -111,8 +117,17 @@ export class LMStudioClient implements ILMStudioClient { } } - async getModelHandle(modelKey: string): Promise { + async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise { try { + if (options?.refresh) { + // Recreate the SDK + WebSocket so the SDK's internal handle + // cache is dropped. The next llm.model() call mints a fresh + // handle instead of returning the disposed one from the + // previous (aborted) prediction. + this._sdk = undefined; + this._loadedCache = undefined; + logInfo('LM Studio SDK handle refresh requested — dropped cached SDK client.', { modelKey }); + } return await this.getSdk().llm.model(modelKey); } catch (e: any) { const msg = e?.message ?? String(e); diff --git a/src/lmstudio/streamer.ts b/src/lmstudio/streamer.ts index 88592db..a8d4978 100644 --- a/src/lmstudio/streamer.ts +++ b/src/lmstudio/streamer.ts @@ -18,6 +18,12 @@ export interface ChatStreamRequest { export interface IChatStreamer { /** Token-level streaming for an LM Studio chat completion via the WebSocket SDK. */ stream(req: ChatStreamRequest): AsyncIterable<{ token: string }>; + /** + * Drop the SDK's cached handle for `modelName`. Callers invoke this when + * the previous stream returned zero tokens with no error — a symptom of a + * silently-disposed handle that needs a fresh WebSocket round-trip. + */ + resetHandle?(modelName: string): Promise; } /** @@ -39,41 +45,84 @@ export class LMStudioStreamer implements IChatStreamer { throw new LMStudioLifecycleError('LMStudioStreamer.stream called without a model name.'); } - const model = await this.client.getModelHandle(trimmedModel); - logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length }); + // One automatic retry path: when the first attempt blows up with a + // "Model is disposed!" / "lock() request could not be registered" + // error before any tokens have been yielded, we drop the cached SDK + // handle and try once more. These errors are caused by a previous + // aborted prediction leaving the SDK's internal handle map pointing + // at a dead WebSocket binding — a fresh client.model() lookup minted + // from a recreated SDK fixes it. We only retry when zero tokens have + // streamed: if the consumer already saw partial output, restarting + // would duplicate tokens. + for (let attempt = 1; attempt <= 2; attempt++) { + const refresh = attempt > 1; + const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined); + logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt }); - const prediction = (model as any).respond(req.messages, { - temperature: req.temperature, - maxTokens: req.maxTokens ?? 4096, - signal: req.signal, - }); + const prediction = (model as any).respond(req.messages, { + temperature: req.temperature, + maxTokens: req.maxTokens ?? 4096, + signal: req.signal, + }); - // Bridge AbortSignal → prediction.cancel(): without this, an aborted - // request keeps generating on the LM Studio server. The orphaned - // prediction holds locks on the model handle, which is a known cause - // of "lock() request could not be registered" on the very next - // request — the reused handle is still bound to a dead prediction. - const onAbort = () => { - try { (prediction as any)?.cancel?.(); } catch { /* swallow — best effort */ } - }; - if (req.signal) { - if (req.signal.aborted) onAbort(); - else req.signal.addEventListener('abort', onAbort, { once: true }); - } - - try { - for await (const fragment of prediction as AsyncIterable<{ content: string }>) { - if (req.signal?.aborted) return; - const token = fragment?.content ?? ''; - if (token) yield { token }; + // Bridge AbortSignal → prediction.cancel(): without this, an + // aborted request keeps generating on the LM Studio server. The + // orphaned prediction holds locks on the model handle, which is + // a known cause of "lock() request could not be registered" on + // the very next request — the reused handle is still bound to a + // dead prediction. + const onAbort = () => { + try { (prediction as any)?.cancel?.(); } catch { /* swallow — best effort */ } + }; + if (req.signal) { + if (req.signal.aborted) onAbort(); + else req.signal.addEventListener('abort', onAbort, { once: true }); } + + let yielded = 0; + let caught: any = null; + try { + for await (const fragment of prediction as AsyncIterable<{ content: string }>) { + if (req.signal?.aborted) return; + const token = fragment?.content ?? ''; + if (token) { + yielded++; + yield { token }; + } + } + } catch (err: any) { + if (req.signal?.aborted) return; + if (err?.name === 'AbortError') return; + caught = err; + } finally { + req.signal?.removeEventListener?.('abort', onAbort); + } + + if (!caught) return; + + const errMsg = String(caught?.message ?? caught); + const handleDead = /\bdisposed\b/i.test(errMsg) + || /lock\(\) request could not be registered/i.test(errMsg); + + if (handleDead && yielded === 0 && attempt === 1) { + logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg }); + continue; + } + + logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: errMsg, attempt }); + throw caught; + } + } + + async resetHandle(modelName: string): Promise { + const trimmed = (modelName || '').trim(); + if (!trimmed) return; + try { + await this.client.getModelHandle(trimmed, { refresh: true }); } catch (err: any) { - if (req.signal?.aborted) return; - if (err?.name === 'AbortError') return; - logError('LM Studio SDK chat stream failed.', { model: trimmedModel, error: err?.message ?? String(err) }); - throw err; - } finally { - req.signal?.removeEventListener?.('abort', onAbort); + // Best effort — caller will see the next stream() attempt fail + // with a normal error path if the refresh itself was broken. + logError('LM Studio handle reset failed.', { model: trimmed, error: err?.message ?? String(err) }); } } }