From 6c4bc3494f139a49778ea37b838e2f1f72c37dda Mon Sep 17 00:00:00 2001
From: g1nation <g1nation@users.noreply.github.com>
Date: Tue, 12 May 2026 23:55:00 +0900
Subject: [PATCH] chore: version up to 2.80.37 and package with response
 recovery

---
 ...d46d2ca2057b05c488be1dcf439166ac5a9a1.json |   2 +-
 ...9f4f39d2bc368f77456c37b5eef9a94a66b5c.json |   2 +-
 ...5c7a44d7661af673b24e3f49551a7a2e50280.json |   2 +-
 ...adc543795e4b427b64540a49c9ab27c7fe213.json |   4 +-
 ...son => stress_conflict_1778597639274.json} |  18 +-
 PATCHNOTES.md                                 |   9 +
 package.json                                  |  19 +-
 src/agent.ts                                  | 100 ++++++++-
 src/config.ts                                 |  12 +-
 src/core/responseRecovery.ts                  | 193 ++++++++++++++++++
 src/lib/contextManager.ts                     |   8 +-
 tests/responseRecovery.test.ts                | 118 +++++++++++
 12 files changed, 466 insertions(+), 21 deletions(-)
 rename .astra/tests/stress/.astra/missions/{stress_conflict_1778596848186.json => stress_conflict_1778597639274.json} (81%)
 create mode 100644 src/core/responseRecovery.ts
 create mode 100644 tests/responseRecovery.test.ts

diff --git a/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json b/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json
index e18df33..9382dd9 100644
--- a/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json
+++ b/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json
@@ -1,5 +1,5 @@
 {
   "result": "Final report with inconsistencies. This should be long enough to pass validation.",
-  "createdAt": 1778596848199,
+  "createdAt": 1778597639298,
   "modelVersion": "unknown"
 }
\ No newline at end of file
diff --git a/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json b/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json
index 8d6529a..412413e 100644
--- a/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json
+++ b/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json
@@ -1,5 +1,5 @@
 {
   "result": "[CONFLICT WARNING] 성능이 200% 증가했습니다. vs 그러나 동시에 50% 감소했습니다. 최적화와 성능 저하가 동시에 발견됨.",
-  "createdAt": 1778596848198,
+  "createdAt": 1778597639290,
   "modelVersion": "unknown"
 }
\ No newline at end of file
diff --git a/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json b/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json
index f43a6fe..5e801f7 100644
--- a/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json
+++ b/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json
@@ -1,5 +1,5 @@
 {
   "result": "Detailed Execution Plan: 1. Research 2. Analyze 3. Write report with high quality.",
-  "createdAt": 1778596848197,
+  "createdAt": 1778597639286,
   "modelVersion": "unknown"
 }
\ No newline at end of file
diff --git a/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json b/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json
index b014709..6dc943e 100644
--- a/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json
+++ b/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json
@@ -1,5 +1,5 @@
 {
-  "result": "---\nid: stress_conflict_1778596848186\ndate: 2026-05-12T14:40:48.199Z\ntype: knowledge_artifact\nstandard: P-Reinforce v3.0\ntags: [automated, connect_ai, brain_sync]\n---\n\n## 📌 Brief Summary\nFinal report with inconsistencies. This should be long enough to pass validation.\n\nFinal report with inconsistencies. This should be long enough to pass validation.\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\nFinal report with inconsistencies. This should be long enough to pass validation.\n---\n## 🛡️ Reliability & Audit Summary\n> [!NOTE]\n> 이 문서는 ConnectAI의 **Intelligent Resilience** 엔진에 의해 검증 및 정제되었습니다.\n\n| Metric | Value | Status |\n| :--- | :--- | :--- |\n| **Conflict Risk** | `60/100` | ⚠️ Medium |\n| **Fallbacks Used** | `0` | ✅ None |\n| **Auto Retries** | `0` | ✅ Stable |\n| **Deduplication** | `0` | Standard |\n| **Processing Time** | `0.0s` | ✅ Fast |\n\n### 🔍 Decision Audit Trail\n- **[PLANNER]** 전략 수립 중... (11ms)\n- **[RESEARCHER]** 핵심 정보 수집 및 분석 중... (1ms)\n- **[WRITER]** 최종 리포트 작성 및 편집 중... (1ms)\n",
-  "createdAt": 1778596848199,
+  "result": "---\nid: stress_conflict_1778597639274\ndate: 2026-05-12T14:53:59.302Z\ntype: knowledge_artifact\nstandard: P-Reinforce v3.0\ntags: [automated, connect_ai, brain_sync]\n---\n\n## 📌 Brief Summary\nFinal report with inconsistencies. This should be long enough to pass validation.\n\nFinal report with inconsistencies. This should be long enough to pass validation.\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\nFinal report with inconsistencies. This should be long enough to pass validation.\n---\n## 🛡️ Reliability & Audit Summary\n> [!NOTE]\n> 이 문서는 ConnectAI의 **Intelligent Resilience** 엔진에 의해 검증 및 정제되었습니다.\n\n| Metric | Value | Status |\n| :--- | :--- | :--- |\n| **Conflict Risk** | `60/100` | ⚠️ Medium |\n| **Fallbacks Used** | `0` | ✅ None |\n| **Auto Retries** | `0` | ✅ Stable |\n| **Deduplication** | `0` | Standard |\n| **Processing Time** | `0.0s` | ✅ Fast |\n\n### 🔍 Decision Audit Trail\n- **[PLANNER]** 전략 수립 중... (11ms)\n- **[RESEARCHER]** 핵심 정보 수집 및 분석 중... (1ms)\n- **[WRITER]** 최종 리포트 작성 및 편집 중... (8ms)\n",
+  "createdAt": 1778597639302,
   "modelVersion": "unknown"
 }
\ No newline at end of file
diff --git a/.astra/tests/stress/.astra/missions/stress_conflict_1778596848186.json b/.astra/tests/stress/.astra/missions/stress_conflict_1778597639274.json
similarity index 81%
rename from .astra/tests/stress/.astra/missions/stress_conflict_1778596848186.json
rename to .astra/tests/stress/.astra/missions/stress_conflict_1778597639274.json
index 427dba9..5110b18 100644
--- a/.astra/tests/stress/.astra/missions/stress_conflict_1778596848186.json
+++ b/.astra/tests/stress/.astra/missions/stress_conflict_1778597639274.json
@@ -1,8 +1,8 @@
 {
-  "missionId": "stress_conflict_1778596848186",
+  "missionId": "stress_conflict_1778597639274",
   "status": "completed",
-  "startTime": "2026-05-12T14:40:48.186Z",
-  "totalElapsedMs": 13,
+  "startTime": "2026-05-12T14:53:59.274Z",
+  "totalElapsedMs": 28,
   "results": {
     "planner": "Detailed Execution Plan: 1. Research 2. Analyze 3. Write report with high quality.",
     "researcher": "[CONFLICT WARNING] 성능이 200% 증가했습니다. vs 그러나 동시에 50% 감소했습니다. 최적화와 성능 저하가 동시에 발견됨.",
@@ -18,28 +18,28 @@
       "to": "planner",
       "durationMs": 11,
       "message": "전략 수립 중...",
-      "ts": "2026-05-12T14:40:48.197Z"
+      "ts": "2026-05-12T14:53:59.285Z"
     },
     {
       "from": "planner",
       "to": "researcher",
       "durationMs": 1,
       "message": "핵심 정보 수집 및 분석 중...",
-      "ts": "2026-05-12T14:40:48.198Z"
+      "ts": "2026-05-12T14:53:59.286Z"
     },
     {
       "from": "researcher",
       "to": "writer",
-      "durationMs": 1,
+      "durationMs": 8,
       "message": "최종 리포트 작성 및 편집 중...",
-      "ts": "2026-05-12T14:40:48.199Z"
+      "ts": "2026-05-12T14:53:59.294Z"
     },
     {
       "from": "writer",
       "to": "completed",
-      "durationMs": 0,
+      "durationMs": 8,
       "message": "미션 완료",
-      "ts": "2026-05-12T14:40:48.199Z"
+      "ts": "2026-05-12T14:53:59.302Z"
     }
   ],
   "resilienceMetrics": {
diff --git a/PATCHNOTES.md b/PATCHNOTES.md
index bcbb38c..b9bd1a0 100644
--- a/PATCHNOTES.md
+++ b/PATCHNOTES.md
@@ -1,5 +1,14 @@
 # Astra Patch Notes
 
+## v2.80.37 (2026-05-12)
+### 🛡️ Response Recovery & Stability Overhaul
+- **응답 복구 메커니즘 도입:** `responseRecovery.ts` 및 관련 테스트 코드를 추가했습니다. 모델이 숨은 추론(`<think>`, Harmony 채널, "Thinking Process:" 등)만 출력하면 최종 답변만 다시 요청하고, 답변이 출력 토큰 한계에서 잘리면 내부적으로 이어 써서 하나의 답변으로 합쳐 보여 줍니다.
+- **잘림 안내 문구 정비:** `contextManager.ts`의 출력 한계(output-limit) 안내 문구를 자동 이어쓰기 동작에 맞게 수정하여, 더 이상 사용자가 "이어서 작성해줘"를 직접 입력하도록 요구하지 않습니다.
+- **에이전트 실행 흐름 통합:** `agent.ts`에 숨은 추론 제거 · 최종 답변 전용 재시도 · 자동 이어쓰기를 연결하고, `config.ts`에 `autoContinueOnOutputLimit` / `maxAutoContinuations` / `finalOnlyRetryOnThoughtLeak` 설정을 추가했습니다.
+- **신규 패키징:** `astra-2.80.37.vsix` 패키지를 생성하여 불확실한 AI 응답 환경에서도 신뢰할 수 있는 실행 환경을 통합했습니다.
+
+---
+
 ## v2.80.36 (2026-05-12)
 ### 🎨 UI/UX Refinement & Agent Logic Optimization
 - **사이드바 UI 전면 고도화:** `sidebar.html`, `sidebar.js`, `sidebar.css`를 갱신하여 더 매끄러운 애니메이션과 직관적인 컴포넌트 인터랙션을 구현했습니다.
diff --git a/package.json b/package.json
index 86eb290..51060a0 100644
--- a/package.json
+++ b/package.json
@@ -2,7 +2,7 @@
   "name": "astra",
   "displayName": "Astra",
   "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
-  "version": "2.80.36",
+  "version": "2.80.37",
   "publisher": "g1nation",
   "license": "MIT",
   "icon": "assets/icon.png",
@@ -229,6 +229,23 @@
           "minimum": 0,
           "description": "When a small model (≤4B parameters, detected from the model name) is selected, budget the prompt against this smaller effective context window instead of g1nation.contextLength — small models often emit an empty/EOS response on prompts that nominally fit but exceed their real capability. Set 0 to disable. Default: 8192"
         },
+        "g1nation.autoContinueOnOutputLimit": {
+          "type": "boolean",
+          "default": true,
+          "description": "When a reply is cut off because it hit the output-token limit, Astra continues it internally (compressed request — original question + the answer so far, not the whole context again) and shows one merged answer, instead of asking you to say \"이어서 작성해줘\". Default: true"
+        },
+        "g1nation.maxAutoContinuations": {
+          "type": "number",
+          "default": 3,
+          "minimum": 0,
+          "maximum": 10,
+          "description": "Maximum number of automatic continuation rounds per reply (prevents runaway loops). Set 0 to disable auto-continuation. Default: 3"
+        },
+        "g1nation.finalOnlyRetryOnThoughtLeak": {
+          "type": "boolean",
+          "default": true,
+          "description": "If the model emits only hidden reasoning (<think>, <|channel|>thought, \"Thinking Process:\" …) and no user-visible answer, Astra silently re-asks it for the final answer only. Hidden reasoning is never shown either way. Default: true"
+        },
         "g1nation.lmStudio.idleTimeoutMs": {
           "type": "number",
           "default": 300000,
diff --git a/src/agent.ts b/src/agent.ts
index f4d8f3d..abab8e4 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -41,6 +41,15 @@ import { MemoryManager } from './memory';
 import { RetrievalOrchestrator } from './retrieval';
 import { buildLessonChecklistBlock, isQaRegressionFeedback, findUnaddressedChecklistItems } from './retrieval/lessonHelpers';
 import { resolveScopeForAgent } from './skills/agentKnowledgeMap';
+import {
+    extractVisibleFinal,
+    shouldFinalOnlyRetry,
+    shouldAutoContinue,
+    mergeContinuationParts,
+    buildContinuationUserPrompt,
+    FINAL_ONLY_DIRECTIVE,
+    CONTINUATION_SYSTEM_PROMPT,
+} from './core/responseRecovery';
 import {
     estimateTokens,
     estimateMessagesTokens,
@@ -846,11 +855,95 @@ export class AgentExecutor {
                 }
             }
 
+            // ── Thought Quarantine + Final-only Retry + Auto-Continuation ──
+            // The user is waiting for an answer, not for a chance to manage the generation engine:
+            //   (a) hidden reasoning (Harmony channels, <think>…, "Thinking Process:") never reaches
+            //       the screen — stripped here, and from what executeActions / chatHistory see;
+            //   (b) if the model emitted *only* reasoning → silently retry, final-answer-only;
+            //   (c) if the answer was cut off at the output ceiling → continue it internally with a
+            //       *compressed* request (original question + the answer so far), up to N rounds.
+            let cleaned = extractVisibleFinal(aiResponseText);
+            if (cleaned.hadHiddenReasoning) {
+                logInfo('Stripped hidden reasoning from the model output.', {
+                    model: actualModel, hiddenChars: cleaned.hiddenReasoning.length,
+                    visibleChars: cleaned.visible.length, hadFinalChannel: cleaned.hadFinalChannel,
+                    thoughtOnly: cleaned.wasThoughtOnly,
+                });
+            }
+
+            // (b) Final-only retry — the reply was reasoning-only, no visible answer.
+            if (shouldFinalOnlyRetry(cleaned)
+                && config.finalOnlyRetryOnThoughtLeak
+                && loopDepth === 0
+                && !this.abortController?.signal.aborted) {
+                try {
+                    this.webview.postMessage({ type: 'autoContinue', value: '답변을 정리하는 중입니다...' });
+                    const retryMsgs: ChatMessage[] = messagesForRequest.map((m, i) =>
+                        i === 0 ? { ...m, content: `${m.content}\n${FINAL_ONLY_DIRECTIVE}` } : m);
+                    const r = await this.callNonStreaming({
+                        baseUrl: ollamaUrl, modelName: actualModel, engine, messages: retryMsgs,
+                        temperature, maxTokens: maxOutputTokens, contextLength: ctxLimits.contextLength,
+                        signal: this.abortController?.signal,
+                    });
+                    if (r.stopReason) finishStopReason = r.stopReason;
+                    const rc = extractVisibleFinal(r.text);
+                    if (rc.visible.trim()) {
+                        logInfo('Final-only retry recovered a visible answer.', { model: actualModel, length: rc.visible.length });
+                        aiResponseText = r.text;
+                        cleaned = rc;
+                    }
+                } catch (e: any) {
+                    logError('Final-only retry failed.', { model: actualModel, error: e?.message ?? String(e) });
+                }
+            }
+
+            // (c) Auto-continuation — the visible answer hit the output-token ceiling.
+            let continuationCount = 0;
+            if (config.autoContinueOnOutputLimit && config.maxAutoContinuations > 0 && loopDepth === 0) {
+                const originalUserPrompt = prompt || (this.chatHistory.find(m => m.role === 'user' && typeof m.content === 'string')?.content as string) || '';
+                let lastOutputTokens = estimateTokens(cleaned.visible);
+                while (
+                    shouldAutoContinue(classifyStopReason(finishStopReason), cleaned.visible, lastOutputTokens, maxOutputTokens)
+                    && continuationCount < config.maxAutoContinuations
+                    && !this.abortController?.signal.aborted
+                    && !this.isStaleRun(runId)
+                ) {
+                    continuationCount++;
+                    this.webview.postMessage({ type: 'autoContinue', value: `답변이 길어 이어서 정리하는 중입니다... (${continuationCount}/${config.maxAutoContinuations})` });
+                    try {
+                        const contMsgs: ChatMessage[] = [
+                            { role: 'system', content: CONTINUATION_SYSTEM_PROMPT, internal: true },
+                            { role: 'user', content: buildContinuationUserPrompt(originalUserPrompt, cleaned.visible) },
+                        ];
+                        const contMax = computeOutputBudget(estimateMessagesTokens(contMsgs), ctxLimits).maxOutputTokens;
+                        const cr = await this.callNonStreaming({
+                            baseUrl: ollamaUrl, modelName: actualModel, engine, messages: contMsgs,
+                            temperature, maxTokens: contMax, contextLength: ctxLimits.contextLength,
+                            signal: this.abortController?.signal,
+                        });
+                        finishStopReason = cr.stopReason;
+                        const ccl = extractVisibleFinal(cr.text);
+                        if (!ccl.visible.trim()) {
+                            logInfo('Continuation produced no visible text — stopping.', { model: actualModel, round: continuationCount });
+                            break;
+                        }
+                        cleaned = { ...cleaned, visible: mergeContinuationParts(cleaned.visible, ccl.visible), wasThoughtOnly: false };
+                        lastOutputTokens = estimateTokens(ccl.visible);
+                        logInfo('Auto-continued the answer.', { model: actualModel, round: continuationCount, addedChars: ccl.visible.length, totalChars: cleaned.visible.length, contStopReason: cr.stopReason });
+                    } catch (e: any) {
+                        logError('Auto-continuation failed.', { model: actualModel, round: continuationCount, error: e?.message ?? String(e) });
+                        break;
+                    }
+                }
+                if (this.isStaleRun(runId)) return;
+            }
+            const cleanedVisible = cleaned.visible;
+
             // 5. Execute Actions
-            const rationale = this.parseRationale(aiResponseText);
+            const rationale = this.parseRationale(cleanedVisible);
             let assistantContent = this.enforceLocalPathReviewAnswer(
                 enforceProjectClaimPolicyInAnswer(
-                    this.sanitizeAssistantContent(aiResponseText),
+                    this.sanitizeAssistantContent(cleanedVisible),
                     secondBrainTrace
                 ),
                 localPathContext
@@ -900,7 +993,8 @@ export class AgentExecutor {
             this.emitHistoryChanged();
 
             this.statusBarManager.updateStatus(AgentStatus.Executing);
-            const report = await this.executeActions(aiResponseText, rootPath, activeBrain);
+            // Action tags are honored only from the visible final answer — never from hidden reasoning.
+            const report = await this.executeActions(cleanedVisible, rootPath, activeBrain);
             if (!assistantContent.trim() && report.length === 0) {
                 const promptCharCount = messagesForRequest.reduce((sum, m) => sum + (m.content?.length ?? 0), 0);
                 logError('Model returned an empty response without actions.', {
diff --git a/src/config.ts b/src/config.ts
index df96e76..a801736 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -38,6 +38,13 @@ export interface IAgentConfig {
     autoCompactHistory: boolean;
     /** 작은 모델(≤4B) 감지 시 예산 계산에 쓸 유효 context window 상한. 0 = 비활성화. */
     smallModelContextCap: number;
+    // ─── 응답 복구 (Thought Quarantine / Auto-Continuation) ───
+    /** 답변이 출력 토큰 한계에 걸리면 사용자 개입 없이 내부적으로 이어서 생성. */
+    autoContinueOnOutputLimit: boolean;
+    /** 자동 이어쓰기 최대 횟수 (무한 반복 방지). 0 = 비활성화. */
+    maxAutoContinuations: number;
+    /** 모델이 내부 사고만 출력하고 답변이 없으면 "최종 답변만" 지시로 1회 재생성. */
+    finalOnlyRetryOnThoughtLeak: boolean;
 }
 
 // ─── 경로 정규화 유틸리티 ───
@@ -115,7 +122,10 @@ export function getConfig(): IAgentConfig {
             return v === 'truncateMiddle' || v === 'rollingWindow' ? v : 'stopAtLimit';
         })(),
         autoCompactHistory: cfg.get<boolean>('autoCompactHistory', true),
-        smallModelContextCap: Math.max(0, cfg.get<number>('smallModelContextCap', 8192))
+        smallModelContextCap: Math.max(0, cfg.get<number>('smallModelContextCap', 8192)),
+        autoContinueOnOutputLimit: cfg.get<boolean>('autoContinueOnOutputLimit', true),
+        maxAutoContinuations: Math.max(0, Math.min(10, cfg.get<number>('maxAutoContinuations', 3))),
+        finalOnlyRetryOnThoughtLeak: cfg.get<boolean>('finalOnlyRetryOnThoughtLeak', true)
     };
 }
 
diff --git a/src/core/responseRecovery.ts b/src/core/responseRecovery.ts
new file mode 100644
index 0000000..af2ebd4
--- /dev/null
+++ b/src/core/responseRecovery.ts
@@ -0,0 +1,193 @@
+/**
+ * ============================================================
+ * Response Recovery — Thought Quarantine + Final-only Retry + Auto-Continuation
+ *
+ * The user already asked their question; they're waiting for an answer, not for a chance to
+ * babysit the generation engine. So:
+ *   - Hidden reasoning (Harmony `<|channel|>thought/analysis`, `<think>…</think>`, leading
+ *     "Thinking Process:" blocks — closed *or* unclosed) never reaches the screen.
+ *   - If the model emitted only hidden reasoning and no visible answer → retry, final-answer-only.
+ *   - If the answer was cut off at the output-token limit → continue it internally (compressed
+ *     request — original question + the visible answer so far, not the whole context/RAG again),
+ *     up to N times, then show one merged answer.
+ *
+ * This module is pure (no vscode / fs). `AgentExecutor` orchestrates the retries/continuations.
+ * ============================================================
+ */
+
+import { estimateTokens, type GenerationStopKind } from '../lib/contextManager';
+
+export interface CleanedAssistantOutput {
+    raw: string; // The model output as received (null/undefined coerced to '' by extractVisibleFinal).
+    /** User-facing final answer with hidden reasoning removed. */
+    visible: string;
+    /** The stripped reasoning fragments, joined with a `---` separator — for logs only, never shown to the user. */
+    hiddenReasoning: string;
+    hadHiddenReasoning: boolean; // True when at least one non-empty reasoning fragment was stripped.
+    /** The model emitted an explicit Harmony `final` channel. */
+    hadFinalChannel: boolean;
+    /** Raw had content, but it was *all* hidden reasoning — nothing to show → caller should retry. */
+    wasThoughtOnly: boolean;
+}
+
+const HIDDEN_CHANNEL_NAMES = '(?:thought|analysis|analyze|commentary|reasoning|reason|critic|reflection|plan|planning)'; // regex source fragment (non-capturing alternation) interpolated into the hidden-channel RegExp
+// Matches a bare chain-of-thought header line at the very start of the text (English or Korean). The colon is required so a legit "## Thinking Process" section heading is not removed.
+const LEADING_THOUGHT_HEADER_RE =
+    /^\s*(?:thinking\s*process|thought\s*process|chain[- ]of[- ]thought|reasoning\s*steps?|내부\s*사고|사고\s*과정|생각\s*과정|추론\s*과정)\s*[:：]\s*(?:\r?\n|$)/i;
+
+/** Remove Harmony / gpt-oss control tokens (e.g. `<|channel|>analysis`, `<|start|>assistant`, `<|message|>`, `<|end|>`) and tidy the leftover whitespace. */
+function dropControlTokens(s: string): string {
+    // Tags that carry a name after the closing bracket: `<|channel|>NAME`, `<|start|>NAME`.
+    const withoutNamedTags = s.replace(/<\|?(?:channel|start)\|?>\s*[A-Za-z_]*/gi, '');
+    // Any remaining fully-piped token: `<|message|>`, `<|end|>`, `<|return|>`, `<|assistant|>`, …
+    const withoutPipedTags = withoutNamedTags.replace(/<\|[^>]{0,40}\|>/g, '');
+    // Single-pipe or pipe-less spellings of the name-less tokens.
+    const withoutLooseTags = withoutPipedTags.replace(/<\|?(?:end|return|message)\|?>/gi, '');
+    // Collapse runs of three or more newlines left behind by the removals, then trim both ends.
+    const collapsed = withoutLooseTags.replace(/\n{3,}/g, '\n\n');
+    return collapsed.trim();
+}
+
+/**
+ * Split the raw model output into the visible final answer and the (discarded) hidden reasoning.
+ * Unclosed hidden channels / `<think>`-style blocks (token budget ran out mid-thought) are hidden to EOS.
+ * @param raw Raw model text; `null` / `undefined` is treated as an empty string.
+ * @returns Cleaned output; `wasThoughtOnly` is true when reasoning was stripped and no visible text remains.
+ */
+export function extractVisibleFinal(raw: string): CleanedAssistantOutput {
+    const text = raw == null ? '' : String(raw); // defensive: tolerate null/undefined despite the declared type
+    const out: CleanedAssistantOutput = {
+        raw: text, visible: text.trim(), hiddenReasoning: '',
+        hadHiddenReasoning: false, hadFinalChannel: false, wasThoughtOnly: false,
+    };
+    if (!out.visible) { out.visible = ''; return out; } // empty or whitespace-only input: nothing to strip
+
+    const hidden: string[] = []; // stripped reasoning fragments, in removal order
+    const capture = (m: string): string => { const t = (m || '').trim(); if (t) hidden.push(t); return ''; }; // replace() callback: remember the match, delete it
+
+    let s = text;
+
+    // (A) If a Harmony `final` channel exists, the answer is what follows the LAST `final` marker,
+    //     up to the next control token or EOS. Everything before it is reasoning.
+    const finalMatches = [...s.matchAll(/<\|?channel\|?>\s*final\b\s*(?:<\|?message\|?>)?/gi)];
+    if (finalMatches.length > 0) {
+        out.hadFinalChannel = true;
+        const fm = finalMatches[finalMatches.length - 1];
+        const start = (fm.index ?? 0) + fm[0].length; // offset just past the `final` marker
+        const before = dropControlTokens(s.slice(0, fm.index ?? 0)); // everything ahead of the marker, control tokens removed
+        if (before) { hidden.push(before); out.hadHiddenReasoning = true; }
+        const after = s.slice(start);
+        const cut = after.search(/<\|?(?:channel|start|end|return)\|?>/i); // the next control token, if any, ends the answer
+        s = cut >= 0 ? after.slice(0, cut) : after;
+    } else {
+        // (B) No final channel. Strip hidden channels up to the next `channel`/`start` token, or to EOS when
+        //     unclosed. NOTE(review): a hidden channel followed only by `<|end|>` + plain text is cut to EOS — confirm intended.
+        s = s.replace(
+            new RegExp(`<\\|?channel\\|?>\\s*${HIDDEN_CHANNEL_NAMES}\\b[\\s\\S]*?(?=<\\|?(?:channel|start)\\|?>|$)`, 'gi'),
+            capture
+        );
+        // <think>/<thinking>/<analysis>/<reasoning>/<scratchpad>/<reflection> blocks — closed first, then unclosed-to-EOS.
+        s = s.replace(/<(think(?:ing)?|analysis|reasoning|scratchpad|reflection)>[\s\S]*?<\/\1>/gi, capture);
+        s = s.replace(/<(?:think(?:ing)?|analysis|reasoning|scratchpad|reflection)>[\s\S]*$/gi, capture);
+        // (C) Leading bare "Thinking Process:" block — only when it's at the very top. Cut up to the
+        //     first plausible answer boundary (a heading, a "## 요약"-style line, "---", "답변:" …);
+        //     if there's no such boundary, the whole thing was reasoning.
+        const lead = s.match(LEADING_THOUGHT_HEADER_RE);
+        if (lead && (lead.index ?? 0) === 0) {
+            const rest = s.slice(lead[0].length); // text after the header line
+            const boundary = rest.search(
+                /\n(?:#{1,6}\s|\*\*[^*\n]{1,40}\*\*\s*[:：]|---\s*\r?\n|##?\s*(?:요약|결론|답변|정리|제안)|답변\s*[:：]|결론\s*[:：]|최종\s*답변|🔎|✅)/
+            );
+            if (boundary >= 0) {
+                hidden.push((lead[0] + rest.slice(0, boundary)).trim());
+                s = rest.slice(boundary + 1); // +1 skips the newline the boundary pattern starts with
+            } else {
+                hidden.push(s.trim());
+                s = '';
+            }
+        }
+    }
+
+    s = dropControlTokens(s);
+    // Drop a now-leading bare marker line that survived (e.g. "Thinking Process:" with content already gone).
+    s = s.replace(LEADING_THOUGHT_HEADER_RE, '').trim();
+
+    out.visible = s;
+    out.hiddenReasoning = hidden.filter(Boolean).join('\n\n---\n\n');
+    out.hadHiddenReasoning = out.hadHiddenReasoning || hidden.some((p) => p && p.trim());
+    out.wasThoughtOnly = !out.visible && out.hadHiddenReasoning; // nothing visible AND something was stripped
+    return out;
+}
+
+/** Should we silently re-ask the model for a final answer only? True exactly when the last reply was all reasoning (`cleaned.wasThoughtOnly`). */
+export function shouldFinalOnlyRetry(cleaned: CleanedAssistantOutput): boolean {
+    return cleaned.wasThoughtOnly; // mirrors the flag computed by extractVisibleFinal
+}
+
+/**
+ * Decide whether a cut-off answer should be continued without asking the user. Requires an
+ * `output-limit` stop, at least 40 trimmed characters to continue from, and ≥ 80% of the budget used.
+ */
+export function shouldAutoContinue(
+    stopKind: GenerationStopKind,
+    visibleAnswer: string,
+    outputTokens: number,
+    maxOutputTokens: number
+): boolean {
+    const usable = stopKind === 'output-limit' && !!visibleAnswer && visibleAnswer.trim().length >= 40;
+    if (!usable) return false;
+    const budgetKnown = Number.isFinite(maxOutputTokens) && maxOutputTokens > 0; // unknown budget → trust the stop reason
+    return !budgetKnown || outputTokens >= Math.floor(maxOutputTokens * 0.8);
+}
+
+/** Directive appended to the prompt for a final-only retry — the previous reply was reasoning-only. NOTE(review): the reply language is hard-coded to Korean — confirm this is intended for every user. */
+export const FINAL_ONLY_DIRECTIVE = [
+    '', // leading empty entry → the joined text starts with a newline
+    '[FINAL ANSWER ONLY]',
+    'Your previous reply contained only hidden reasoning (thought / analysis / channel markers) and no user-visible answer.',
+    'Reply again with the FINAL ANSWER only — directly answer the user, in Korean.',
+    'Do NOT include: <think>, <analysis>, <|channel|> markers, "Thinking Process:", planning notes, or any hidden reasoning.',
+].join('\n');
+
+/** A short, self-contained system prompt for a continuation request (we deliberately drop the big context). NOTE(review): the reply language is hard-coded to Korean — confirm intended. */
+export const CONTINUATION_SYSTEM_PROMPT = [
+    'You are continuing a user-visible final answer that was cut off mid-way because it hit the output limit.',
+    'Output the FINAL ANSWER continuation only — in Korean. Do NOT repeat what was already written.',
+    'Do NOT include <think>, <analysis>, <|channel|> markers, "Thinking Process:", or any hidden reasoning.',
+    'Use the same assumptions and context as the answer so far; do not restart.',
+].join('\n'); // one instruction per line
+
+/**
+ * Build the continuation user message: the original question plus the last `tailChars` characters of
+ * the answer so far (`…`-prefixed when cut). `tailChars` ≤ 0 keeps no tail; surrogate pairs stay intact.
+ */
+export function buildContinuationUserPrompt(originalUserPrompt: string, visibleSoFar: string, tailChars = 1400): string {
+    const soFar = visibleSoFar || '';
+    // `slice(-0)` would return the whole string, so compute the start index explicitly.
+    let from = Math.max(0, soFar.length - Math.max(0, Math.floor(tailChars)));
+    if (from > 0 && from < soFar.length && /[\uDC00-\uDFFF]/.test(soFar.charAt(from))) from++; // never start on a lone low surrogate
+    const tail = from > 0 ? '…' + soFar.slice(from) : soFar;
+    const request = (originalUserPrompt || '').trim() || '(unavailable)';
+    const quoted = ['"""', tail.trim(), '"""'].join('\n');
+    const closing = 'Continue the answer from exactly where it stopped. Korean. Final answer only.';
+    return ['Original user request:', request, '', 'The answer so far (end of it — continue directly from here, do not repeat it):', quoted, '', closing].join('\n');
+}
+
+/**
+ * Join a continuation onto the previous visible answer, dropping a verbatim overlap (16–400 chars)
+ * that the model re-stated. Returns the right-trimmed `prev` as-is when nothing new remains.
+ */
+export function mergeContinuationParts(prev: string, next: string): string {
+    const a = (prev || '').replace(/\s+$/, '');
+    let b = (next || '').replace(/^\s+/, '');
+    if (!a) return b;
+    for (let len = Math.min(400, a.length, b.length); len >= 16; len--) { // longest overlap first
+        if (a.endsWith(b.slice(0, len))) { b = b.slice(len).replace(/^\s+/, ''); break; }
+    }
+    if (!b) return a; // empty, or a pure repeat of the tail: don't append a dangling separator
+    // Paragraph break after a finished sentence; plain splice when `a` stopped mid-sentence.
+    return /[.!?。！？\n)\]”"'`]\s*$/.test(a) ? a + '\n\n' + b : a + b;
+}
+
+/** Rough token count of a string — an alias of `estimateTokens` from `../lib/contextManager`, re-exported so callers don't need contextManager directly. */
+export const countTokens = estimateTokens;
diff --git a/src/lib/contextManager.ts b/src/lib/contextManager.ts
index adfca1b..f79e472 100644
--- a/src/lib/contextManager.ts
+++ b/src/lib/contextManager.ts
@@ -239,11 +239,15 @@ export function classifyStopReason(raw: string | null | undefined): GenerationSt
     return 'unknown';
 }
 
-/** 잘린 응답일 때 사용자에게 덧붙일 한 줄 안내. 정상 종료면 빈 문자열. */
+/**
+ * 잘린 응답일 때 사용자에게 덧붙일 한 줄 안내. 정상 종료면 빈 문자열.
+ * (output-limit 은 Astra 가 먼저 자동 이어쓰기를 시도하므로, 이 안내는 그래도 다 못 채웠을 때만 보입니다.
+ *  그래서 "이어서 작성해줘" 같은 사용자 액션을 요구하지 않습니다.)
+ */
 export function truncationNotice(kind: GenerationStopKind): string {
     switch (kind) {
         case 'output-limit':
-            return '\n\n> ⚠️ 답변이 출력 토큰 한계에 도달해 잘렸습니다. "이어서 작성해줘" 라고 요청하면 계속 생성합니다.';
+            return '\n\n> ⚠️ 답변이 길어 자동으로 이어 정리했지만 여전히 길이 한계에 닿았습니다. 더 좁은 주제로 나눠 질문하시면 완전한 답변을 받을 수 있어요.';
         case 'context-overflow':
             return '\n\n> ⚠️ 입력 컨텍스트가 모델의 context window 를 초과했습니다. 대화를 새로 시작하거나(`/newChat`) Settings 에서 `g1nation.contextLength` 를 모델 실제 값으로 맞추고, Brain/Skill 컨텍스트를 줄여보세요.';
         case 'error':
diff --git a/tests/responseRecovery.test.ts b/tests/responseRecovery.test.ts
new file mode 100644
index 0000000..0b6ddf8
--- /dev/null
+++ b/tests/responseRecovery.test.ts
@@ -0,0 +1,118 @@
+import {
+    extractVisibleFinal,
+    shouldFinalOnlyRetry,
+    shouldAutoContinue,
+    mergeContinuationParts,
+    buildContinuationUserPrompt,
+} from '../src/core/responseRecovery';
+
+// extractVisibleFinal: every case feeds one raw model output and asserts which text stays visible
+// and which flags (hadFinalChannel / hadHiddenReasoning / wasThoughtOnly) the result reports.
+describe('responseRecovery.extractVisibleFinal — thought quarantine', () => {
+    it('leaves a plain answer untouched', () => {
+        const { visible, hadHiddenReasoning, wasThoughtOnly } = extractVisibleFinal('안녕하세요! 무엇을 도와드릴까요?');
+        expect(visible).toBe('안녕하세요! 무엇을 도와드릴까요?');
+        expect(hadHiddenReasoning).toBe(false);
+        expect(wasThoughtOnly).toBe(false);
+    });
+
+    it('keeps only the Harmony `final` channel and discards analysis', () => {
+        const parsed = extractVisibleFinal('<|channel|>analysis<|message|>Let me think about this carefully...<|end|><|start|>assistant<|channel|>final<|message|>최종 답변입니다.');
+        expect(parsed.visible).toBe('최종 답변입니다.');
+        expect(parsed.hadFinalChannel).toBe(true);
+        expect(parsed.hadHiddenReasoning).toBe(true);
+        expect(parsed.hiddenReasoning).toContain('think about this');
+        expect(parsed.wasThoughtOnly).toBe(false);
+    });
+
+    it('strips an UNCLOSED thought channel (model ran out of tokens mid-thought) → thought-only', () => {
+        const parsed = extractVisibleFinal('<|channel>thought\nThinking Process:\nLet me figure out how to approach this and');
+        expect(parsed.visible).toBe('');
+        expect(parsed.hadHiddenReasoning).toBe(true);
+        expect(parsed.wasThoughtOnly).toBe(true);
+        expect(shouldFinalOnlyRetry(parsed)).toBe(true);
+    });
+
+    it('strips a closed <think>…</think> block', () => {
+        const { visible, hadHiddenReasoning } = extractVisibleFinal('<think>reasoning here, multi\nline</think>\n\n실제 답변입니다.');
+        expect(visible).toBe('실제 답변입니다.');
+        expect(hadHiddenReasoning).toBe(true);
+    });
+
+    it('strips an unclosed <think> running to EOS → thought-only', () => {
+        const { visible, wasThoughtOnly } = extractVisibleFinal("<think>I'm thinking and then I run out of");
+        expect(visible).toBe('');
+        expect(wasThoughtOnly).toBe(true);
+    });
+
+    it('strips a leading "Thinking Process:" block up to the answer boundary', () => {
+        const { visible, hadHiddenReasoning } = extractVisibleFinal('Thinking Process:\nStep 1: consider X\nStep 2: consider Y\n## 요약\n실제 답변 본문입니다.');
+        expect(visible).toContain('## 요약');
+        expect(visible).toContain('실제 답변 본문');
+        expect(visible).not.toContain('Step 1');
+        expect(hadHiddenReasoning).toBe(true);
+    });
+
+    it('treats a leading "Thinking Process:" with no answer boundary as thought-only', () => {
+        const { visible, wasThoughtOnly } = extractVisibleFinal('Thinking Process:\nStep 1...\nStep 2... and I ran out of tokens here');
+        expect(visible).toBe('');
+        expect(wasThoughtOnly).toBe(true);
+    });
+
+    it('does NOT strip a legitimate "## Thinking Process" markdown heading (no colon)', () => {
+        const { visible, hadHiddenReasoning } = extractVisibleFinal('## Thinking Process\n여기서는 사고 과정 자체를 설명하는 답변입니다.');
+        expect(visible).toContain('## Thinking Process');
+        expect(visible).toContain('사고 과정 자체를 설명');
+        expect(hadHiddenReasoning).toBe(false);
+    });
+
+    it('handles empty / whitespace input', () => {
+        for (const blank of ['', '   \n  ', null as any]) {
+            expect(extractVisibleFinal(blank).visible).toBe('');
+        }
+        expect(extractVisibleFinal('').wasThoughtOnly).toBe(false);
+    });
+});
+
+describe('responseRecovery.shouldAutoContinue', () => {
+    it('continues only when output-limit AND a real visible answer AND near the cap', () => {
+        const answer = 'x'.repeat(200);
+        const otherStopKinds = ['complete', 'context-overflow', 'error'] as const;
+        expect(shouldAutoContinue('output-limit', answer, 3500, 4096)).toBe(true);
+        expect(shouldAutoContinue('output-limit', 'short', 4000, 4096)).toBe(false); // visible text is not a real answer
+        expect(shouldAutoContinue('output-limit', answer, 100, 4096)).toBe(false); // did not actually get near the cap
+        otherStopKinds.forEach((kind) => expect(shouldAutoContinue(kind, answer, 4000, 4096)).toBe(false));
+    });
+});
+
+describe('responseRecovery.mergeContinuationParts', () => {
+    it('handles empty inputs', () => {
+        const cases: Array<[string, string, string]> = [['', 'hello', 'hello'], ['hello', '', 'hello'], ['', '', '']];
+        cases.forEach(([prev, next, merged]) => expect(mergeContinuationParts(prev, next)).toBe(merged));
+    });
+    it('joins with a paragraph break when the previous part ended cleanly', () => {
+        const merged = mergeContinuationParts('첫 번째 부분.', '두 번째 부분.');
+        expect(merged).toBe('첫 번째 부분.\n\n두 번째 부분.');
+    });
+    it('removes a verbatim overlap the continuation re-stated, splicing mid-sentence', () => {
+        const firstPart = 'the answer continues here and here';
+        const continuation = 'continues here and here, then more'; // re-states the last 23 chars of firstPart
+        expect(mergeContinuationParts(firstPart, continuation)).toBe('the answer continues here and here, then more');
+    });
+});
+
+describe('responseRecovery.buildContinuationUserPrompt', () => {
+    it('includes the original question and the tail of the answer so far', () => {
+        const question = '원래 질문은 무엇인가?';
+        const prompt = buildContinuationUserPrompt(question, 'a'.repeat(50) + 'TAIL_MARKER');
+        expect(prompt).toContain(question);
+        expect(prompt).toContain('TAIL_MARKER');
+        expect(prompt).toMatch(/continue/i);
+    });
+    it('truncates a long answer-so-far to its tail', () => {
+        const prompt = buildContinuationUserPrompt('q', 'HEAD_MARKER' + 'b'.repeat(3000) + 'TAIL_MARKER', 1400);
+        expect(prompt).toContain('TAIL_MARKER');
+        expect(prompt).not.toContain('HEAD_MARKER');
+        expect(prompt).toContain('…');
+    });
+});