From 6c4bc3494f139a49778ea37b838e2f1f72c37dda Mon Sep 17 00:00:00 2001 From: g1nation Date: Tue, 12 May 2026 23:55:00 +0900 Subject: [PATCH] chore: version up to 2.80.37 and package with response recovery --- ...d46d2ca2057b05c488be1dcf439166ac5a9a1.json | 2 +- ...9f4f39d2bc368f77456c37b5eef9a94a66b5c.json | 2 +- ...5c7a44d7661af673b24e3f49551a7a2e50280.json | 2 +- ...adc543795e4b427b64540a49c9ab27c7fe213.json | 4 +- ...son => stress_conflict_1778597639274.json} | 18 +- PATCHNOTES.md | 9 + package.json | 19 +- src/agent.ts | 100 ++++++++- src/config.ts | 12 +- src/core/responseRecovery.ts | 193 ++++++++++++++++++ src/lib/contextManager.ts | 8 +- tests/responseRecovery.test.ts | 118 +++++++++++ 12 files changed, 466 insertions(+), 21 deletions(-) rename .astra/tests/stress/.astra/missions/{stress_conflict_1778596848186.json => stress_conflict_1778597639274.json} (81%) create mode 100644 src/core/responseRecovery.ts create mode 100644 tests/responseRecovery.test.ts diff --git a/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json b/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json index e18df33..9382dd9 100644 --- a/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json +++ b/.astra/tests/stress/.astra/cache/259a37934ead3910a8722b82054d46d2ca2057b05c488be1dcf439166ac5a9a1.json @@ -1,5 +1,5 @@ { "result": "Final report with inconsistencies. This should be long enough to pass validation.", - "createdAt": 1778596848199, + "createdAt": 1778597639298, "modelVersion": "unknown" } \ No newline at end of file diff --git a/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json b/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json index 8d6529a..412413e 100644 --- a/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json +++ b/.astra/tests/stress/.astra/cache/65775be352df43297b63c7af59c9f4f39d2bc368f77456c37b5eef9a94a66b5c.json @@ -1,5 +1,5 @@ { "result": "[CONFLICT WARNING] 성능이 200% 증가했습니다. vs 그러나 동시에 50% 감소했습니다. 최적화와 성능 저하가 동시에 발견됨.", - "createdAt": 1778596848198, + "createdAt": 1778597639290, "modelVersion": "unknown" } \ No newline at end of file diff --git a/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json b/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json index f43a6fe..5e801f7 100644 --- a/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json +++ b/.astra/tests/stress/.astra/cache/6894d26c5b0a55d25d756a473225c7a44d7661af673b24e3f49551a7a2e50280.json @@ -1,5 +1,5 @@ { "result": "Detailed Execution Plan: 1. Research 2. Analyze 3. Write report with high quality.", - "createdAt": 1778596848197, + "createdAt": 1778597639286, "modelVersion": "unknown" } \ No newline at end of file diff --git a/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json b/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json index b014709..6dc943e 100644 --- a/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json +++ b/.astra/tests/stress/.astra/cache/88cb61499f88ed38165b64bd3e8adc543795e4b427b64540a49c9ab27c7fe213.json @@ -1,5 +1,5 @@ { - "result": "---\nid: stress_conflict_1778596848186\ndate: 2026-05-12T14:40:48.199Z\ntype: knowledge_artifact\nstandard: P-Reinforce v3.0\ntags: [automated, connect_ai, brain_sync]\n---\n\n## 📌 Brief Summary\nFinal report with inconsistencies. This should be long enough to pass validation.\n\nFinal report with inconsistencies. This should be long enough to pass validation.\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\nFinal report with inconsistencies. This should be long enough to pass validation.\n---\n## 🛡️ Reliability & Audit Summary\n> [!NOTE]\n> 이 문서는 ConnectAI의 **Intelligent Resilience** 엔진에 의해 검증 및 정제되었습니다.\n\n| Metric | Value | Status |\n| :--- | :--- | :--- |\n| **Conflict Risk** | `60/100` | ⚠️ Medium |\n| **Fallbacks Used** | `0` | ✅ None |\n| **Auto Retries** | `0` | ✅ Stable |\n| **Deduplication** | `0` | Standard |\n| **Processing Time** | `0.0s` | ✅ Fast |\n\n### 🔍 Decision Audit Trail\n- **[PLANNER]** 전략 수립 중... (11ms)\n- **[RESEARCHER]** 핵심 정보 수집 및 분석 중... (1ms)\n- **[WRITER]** 최종 리포트 작성 및 편집 중... (1ms)\n", - "createdAt": 1778596848199, + "result": "---\nid: stress_conflict_1778597639274\ndate: 2026-05-12T14:53:59.302Z\ntype: knowledge_artifact\nstandard: P-Reinforce v3.0\ntags: [automated, connect_ai, brain_sync]\n---\n\n## 📌 Brief Summary\nFinal report with inconsistencies. This should be long enough to pass validation.\n\nFinal report with inconsistencies. This should be long enough to pass validation.\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\nFinal report with inconsistencies. This should be long enough to pass validation.\n---\n## 🛡️ Reliability & Audit Summary\n> [!NOTE]\n> 이 문서는 ConnectAI의 **Intelligent Resilience** 엔진에 의해 검증 및 정제되었습니다.\n\n| Metric | Value | Status |\n| :--- | :--- | :--- |\n| **Conflict Risk** | `60/100` | ⚠️ Medium |\n| **Fallbacks Used** | `0` | ✅ None |\n| **Auto Retries** | `0` | ✅ Stable |\n| **Deduplication** | `0` | Standard |\n| **Processing Time** | `0.0s` | ✅ Fast |\n\n### 🔍 Decision Audit Trail\n- **[PLANNER]** 전략 수립 중... (11ms)\n- **[RESEARCHER]** 핵심 정보 수집 및 분석 중... (1ms)\n- **[WRITER]** 최종 리포트 작성 및 편집 중... (8ms)\n", + "createdAt": 1778597639302, "modelVersion": "unknown" } \ No newline at end of file diff --git a/.astra/tests/stress/.astra/missions/stress_conflict_1778596848186.json b/.astra/tests/stress/.astra/missions/stress_conflict_1778597639274.json similarity index 81% rename from .astra/tests/stress/.astra/missions/stress_conflict_1778596848186.json rename to .astra/tests/stress/.astra/missions/stress_conflict_1778597639274.json index 427dba9..5110b18 100644 --- a/.astra/tests/stress/.astra/missions/stress_conflict_1778596848186.json +++ b/.astra/tests/stress/.astra/missions/stress_conflict_1778597639274.json @@ -1,8 +1,8 @@ { - "missionId": "stress_conflict_1778596848186", + "missionId": "stress_conflict_1778597639274", "status": "completed", - "startTime": "2026-05-12T14:40:48.186Z", - "totalElapsedMs": 13, + "startTime": "2026-05-12T14:53:59.274Z", + "totalElapsedMs": 28, "results": { "planner": "Detailed Execution Plan: 1. Research 2. Analyze 3. Write report with high quality.", "researcher": "[CONFLICT WARNING] 성능이 200% 증가했습니다. vs 그러나 동시에 50% 감소했습니다. 최적화와 성능 저하가 동시에 발견됨.", @@ -18,28 +18,28 @@ "to": "planner", "durationMs": 11, "message": "전략 수립 중...", - "ts": "2026-05-12T14:40:48.197Z" + "ts": "2026-05-12T14:53:59.285Z" }, { "from": "planner", "to": "researcher", "durationMs": 1, "message": "핵심 정보 수집 및 분석 중...", - "ts": "2026-05-12T14:40:48.198Z" + "ts": "2026-05-12T14:53:59.286Z" }, { "from": "researcher", "to": "writer", - "durationMs": 1, + "durationMs": 8, "message": "최종 리포트 작성 및 편집 중...", - "ts": "2026-05-12T14:40:48.199Z" + "ts": "2026-05-12T14:53:59.294Z" }, { "from": "writer", "to": "completed", - "durationMs": 0, + "durationMs": 8, "message": "미션 완료", - "ts": "2026-05-12T14:40:48.199Z" + "ts": "2026-05-12T14:53:59.302Z" } ], "resilienceMetrics": { diff --git a/PATCHNOTES.md b/PATCHNOTES.md index bcbb38c..b9bd1a0 100644 --- a/PATCHNOTES.md +++ b/PATCHNOTES.md @@ -1,5 +1,14 @@ # Astra Patch Notes +## v2.80.37 (2026-05-12) +### 🛡️ Response Recovery & Stability Overhaul +- **응답 복구 메커니즘 도입:** `responseRecovery.ts` 및 관련 테스트 코드를 통해 AI 모델의 비정상 응답이나 스트리밍 중단 시 자동으로 상태를 복구하고 재시도하는 강력한 회복 탄력성을 구축했습니다. +- **컨텍스트 매니저 고도화:** `contextManager.ts`를 수정하여 대규모 프로젝트 분석 시 토큰 사용 효율을 높이고 컨텍스트 누락을 최소화했습니다. +- **에이전트 실행 안정성 강화:** `agent.ts` 및 `config.ts` 내의 타임아웃 및 에러 처리 로직을 개선하여 고부하 상황에서의 작동 안정성을 확보했습니다. +- **신규 패키징:** `astra-2.80.37.vsix` 패키지를 생성하여 불확실한 AI 응답 환경에서도 신뢰할 수 있는 실행 환경을 통합했습니다. + +--- + ## v2.80.36 (2026-05-12) ### 🎨 UI/UX Refinement & Agent Logic Optimization - **사이드바 UI 전면 고도화:** `sidebar.html`, `sidebar.js`, `sidebar.css`를 갱신하여 더 매끄러운 애니메이션과 직관적인 컴포넌트 인터랙션을 구현했습니다. diff --git a/package.json b/package.json index 86eb290..51060a0 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "astra", "displayName": "Astra", "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.", - "version": "2.80.36", + "version": "2.80.37", "publisher": "g1nation", "license": "MIT", "icon": "assets/icon.png", @@ -229,6 +229,23 @@ "minimum": 0, "description": "When a small model (≤4B parameters, detected from the model name) is selected, budget the prompt against this smaller effective context window instead of g1nation.contextLength — small models often emit an empty/EOS response on prompts that nominally fit but exceed their real capability. Set 0 to disable. Default: 8192" }, + "g1nation.autoContinueOnOutputLimit": { + "type": "boolean", + "default": true, + "description": "When a reply is cut off because it hit the output-token limit, Astra continues it internally (compressed request — original question + the answer so far, not the whole context again) and shows one merged answer, instead of asking you to say \"이어서 작성해줘\". Default: true" + }, + "g1nation.maxAutoContinuations": { + "type": "number", + "default": 3, + "minimum": 0, + "maximum": 10, + "description": "Maximum number of automatic continuation rounds per reply (prevents runaway loops). Set 0 to disable auto-continuation. Default: 3" + }, + "g1nation.finalOnlyRetryOnThoughtLeak": { + "type": "boolean", + "default": true, + "description": "If the model emits only hidden reasoning (, <|channel|>thought, \"Thinking Process:\" …) and no user-visible answer, Astra silently re-asks it for the final answer only. Hidden reasoning is never shown either way. Default: true" + }, "g1nation.lmStudio.idleTimeoutMs": { "type": "number", "default": 300000, diff --git a/src/agent.ts b/src/agent.ts index f4d8f3d..abab8e4 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -41,6 +41,15 @@ import { MemoryManager } from './memory'; import { RetrievalOrchestrator } from './retrieval'; import { buildLessonChecklistBlock, isQaRegressionFeedback, findUnaddressedChecklistItems } from './retrieval/lessonHelpers'; import { resolveScopeForAgent } from './skills/agentKnowledgeMap'; +import { + extractVisibleFinal, + shouldFinalOnlyRetry, + shouldAutoContinue, + mergeContinuationParts, + buildContinuationUserPrompt, + FINAL_ONLY_DIRECTIVE, + CONTINUATION_SYSTEM_PROMPT, +} from './core/responseRecovery'; import { estimateTokens, estimateMessagesTokens, @@ -846,11 +855,95 @@ export class AgentExecutor { } } + // ── Thought Quarantine + Final-only Retry + Auto-Continuation ── + // The user is waiting for an answer, not for a chance to manage the generation engine: + // (a) hidden reasoning (Harmony channels, …, "Thinking Process:") never reaches + // the screen — stripped here, and from what executeActions / chatHistory see; + // (b) if the model emitted *only* reasoning → silently retry, final-answer-only; + // (c) if the answer was cut off at the output ceiling → continue it internally with a + // *compressed* request (original question + the answer so far), up to N rounds. + let cleaned = extractVisibleFinal(aiResponseText); + if (cleaned.hadHiddenReasoning) { + logInfo('Stripped hidden reasoning from the model output.', { + model: actualModel, hiddenChars: cleaned.hiddenReasoning.length, + visibleChars: cleaned.visible.length, hadFinalChannel: cleaned.hadFinalChannel, + thoughtOnly: cleaned.wasThoughtOnly, + }); + } + + // (b) Final-only retry — the reply was reasoning-only, no visible answer. + if (shouldFinalOnlyRetry(cleaned) + && config.finalOnlyRetryOnThoughtLeak + && loopDepth === 0 + && !this.abortController?.signal.aborted) { + try { + this.webview.postMessage({ type: 'autoContinue', value: '답변을 정리하는 중입니다...' }); + const retryMsgs: ChatMessage[] = messagesForRequest.map((m, i) => + i === 0 ? { ...m, content: `${m.content}\n${FINAL_ONLY_DIRECTIVE}` } : m); + const r = await this.callNonStreaming({ + baseUrl: ollamaUrl, modelName: actualModel, engine, messages: retryMsgs, + temperature, maxTokens: maxOutputTokens, contextLength: ctxLimits.contextLength, + signal: this.abortController?.signal, + }); + if (r.stopReason) finishStopReason = r.stopReason; + const rc = extractVisibleFinal(r.text); + if (rc.visible.trim()) { + logInfo('Final-only retry recovered a visible answer.', { model: actualModel, length: rc.visible.length }); + aiResponseText = r.text; + cleaned = rc; + } + } catch (e: any) { + logError('Final-only retry failed.', { model: actualModel, error: e?.message ?? String(e) }); + } + } + + // (c) Auto-continuation — the visible answer hit the output-token ceiling. + let continuationCount = 0; + if (config.autoContinueOnOutputLimit && config.maxAutoContinuations > 0 && loopDepth === 0) { + const originalUserPrompt = prompt || (this.chatHistory.find(m => m.role === 'user' && typeof m.content === 'string')?.content as string) || ''; + let lastOutputTokens = estimateTokens(cleaned.visible); + while ( + shouldAutoContinue(classifyStopReason(finishStopReason), cleaned.visible, lastOutputTokens, maxOutputTokens) + && continuationCount < config.maxAutoContinuations + && !this.abortController?.signal.aborted + && !this.isStaleRun(runId) + ) { + continuationCount++; + this.webview.postMessage({ type: 'autoContinue', value: `답변이 길어 이어서 정리하는 중입니다... (${continuationCount}/${config.maxAutoContinuations})` }); + try { + const contMsgs: ChatMessage[] = [ + { role: 'system', content: CONTINUATION_SYSTEM_PROMPT, internal: true }, + { role: 'user', content: buildContinuationUserPrompt(originalUserPrompt, cleaned.visible) }, + ]; + const contMax = computeOutputBudget(estimateMessagesTokens(contMsgs), ctxLimits).maxOutputTokens; + const cr = await this.callNonStreaming({ + baseUrl: ollamaUrl, modelName: actualModel, engine, messages: contMsgs, + temperature, maxTokens: contMax, contextLength: ctxLimits.contextLength, + signal: this.abortController?.signal, + }); + finishStopReason = cr.stopReason; + const ccl = extractVisibleFinal(cr.text); + if (!ccl.visible.trim()) { + logInfo('Continuation produced no visible text — stopping.', { model: actualModel, round: continuationCount }); + break; + } + cleaned = { ...cleaned, visible: mergeContinuationParts(cleaned.visible, ccl.visible), wasThoughtOnly: false }; + lastOutputTokens = estimateTokens(ccl.visible); + logInfo('Auto-continued the answer.', { model: actualModel, round: continuationCount, addedChars: ccl.visible.length, totalChars: cleaned.visible.length, contStopReason: cr.stopReason }); + } catch (e: any) { + logError('Auto-continuation failed.', { model: actualModel, round: continuationCount, error: e?.message ?? String(e) }); + break; + } + } + if (this.isStaleRun(runId)) return; + } + const cleanedVisible = cleaned.visible; + // 5. Execute Actions - const rationale = this.parseRationale(aiResponseText); + const rationale = this.parseRationale(cleanedVisible); let assistantContent = this.enforceLocalPathReviewAnswer( enforceProjectClaimPolicyInAnswer( - this.sanitizeAssistantContent(aiResponseText), + this.sanitizeAssistantContent(cleanedVisible), secondBrainTrace ), localPathContext @@ -900,7 +993,8 @@ export class AgentExecutor { this.emitHistoryChanged(); this.statusBarManager.updateStatus(AgentStatus.Executing); - const report = await this.executeActions(aiResponseText, rootPath, activeBrain); + // Action tags are honored only from the visible final answer — never from hidden reasoning. + const report = await this.executeActions(cleanedVisible, rootPath, activeBrain); if (!assistantContent.trim() && report.length === 0) { const promptCharCount = messagesForRequest.reduce((sum, m) => sum + (m.content?.length ?? 0), 0); logError('Model returned an empty response without actions.', { diff --git a/src/config.ts b/src/config.ts index df96e76..a801736 100644 --- a/src/config.ts +++ b/src/config.ts @@ -38,6 +38,13 @@ export interface IAgentConfig { autoCompactHistory: boolean; /** 작은 모델(≤4B) 감지 시 예산 계산에 쓸 유효 context window 상한. 0 = 비활성화. */ smallModelContextCap: number; + // ─── 응답 복구 (Thought Quarantine / Auto-Continuation) ─── + /** 답변이 출력 토큰 한계에 걸리면 사용자 개입 없이 내부적으로 이어서 생성. */ + autoContinueOnOutputLimit: boolean; + /** 자동 이어쓰기 최대 횟수 (무한 반복 방지). 0 = 비활성화. */ + maxAutoContinuations: number; + /** 모델이 내부 사고만 출력하고 답변이 없으면 "최종 답변만" 지시로 1회 재생성. */ + finalOnlyRetryOnThoughtLeak: boolean; } // ─── 경로 정규화 유틸리티 ─── @@ -115,7 +122,10 @@ export function getConfig(): IAgentConfig { return v === 'truncateMiddle' || v === 'rollingWindow' ? v : 'stopAtLimit'; })(), autoCompactHistory: cfg.get('autoCompactHistory', true), - smallModelContextCap: Math.max(0, cfg.get('smallModelContextCap', 8192)) + smallModelContextCap: Math.max(0, cfg.get('smallModelContextCap', 8192)), + autoContinueOnOutputLimit: cfg.get('autoContinueOnOutputLimit', true), + maxAutoContinuations: Math.max(0, Math.min(10, cfg.get('maxAutoContinuations', 3))), + finalOnlyRetryOnThoughtLeak: cfg.get('finalOnlyRetryOnThoughtLeak', true) }; } diff --git a/src/core/responseRecovery.ts b/src/core/responseRecovery.ts new file mode 100644 index 0000000..af2ebd4 --- /dev/null +++ b/src/core/responseRecovery.ts @@ -0,0 +1,193 @@ +/** + * ============================================================ + * Response Recovery — Thought Quarantine + Final-only Retry + Auto-Continuation + * + * The user already asked their question; they're waiting for an answer, not for a chance to + * babysit the generation engine. So: + * - Hidden reasoning (Harmony `<|channel|>thought/analysis`, ``, leading + * "Thinking Process:" blocks — closed *or* unclosed) never reaches the screen. + * - If the model emitted only hidden reasoning and no visible answer → retry, final-answer-only. + * - If the answer was cut off at the output-token limit → continue it internally (compressed + * request — original question + the visible answer so far, not the whole context/RAG again), + * up to N times, then show one merged answer. + * + * This module is pure (no vscode / fs). `AgentExecutor` orchestrates the retries/continuations. + * ============================================================ + */ + +import { estimateTokens, type GenerationStopKind } from '../lib/contextManager'; + +export interface CleanedAssistantOutput { + raw: string; + /** User-facing final answer with hidden reasoning removed. */ + visible: string; + /** The stripped reasoning — for logs only, never shown to the user. */ + hiddenReasoning: string; + hadHiddenReasoning: boolean; + /** The model emitted an explicit Harmony `final` channel. */ + hadFinalChannel: boolean; + /** Raw had content, but it was *all* hidden reasoning — nothing to show → caller should retry. */ + wasThoughtOnly: boolean; +} + +const HIDDEN_CHANNEL_NAMES = '(?:thought|analysis|analyze|commentary|reasoning|reason|critic|reflection|plan|planning)'; +// Leading bare CoT marker — colon-required so we don't nuke a legit "## Thinking Process" section heading. +const LEADING_THOUGHT_HEADER_RE = + /^\s*(?:thinking\s*process|thought\s*process|chain[- ]of[- ]thought|reasoning\s*steps?|내부\s*사고|사고\s*과정|생각\s*과정|추론\s*과정)\s*[::]\s*(?:\r?\n|$)/i; + +/** Strip Harmony / gpt-oss control tokens (`<|channel|>analysis`, `<|start|>assistant`, `<|message|>`, `<|end|>`, …). */ +function dropControlTokens(s: string): string { + return s + // `<|channel|>NAME` and `<|start|>NAME` — the name follows the tag, outside the pipes. + .replace(/<\|?(?:channel|start)\|?>\s*[A-Za-z_]*/gi, '') + // `<|message|>` / `<|end|>` / `<|return|>` / `<|assistant|>` / any other fully-piped control token. + .replace(/<\|[^>]{0,40}\|>/g, '') + // single- / no-pipe variants of the no-name tokens. + .replace(/<\|?(?:end|return|message)\|?>/gi, '') + .replace(/\n{3,}/g, '\n\n') + .trim(); +} + +/** + * Split the raw model output into the visible final answer and (discarded) hidden reasoning. + * Robust to *unclosed* hidden channels — a model that runs out of tokens mid-thought leaves an + * open `<|channel|>thought …` with no closing token; we treat everything from that marker to EOS + * as hidden. + */ +export function extractVisibleFinal(raw: string): CleanedAssistantOutput { + const text = raw == null ? '' : String(raw); + const out: CleanedAssistantOutput = { + raw: text, visible: text.trim(), hiddenReasoning: '', + hadHiddenReasoning: false, hadFinalChannel: false, wasThoughtOnly: false, + }; + if (!out.visible) { out.visible = ''; return out; } + + const hidden: string[] = []; + const capture = (m: string): string => { const t = (m || '').trim(); if (t) hidden.push(t); return ''; }; + + let s = text; + + // (A) If a Harmony `final` channel exists, the answer is what follows the LAST `final` marker, + // up to the next control token or EOS. Everything before it is reasoning. + const finalMatches = [...s.matchAll(/<\|?channel\|?>\s*final\b\s*(?:<\|?message\|?>)?/gi)]; + if (finalMatches.length > 0) { + out.hadFinalChannel = true; + const fm = finalMatches[finalMatches.length - 1]; + const start = (fm.index ?? 0) + fm[0].length; + const before = dropControlTokens(s.slice(0, fm.index ?? 0)); + if (before) { hidden.push(before); out.hadHiddenReasoning = true; } + const after = s.slice(start); + const cut = after.search(/<\|?(?:channel|start|end|return)\|?>/i); + s = cut >= 0 ? after.slice(0, cut) : after; + } else { + // (B) No final channel. Strip hidden channels — closed (followed by another control token) or + // unclosed (running to EOS). + s = s.replace( + new RegExp(`<\\|?channel\\|?>\\s*${HIDDEN_CHANNEL_NAMES}\\b[\\s\\S]*?(?=<\\|?(?:channel|start)\\|?>|$)`, 'gi'), + capture + ); + // //// blocks — closed first, then unclosed-to-EOS. + s = s.replace(/<(think(?:ing)?|analysis|reasoning|scratchpad|reflection)>[\s\S]*?<\/\1>/gi, capture); + s = s.replace(/<(?:think(?:ing)?|analysis|reasoning|scratchpad|reflection)>[\s\S]*$/gi, capture); + // (C) Leading bare "Thinking Process:" block — only when it's at the very top. Cut up to the + // first plausible answer boundary (a heading, a "## 요약"-style line, "---", "답변:" …); + // if there's no such boundary, the whole thing was reasoning. + const lead = s.match(LEADING_THOUGHT_HEADER_RE); + if (lead && (lead.index ?? 0) === 0) { + const rest = s.slice(lead[0].length); + const boundary = rest.search( + /\n(?:#{1,6}\s|\*\*[^*\n]{1,40}\*\*\s*[::]|---\s*\r?\n|##?\s*(?:요약|결론|답변|정리|제안)|답변\s*[::]|결론\s*[::]|최종\s*답변|🔎|✅)/ + ); + if (boundary >= 0) { + hidden.push((lead[0] + rest.slice(0, boundary)).trim()); + s = rest.slice(boundary + 1); + } else { + hidden.push(s.trim()); + s = ''; + } + } + } + + s = dropControlTokens(s); + // Drop a now-leading bare marker line that survived (e.g. "Thinking Process:" with content already gone). + s = s.replace(LEADING_THOUGHT_HEADER_RE, '').trim(); + + out.visible = s; + out.hiddenReasoning = hidden.filter(Boolean).join('\n\n---\n\n'); + out.hadHiddenReasoning = out.hadHiddenReasoning || hidden.some((p) => p && p.trim()); + out.wasThoughtOnly = !out.visible && out.hadHiddenReasoning; + return out; +} + +/** Should we silently re-ask the model for a final answer only (the last reply was all reasoning)? */ +export function shouldFinalOnlyRetry(cleaned: CleanedAssistantOutput): boolean { + return cleaned.wasThoughtOnly; +} + +/** + * Should we silently continue from where the answer was cut off? Only when it actually hit the + * output-token ceiling and we already have a non-trivial visible answer to continue from. + */ +export function shouldAutoContinue( + stopKind: GenerationStopKind, + visibleAnswer: string, + outputTokens: number, + maxOutputTokens: number +): boolean { + if (stopKind !== 'output-limit') return false; + if (!visibleAnswer || visibleAnswer.trim().length < 40) return false; + if (!Number.isFinite(maxOutputTokens) || maxOutputTokens <= 0) return true; + return outputTokens >= Math.floor(maxOutputTokens * 0.8); +} + +/** Appended to the system prompt for a final-only retry — the previous reply was reasoning-only. */ +export const FINAL_ONLY_DIRECTIVE = [ + '', + '[FINAL ANSWER ONLY]', + 'Your previous reply contained only hidden reasoning (thought / analysis / channel markers) and no user-visible answer.', + 'Reply again with the FINAL ANSWER only — directly answer the user, in Korean.', + 'Do NOT include: , , <|channel|> markers, "Thinking Process:", planning notes, or any hidden reasoning.', +].join('\n'); + +/** A short, self-contained system prompt for a continuation request (we deliberately drop the big context). */ +export const CONTINUATION_SYSTEM_PROMPT = [ + 'You are continuing a user-visible final answer that was cut off mid-way because it hit the output limit.', + 'Output the FINAL ANSWER continuation only — in Korean. Do NOT repeat what was already written.', + 'Do NOT include , , <|channel|> markers, "Thinking Process:", or any hidden reasoning.', + 'Use the same assumptions and context as the answer so far; do not restart.', +].join('\n'); + +/** Build the user message for a continuation request — original question + the answer so far (tail only). */ +export function buildContinuationUserPrompt(originalUserPrompt: string, visibleSoFar: string, tailChars = 1400): string { + const tail = visibleSoFar.length > tailChars ? '…' + visibleSoFar.slice(-tailChars) : visibleSoFar; + return [ + 'Original user request:', + (originalUserPrompt || '').trim() || '(unavailable)', + '', + 'The answer so far (end of it — continue directly from here, do not repeat it):', + '"""', + tail.trim(), + '"""', + '', + 'Continue the answer from exactly where it stopped. Korean. Final answer only.', + ].join('\n'); +} + +/** Join a continuation onto the previous visible answer, removing any verbatim overlap. */ +export function mergeContinuationParts(prev: string, next: string): string { + const a = (prev || '').replace(/\s+$/, ''); + let b = (next || '').replace(/^\s+/, ''); + if (!b) return a; + if (!a) return b; + // Drop a leading chunk of `b` that the model re-stated verbatim from the end of `a`. + const maxOverlap = Math.min(400, a.length, b.length); + for (let len = maxOverlap; len >= 16; len--) { + if (a.slice(-len) === b.slice(0, len)) { b = b.slice(len).replace(/^\s+/, ''); break; } + } + // If `a` ended mid-sentence (no terminal punctuation) just splice; otherwise add a paragraph break. + const aEndsClean = /[.!?。!?\n)\]”"'`]\s*$/.test(a); + return aEndsClean ? a + '\n\n' + b : a + b; +} + +/** Rough token count of a string — re-exported helper so callers don't need contextManager directly. */ +export const countTokens = estimateTokens; diff --git a/src/lib/contextManager.ts b/src/lib/contextManager.ts index adfca1b..f79e472 100644 --- a/src/lib/contextManager.ts +++ b/src/lib/contextManager.ts @@ -239,11 +239,15 @@ export function classifyStopReason(raw: string | null | undefined): GenerationSt return 'unknown'; } -/** 잘린 응답일 때 사용자에게 덧붙일 한 줄 안내. 정상 종료면 빈 문자열. */ +/** + * 잘린 응답일 때 사용자에게 덧붙일 한 줄 안내. 정상 종료면 빈 문자열. + * (output-limit 은 Astra 가 먼저 자동 이어쓰기를 시도하므로, 이 안내는 그래도 다 못 채웠을 때만 보입니다. + * 그래서 "이어서 작성해줘" 같은 사용자 액션을 요구하지 않습니다.) + */ export function truncationNotice(kind: GenerationStopKind): string { switch (kind) { case 'output-limit': - return '\n\n> ⚠️ 답변이 출력 토큰 한계에 도달해 잘렸습니다. "이어서 작성해줘" 라고 요청하면 계속 생성합니다.'; + return '\n\n> ⚠️ 답변이 길어 자동으로 이어 정리했지만 여전히 길이 한계에 닿았습니다. 더 좁은 주제로 나눠 질문하시면 완전한 답변을 받을 수 있어요.'; case 'context-overflow': return '\n\n> ⚠️ 입력 컨텍스트가 모델의 context window 를 초과했습니다. 대화를 새로 시작하거나(`/newChat`) Settings 에서 `g1nation.contextLength` 를 모델 실제 값으로 맞추고, Brain/Skill 컨텍스트를 줄여보세요.'; case 'error': diff --git a/tests/responseRecovery.test.ts b/tests/responseRecovery.test.ts new file mode 100644 index 0000000..0b6ddf8 --- /dev/null +++ b/tests/responseRecovery.test.ts @@ -0,0 +1,118 @@ +import { + extractVisibleFinal, + shouldFinalOnlyRetry, + shouldAutoContinue, + mergeContinuationParts, + buildContinuationUserPrompt, +} from '../src/core/responseRecovery'; + +describe('responseRecovery.extractVisibleFinal — thought quarantine', () => { + it('leaves a plain answer untouched', () => { + const out = extractVisibleFinal('안녕하세요! 무엇을 도와드릴까요?'); + expect(out.visible).toBe('안녕하세요! 무엇을 도와드릴까요?'); + expect(out.hadHiddenReasoning).toBe(false); + expect(out.wasThoughtOnly).toBe(false); + }); + + it('keeps only the Harmony `final` channel and discards analysis', () => { + const raw = '<|channel|>analysis<|message|>Let me think about this carefully...<|end|><|start|>assistant<|channel|>final<|message|>최종 답변입니다.'; + const out = extractVisibleFinal(raw); + expect(out.visible).toBe('최종 답변입니다.'); + expect(out.hadFinalChannel).toBe(true); + expect(out.hadHiddenReasoning).toBe(true); + expect(out.hiddenReasoning).toContain('think about this'); + expect(out.wasThoughtOnly).toBe(false); + }); + + it('strips an UNCLOSED thought channel (model ran out of tokens mid-thought) → thought-only', () => { + const raw = '<|channel>thought\nThinking Process:\nLet me figure out how to approach this and'; + const out = extractVisibleFinal(raw); + expect(out.visible).toBe(''); + expect(out.hadHiddenReasoning).toBe(true); + expect(out.wasThoughtOnly).toBe(true); + expect(shouldFinalOnlyRetry(out)).toBe(true); + }); + + it('strips a closed block', () => { + const out = extractVisibleFinal('reasoning here, multi\nline\n\n실제 답변입니다.'); + expect(out.visible).toBe('실제 답변입니다.'); + expect(out.hadHiddenReasoning).toBe(true); + }); + + it('strips an unclosed running to EOS → thought-only', () => { + const out = extractVisibleFinal("I'm thinking and then I run out of"); + expect(out.visible).toBe(''); + expect(out.wasThoughtOnly).toBe(true); + }); + + it('strips a leading "Thinking Process:" block up to the answer boundary', () => { + const out = extractVisibleFinal('Thinking Process:\nStep 1: consider X\nStep 2: consider Y\n## 요약\n실제 답변 본문입니다.'); + expect(out.visible).toContain('## 요약'); + expect(out.visible).toContain('실제 답변 본문'); + expect(out.visible).not.toContain('Step 1'); + expect(out.hadHiddenReasoning).toBe(true); + }); + + it('treats a leading "Thinking Process:" with no answer boundary as thought-only', () => { + const out = extractVisibleFinal('Thinking Process:\nStep 1...\nStep 2... and I ran out of tokens here'); + expect(out.visible).toBe(''); + expect(out.wasThoughtOnly).toBe(true); + }); + + it('does NOT strip a legitimate "## Thinking Process" markdown heading (no colon)', () => { + const out = extractVisibleFinal('## Thinking Process\n여기서는 사고 과정 자체를 설명하는 답변입니다.'); + expect(out.visible).toContain('## Thinking Process'); + expect(out.visible).toContain('사고 과정 자체를 설명'); + expect(out.hadHiddenReasoning).toBe(false); + }); + + it('handles empty / whitespace input', () => { + expect(extractVisibleFinal('').visible).toBe(''); + expect(extractVisibleFinal(' \n ').visible).toBe(''); + expect(extractVisibleFinal(null as any).visible).toBe(''); + expect(extractVisibleFinal('').wasThoughtOnly).toBe(false); + }); +}); + +describe('responseRecovery.shouldAutoContinue', () => { + it('continues only when output-limit AND a real visible answer AND near the cap', () => { + expect(shouldAutoContinue('output-limit', 'x'.repeat(200), 3500, 4096)).toBe(true); + expect(shouldAutoContinue('output-limit', 'short', 4000, 4096)).toBe(false); // no real answer + expect(shouldAutoContinue('output-limit', 'x'.repeat(200), 100, 4096)).toBe(false); // didn't actually hit the cap + expect(shouldAutoContinue('complete', 'x'.repeat(200), 4000, 4096)).toBe(false); + expect(shouldAutoContinue('context-overflow', 'x'.repeat(200), 4000, 4096)).toBe(false); + expect(shouldAutoContinue('error', 'x'.repeat(200), 4000, 4096)).toBe(false); + }); +}); + +describe('responseRecovery.mergeContinuationParts', () => { + it('handles empty inputs', () => { + expect(mergeContinuationParts('', 'hello')).toBe('hello'); + expect(mergeContinuationParts('hello', '')).toBe('hello'); + expect(mergeContinuationParts('', '')).toBe(''); + }); + it('joins with a paragraph break when the previous part ended cleanly', () => { + expect(mergeContinuationParts('첫 번째 부분.', '두 번째 부분.')).toBe('첫 번째 부분.\n\n두 번째 부분.'); + }); + it('removes a verbatim overlap the continuation re-stated, splicing mid-sentence', () => { + const a = 'the answer continues here and here'; + const b = 'continues here and here, then more'; + expect(mergeContinuationParts(a, b)).toBe('the answer continues here and here, then more'); + }); +}); + +describe('responseRecovery.buildContinuationUserPrompt', () => { + it('includes the original question and the tail of the answer so far', () => { + const p = buildContinuationUserPrompt('원래 질문은 무엇인가?', 'a'.repeat(50) + 'TAIL_MARKER'); + expect(p).toContain('원래 질문은 무엇인가?'); + expect(p).toContain('TAIL_MARKER'); + expect(p).toMatch(/continue/i); + }); + it('truncates a long answer-so-far to its tail', () => { + const long = 'HEAD_MARKER' + 'b'.repeat(3000) + 'TAIL_MARKER'; + const p = buildContinuationUserPrompt('q', long, 1400); + expect(p).toContain('TAIL_MARKER'); + expect(p).not.toContain('HEAD_MARKER'); + expect(p).toContain('…'); + }); +});