chore: version up to 2.80.37 and package with response recovery

This commit is contained in:
g1nation
2026-05-12 23:55:00 +09:00
parent e0af15767a
commit 6c4bc3494f
12 changed files with 466 additions and 21 deletions
+97 -3
View File
@@ -41,6 +41,15 @@ import { MemoryManager } from './memory';
import { RetrievalOrchestrator } from './retrieval';
import { buildLessonChecklistBlock, isQaRegressionFeedback, findUnaddressedChecklistItems } from './retrieval/lessonHelpers';
import { resolveScopeForAgent } from './skills/agentKnowledgeMap';
import {
extractVisibleFinal,
shouldFinalOnlyRetry,
shouldAutoContinue,
mergeContinuationParts,
buildContinuationUserPrompt,
FINAL_ONLY_DIRECTIVE,
CONTINUATION_SYSTEM_PROMPT,
} from './core/responseRecovery';
import {
estimateTokens,
estimateMessagesTokens,
@@ -846,11 +855,95 @@ export class AgentExecutor {
}
}
// ── Thought Quarantine + Final-only Retry + Auto-Continuation ──
// The user is waiting for an answer, not for a chance to manage the generation engine:
// (a) hidden reasoning (Harmony channels, <think>…, "Thinking Process:") never reaches
// the screen — stripped here, and from what executeActions / chatHistory see;
// (b) if the model emitted *only* reasoning → silently retry, final-answer-only;
// (c) if the answer was cut off at the output ceiling → continue it internally with a
// *compressed* request (original question + the answer so far), up to N rounds.
let cleaned = extractVisibleFinal(aiResponseText);
if (cleaned.hadHiddenReasoning) {
logInfo('Stripped hidden reasoning from the model output.', {
model: actualModel, hiddenChars: cleaned.hiddenReasoning.length,
visibleChars: cleaned.visible.length, hadFinalChannel: cleaned.hadFinalChannel,
thoughtOnly: cleaned.wasThoughtOnly,
});
}
// (b) Final-only retry — the reply was reasoning-only, no visible answer.
if (shouldFinalOnlyRetry(cleaned)
&& config.finalOnlyRetryOnThoughtLeak
&& loopDepth === 0
&& !this.abortController?.signal.aborted) {
try {
this.webview.postMessage({ type: 'autoContinue', value: '답변을 정리하는 중입니다...' });
const retryMsgs: ChatMessage[] = messagesForRequest.map((m, i) =>
i === 0 ? { ...m, content: `${m.content}\n${FINAL_ONLY_DIRECTIVE}` } : m);
const r = await this.callNonStreaming({
baseUrl: ollamaUrl, modelName: actualModel, engine, messages: retryMsgs,
temperature, maxTokens: maxOutputTokens, contextLength: ctxLimits.contextLength,
signal: this.abortController?.signal,
});
if (r.stopReason) finishStopReason = r.stopReason;
const rc = extractVisibleFinal(r.text);
if (rc.visible.trim()) {
logInfo('Final-only retry recovered a visible answer.', { model: actualModel, length: rc.visible.length });
aiResponseText = r.text;
cleaned = rc;
}
} catch (e: any) {
logError('Final-only retry failed.', { model: actualModel, error: e?.message ?? String(e) });
}
}
// (c) Auto-continuation — the visible answer hit the output-token ceiling.
let continuationCount = 0;
if (config.autoContinueOnOutputLimit && config.maxAutoContinuations > 0 && loopDepth === 0) {
const originalUserPrompt = prompt || (this.chatHistory.find(m => m.role === 'user' && typeof m.content === 'string')?.content as string) || '';
let lastOutputTokens = estimateTokens(cleaned.visible);
while (
shouldAutoContinue(classifyStopReason(finishStopReason), cleaned.visible, lastOutputTokens, maxOutputTokens)
&& continuationCount < config.maxAutoContinuations
&& !this.abortController?.signal.aborted
&& !this.isStaleRun(runId)
) {
continuationCount++;
this.webview.postMessage({ type: 'autoContinue', value: `답변이 길어 이어서 정리하는 중입니다... (${continuationCount}/${config.maxAutoContinuations})` });
try {
const contMsgs: ChatMessage[] = [
{ role: 'system', content: CONTINUATION_SYSTEM_PROMPT, internal: true },
{ role: 'user', content: buildContinuationUserPrompt(originalUserPrompt, cleaned.visible) },
];
const contMax = computeOutputBudget(estimateMessagesTokens(contMsgs), ctxLimits).maxOutputTokens;
const cr = await this.callNonStreaming({
baseUrl: ollamaUrl, modelName: actualModel, engine, messages: contMsgs,
temperature, maxTokens: contMax, contextLength: ctxLimits.contextLength,
signal: this.abortController?.signal,
});
finishStopReason = cr.stopReason;
const ccl = extractVisibleFinal(cr.text);
if (!ccl.visible.trim()) {
logInfo('Continuation produced no visible text — stopping.', { model: actualModel, round: continuationCount });
break;
}
cleaned = { ...cleaned, visible: mergeContinuationParts(cleaned.visible, ccl.visible), wasThoughtOnly: false };
lastOutputTokens = estimateTokens(ccl.visible);
logInfo('Auto-continued the answer.', { model: actualModel, round: continuationCount, addedChars: ccl.visible.length, totalChars: cleaned.visible.length, contStopReason: cr.stopReason });
} catch (e: any) {
logError('Auto-continuation failed.', { model: actualModel, round: continuationCount, error: e?.message ?? String(e) });
break;
}
}
if (this.isStaleRun(runId)) return;
}
const cleanedVisible = cleaned.visible;
// 5. Execute Actions
const rationale = this.parseRationale(aiResponseText);
const rationale = this.parseRationale(cleanedVisible);
let assistantContent = this.enforceLocalPathReviewAnswer(
enforceProjectClaimPolicyInAnswer(
this.sanitizeAssistantContent(aiResponseText),
this.sanitizeAssistantContent(cleanedVisible),
secondBrainTrace
),
localPathContext
@@ -900,7 +993,8 @@ export class AgentExecutor {
this.emitHistoryChanged();
this.statusBarManager.updateStatus(AgentStatus.Executing);
const report = await this.executeActions(aiResponseText, rootPath, activeBrain);
// Action tags are honored only from the visible final answer — never from hidden reasoning.
const report = await this.executeActions(cleanedVisible, rootPath, activeBrain);
if (!assistantContent.trim() && report.length === 0) {
const promptCharCount = messagesForRequest.reduce((sum, m) => sum + (m.content?.length ?? 0), 0);
logError('Model returned an empty response without actions.', {
+11 -1
View File
@@ -38,6 +38,13 @@ export interface IAgentConfig {
autoCompactHistory: boolean;
/** 작은 모델(≤4B) 감지 시 예산 계산에 쓸 유효 context window 상한. 0 = 비활성화. */
smallModelContextCap: number;
// ─── 응답 복구 (Thought Quarantine / Auto-Continuation) ───
/** 답변이 출력 토큰 한계에 걸리면 사용자 개입 없이 내부적으로 이어서 생성. */
autoContinueOnOutputLimit: boolean;
/** 자동 이어쓰기 최대 횟수 (무한 반복 방지). 0 = 비활성화. */
maxAutoContinuations: number;
/** 모델이 내부 사고만 출력하고 답변이 없으면 "최종 답변만" 지시로 1회 재생성. */
finalOnlyRetryOnThoughtLeak: boolean;
}
// ─── 경로 정규화 유틸리티 ───
@@ -115,7 +122,10 @@ export function getConfig(): IAgentConfig {
return v === 'truncateMiddle' || v === 'rollingWindow' ? v : 'stopAtLimit';
})(),
autoCompactHistory: cfg.get<boolean>('autoCompactHistory', true),
smallModelContextCap: Math.max(0, cfg.get<number>('smallModelContextCap', 8192))
smallModelContextCap: Math.max(0, cfg.get<number>('smallModelContextCap', 8192)),
autoContinueOnOutputLimit: cfg.get<boolean>('autoContinueOnOutputLimit', true),
maxAutoContinuations: Math.max(0, Math.min(10, cfg.get<number>('maxAutoContinuations', 3))),
finalOnlyRetryOnThoughtLeak: cfg.get<boolean>('finalOnlyRetryOnThoughtLeak', true)
};
}
+193
View File
@@ -0,0 +1,193 @@
/**
* ============================================================
* Response Recovery — Thought Quarantine + Final-only Retry + Auto-Continuation
*
* The user already asked their question; they're waiting for an answer, not for a chance to
* babysit the generation engine. So:
* - Hidden reasoning (Harmony `<|channel|>thought/analysis`, `<think>…</think>`, leading
* "Thinking Process:" blocks — closed *or* unclosed) never reaches the screen.
* - If the model emitted only hidden reasoning and no visible answer → retry, final-answer-only.
* - If the answer was cut off at the output-token limit → continue it internally (compressed
* request — original question + the visible answer so far, not the whole context/RAG again),
* up to N times, then show one merged answer.
*
* This module is pure (no vscode / fs). `AgentExecutor` orchestrates the retries/continuations.
* ============================================================
*/
import { estimateTokens, type GenerationStopKind } from '../lib/contextManager';
/**
 * Result of splitting one raw model reply into the user-visible answer and the hidden reasoning
 * that was stripped from it. Produced by `extractVisibleFinal`.
 */
export interface CleanedAssistantOutput {
/** The model output as received, coerced to a string (`null`/`undefined` become `''`); never modified. */
raw: string;
/** User-facing final answer with hidden reasoning and control tokens removed (trimmed; may be empty). */
visible: string;
/** The stripped reasoning fragments joined into one string — for logs only, never shown to the user. */
hiddenReasoning: string;
/** True when at least one non-empty hidden-reasoning fragment was found and removed. */
hadHiddenReasoning: boolean;
/** The model emitted an explicit Harmony `final` channel marker. */
hadFinalChannel: boolean;
/** Hidden reasoning was found but `visible` ended up empty — nothing to show, so the caller should retry. */
wasThoughtOnly: boolean;
}
// Regex alternation (source text, not a RegExp) of the channel names that count as hidden reasoning.
// It is interpolated into the channel-stripping RegExp built inside `extractVisibleFinal`.
const HIDDEN_CHANNEL_NAMES = '(?:thought|analysis|analyze|commentary|reasoning|reason|critic|reflection|plan|planning)';
// Leading bare chain-of-thought marker line (English or Korean), anchored at the start of the string and
// case-insensitive. The colon is required so we don't nuke a legit "## Thinking Process" section heading,
// and the marker must be followed by a line break or the end of the string.
const LEADING_THOUGHT_HEADER_RE =
/^\s*(?:thinking\s*process|thought\s*process|chain[- ]of[- ]thought|reasoning\s*steps?|내부\s*사고|사고\s*과정|생각\s*과정|추론\s*과정)\s*[:]\s*(?:\r?\n|$)/i;
/**
 * Remove Harmony / gpt-oss control tokens from a piece of model output, e.g. `<|channel|>analysis`,
 * `<|start|>assistant`, `<|message|>`, `<|end|>`.
 *
 * @param s Text that may still contain control tokens.
 * @returns The text without those tokens, with runs of three or more newlines collapsed to a
 *          single blank line, and trimmed on both ends.
 */
function dropControlTokens(s: string): string {
  // Tags whose name trails the tag itself (`<|channel|>NAME`, `<|start|>NAME`): drop tag and name together.
  const withoutNamedTags = s.replace(/<\|?(?:channel|start)\|?>\s*[A-Za-z_]*/gi, '');

  // Any remaining fully piped token: `<|message|>`, `<|end|>`, `<|return|>`, `<|assistant|>`, …
  const withoutPipedTags = withoutNamedTags.replace(/<\|[^>]{0,40}\|>/g, '');

  // Half-piped or unpiped spellings of the name-less tokens.
  const withoutLooseTags = withoutPipedTags.replace(/<\|?(?:end|return|message)\|?>/gi, '');

  // Removing tokens tends to leave stacked blank lines behind; squeeze them.
  const squeezed = withoutLooseTags.replace(/\n{3,}/g, '\n\n');
  return squeezed.trim();
}
/**
 * Split the raw model output into the visible final answer and (discarded) hidden reasoning.
 * Robust to *unclosed* hidden channels — a model that runs out of tokens mid-thought leaves an
 * open `<|channel|>thought …` with no closing token; we treat everything from that marker to EOS
 * as hidden.
 *
 * Strategy: if a Harmony `final` channel marker is present, only the text after the LAST such
 * marker is kept (A). Otherwise hidden channels, `<think>`-style blocks (B) and a leading
 * "Thinking Process:" block (C) are stripped. Control tokens are removed from whatever remains.
 *
 * @param raw The model reply; `null`/`undefined` are treated as an empty string.
 * @returns The cleaned output. `wasThoughtOnly` is set when reasoning was found but no visible
 *          text survived. For empty/whitespace-only input all flags are false and `visible` is `''`.
 */
export function extractVisibleFinal(raw: string): CleanedAssistantOutput {
const text = raw == null ? '' : String(raw);
const out: CleanedAssistantOutput = {
raw: text, visible: text.trim(), hiddenReasoning: '',
hadHiddenReasoning: false, hadFinalChannel: false, wasThoughtOnly: false,
};
// Nothing but whitespace: there is neither an answer nor reasoning to report.
if (!out.visible) { out.visible = ''; return out; }
// Collected reasoning fragments, in the order they were stripped.
const hidden: string[] = [];
// `String.replace` callback: records the matched (trimmed, non-empty) text as hidden and deletes it.
const capture = (m: string): string => { const t = (m || '').trim(); if (t) hidden.push(t); return ''; };
let s = text;
// (A) If a Harmony `final` channel exists, the answer is what follows the LAST `final` marker,
// up to the next control token or EOS. Everything before it is reasoning.
const finalMatches = [...s.matchAll(/<\|?channel\|?>\s*final\b\s*(?:<\|?message\|?>)?/gi)];
if (finalMatches.length > 0) {
out.hadFinalChannel = true;
const fm = finalMatches[finalMatches.length - 1];
const start = (fm.index ?? 0) + fm[0].length;
// Text ahead of the marker is logged as reasoning once its control tokens are removed.
const before = dropControlTokens(s.slice(0, fm.index ?? 0));
if (before) { hidden.push(before); out.hadHiddenReasoning = true; }
const after = s.slice(start);
// Anything from the next channel/start/end/return token onward is dropped (and not recorded as hidden).
const cut = after.search(/<\|?(?:channel|start|end|return)\|?>/i);
s = cut >= 0 ? after.slice(0, cut) : after;
} else {
// (B) No final channel. Strip hidden channels — closed (followed by another control token) or
// unclosed (running to EOS).
s = s.replace(
new RegExp(`<\\|?channel\\|?>\\s*${HIDDEN_CHANNEL_NAMES}\\b[\\s\\S]*?(?=<\\|?(?:channel|start)\\|?>|$)`, 'gi'),
capture
);
// <think>/<thinking>/<analysis>/<reasoning>/<scratchpad>/<reflection> blocks — closed first, then unclosed-to-EOS.
s = s.replace(/<(think(?:ing)?|analysis|reasoning|scratchpad|reflection)>[\s\S]*?<\/\1>/gi, capture);
s = s.replace(/<(?:think(?:ing)?|analysis|reasoning|scratchpad|reflection)>[\s\S]*$/gi, capture);
// (C) Leading bare "Thinking Process:" block — only when it's at the very top. Cut up to the
// first plausible answer boundary (a heading, a "## 요약"-style line, "---", "답변:" …);
// if there's no such boundary, the whole thing was reasoning.
const lead = s.match(LEADING_THOUGHT_HEADER_RE);
if (lead && (lead.index ?? 0) === 0) {
const rest = s.slice(lead[0].length);
// NOTE(review): every boundary alternative must be preceded by "\n", and the header match above
// already consumed the line break after the colon — so a boundary on the very first line of
// `rest` is not detected and that text is treated as reasoning. Confirm this is intended.
const boundary = rest.search(
/\n(?:#{1,6}\s|\*\*[^*\n]{1,40}\*\*\s*[:]|---\s*\r?\n|##?\s*(?:요약|결론|답변|정리|제안)|답변\s*[:]|결론\s*[:]|최종\s*답변|🔎|✅)/
);
if (boundary >= 0) {
hidden.push((lead[0] + rest.slice(0, boundary)).trim());
// +1 skips the "\n" that starts the boundary match, so the answer begins on the boundary line itself.
s = rest.slice(boundary + 1);
} else {
hidden.push(s.trim());
s = '';
}
}
}
s = dropControlTokens(s);
// Drop a now-leading bare marker line that survived (e.g. "Thinking Process:" with content already gone).
s = s.replace(LEADING_THOUGHT_HEADER_RE, '').trim();
out.visible = s;
out.hiddenReasoning = hidden.filter(Boolean).join('\n\n---\n\n');
out.hadHiddenReasoning = out.hadHiddenReasoning || hidden.some((p) => p && p.trim());
// Reasoning without any surviving visible text: the caller is expected to retry.
out.wasThoughtOnly = !out.visible && out.hadHiddenReasoning;
return out;
}
/**
 * Tells the caller whether the model should be silently asked again for a final answer only.
 *
 * @param cleaned Output of `extractVisibleFinal` for the last reply.
 * @returns `true` when that reply consisted solely of hidden reasoning (its `wasThoughtOnly` flag).
 */
export function shouldFinalOnlyRetry(cleaned: CleanedAssistantOutput): boolean {
  const { wasThoughtOnly } = cleaned;
  return wasThoughtOnly;
}
/**
 * Decides whether a truncated answer should be continued internally, without user involvement.
 *
 * Continuation is only worthwhile when generation stopped at the output-token ceiling AND there is
 * already a non-trivial visible answer (at least 40 characters after trimming) to continue from.
 *
 * @param stopKind        Classified stop reason of the last generation.
 * @param visibleAnswer   The user-visible answer produced so far.
 * @param outputTokens    Token count of the most recent output.
 * @param maxOutputTokens Output-token budget of the request. When it is not a positive finite
 *                        number the budget is unknown and the stop reason alone is trusted.
 * @returns `true` when the answer should be auto-continued. With a known budget this additionally
 *          requires the output to have used at least 80% of it.
 */
export function shouldAutoContinue(
  stopKind: GenerationStopKind,
  visibleAnswer: string,
  outputTokens: number,
  maxOutputTokens: number
): boolean {
  if (stopKind === 'output-limit') {
    const answerLength = visibleAnswer ? visibleAnswer.trim().length : 0;
    if (answerLength >= 40) {
      const budgetIsUsable = Number.isFinite(maxOutputTokens) && maxOutputTokens > 0;
      return budgetIsUsable ? outputTokens >= Math.floor(maxOutputTokens * 0.8) : true;
    }
  }
  return false;
}
/**
 * Directive appended to the system prompt when a reply is regenerated because the previous one
 * contained only hidden reasoning. Starts with a line break so it can be concatenated directly
 * after existing prompt text.
 */
export const FINAL_ONLY_DIRECTIVE =
  '\n' +
  '[FINAL ANSWER ONLY]' +
  '\n' +
  'Your previous reply contained only hidden reasoning (thought / analysis / channel markers) and no user-visible answer.' +
  '\n' +
  'Reply again with the FINAL ANSWER only — directly answer the user, in Korean.' +
  '\n' +
  'Do NOT include: <think>, <analysis>, <|channel|> markers, "Thinking Process:", planning notes, or any hidden reasoning.';
/**
 * Compact, self-contained system prompt for a continuation request. It deliberately replaces the
 * large original context: the model only has to carry on a cut-off answer.
 */
export const CONTINUATION_SYSTEM_PROMPT =
  'You are continuing a user-visible final answer that was cut off mid-way because it hit the output limit.' +
  '\n' +
  'Output the FINAL ANSWER continuation only — in Korean. Do NOT repeat what was already written.' +
  '\n' +
  'Do NOT include <think>, <analysis>, <|channel|> markers, "Thinking Process:", or any hidden reasoning.' +
  '\n' +
  'Use the same assumptions and context as the answer so far; do not restart.';
/**
 * Build the user message for a continuation request: the original question plus the END of the
 * answer produced so far, so the model can pick up exactly where it stopped.
 *
 * @param originalUserPrompt The user's original request; `(unavailable)` is substituted when empty.
 * @param visibleSoFar       The visible answer accumulated so far; a missing value is treated as `''`.
 * @param tailChars          Maximum number of trailing UTF-16 code units of the answer to include
 *                           (default 1400). Values below 0 are treated as 0, fractions are floored,
 *                           and `NaN` falls back to the default.
 * @returns The prompt text. When the answer is longer than `tailChars` the excerpt is prefixed
 *          with `…` to signal that earlier text was omitted.
 */
export function buildContinuationUserPrompt(originalUserPrompt: string, visibleSoFar: string, tailChars = 1400): string {
  const answer = visibleSoFar || '';
  const limit = Number.isNaN(tailChars) ? 1400 : Math.max(0, Math.floor(tailChars));

  let tail = answer;
  if (answer.length > limit) {
    // Compute the start index explicitly: `slice(-0)` would return the WHOLE string, not nothing.
    let start = answer.length - limit;
    // Never begin the excerpt in the middle of a surrogate pair (e.g. an emoji) — a lone low
    // surrogate is invalid text and may be mangled when the prompt is serialized.
    const first = answer.charCodeAt(start);
    if (start > 0 && first >= 0xdc00 && first <= 0xdfff) {
      const previous = answer.charCodeAt(start - 1);
      if (previous >= 0xd800 && previous <= 0xdbff) start += 1;
    }
    tail = '…' + answer.slice(start);
  }

  return [
    'Original user request:',
    (originalUserPrompt || '').trim() || '(unavailable)',
    '',
    'The answer so far (end of it — continue directly from here, do not repeat it):',
    '"""',
    tail.trim(),
    '"""',
    '',
    'Continue the answer from exactly where it stopped. Korean. Final answer only.',
  ].join('\n');
}
/**
 * Join a continuation onto the previous visible answer.
 *
 * - A leading chunk of `next` that repeats the end of `prev` verbatim (16–400 characters) is dropped.
 * - If `prev` ends with terminal punctuation the continuation starts a new paragraph.
 * - Otherwise the text was cut mid-sentence and the parts are spliced. Whitespace that existed at
 *   the seam is preserved as a single space, line break or paragraph break, so words and list
 *   lines are not glued together; with no whitespace at the seam (cut mid-word) they are joined directly.
 *
 * @param prev The answer so far; a missing value is treated as `''`.
 * @param next The continuation text; a missing value is treated as `''`.
 * @returns The merged answer. When the continuation adds nothing new, `prev` (right-trimmed) is
 *          returned without a dangling separator.
 */
export function mergeContinuationParts(prev: string, next: string): string {
  const rawPrev = prev || '';
  const rawNext = next || '';
  const head = rawPrev.replace(/\s+$/, '');
  let tail = rawNext.replace(/^\s+/, '');
  if (!tail) return head;
  if (!head) return tail;

  // Whitespace that originally separated the two parts (end of `prev` + start of `next`).
  let seam = rawPrev.slice(head.length) + rawNext.slice(0, rawNext.length - tail.length);

  // Drop a leading chunk of `tail` that the model re-stated verbatim from the end of `head`.
  const maxOverlap = Math.min(400, head.length, tail.length);
  for (let len = maxOverlap; len >= 16; len--) {
    if (head.endsWith(tail.slice(0, len))) {
      const remainder = tail.slice(len);
      tail = remainder.replace(/^\s+/, '');
      // The seam is now whatever followed the repeated text inside the continuation.
      seam = remainder.slice(0, remainder.length - tail.length);
      break;
    }
  }
  // The continuation was nothing but a repeat: do not append an empty paragraph.
  if (!tail) return head;

  // `head` is already right-trimmed, so only its last character needs to be inspected.
  const headEndsClean = /[.!?。！？)\]”"'`]$/.test(head);
  if (headEndsClean) return head + '\n\n' + tail;

  // Mid-sentence splice: keep the kind of break that was there instead of discarding it.
  if (!seam) return head + tail;
  if (/\n\s*\n/.test(seam)) return head + '\n\n' + tail;
  return head + (seam.includes('\n') ? '\n' : ' ') + tail;
}
/**
 * Rough token count of a string. This is the `estimateTokens` function imported from
 * `../lib/contextManager`, re-exported under another name so callers of this module don't need
 * to import contextManager directly.
 */
export const countTokens = estimateTokens;
+6 -2
View File
@@ -239,11 +239,15 @@ export function classifyStopReason(raw: string | null | undefined): GenerationSt
return 'unknown';
}
/** 잘린 응답일 때 사용자에게 덧붙일 한 줄 안내. 정상 종료면 빈 문자열. */
/**
* 잘린 응답일 때 사용자에게 덧붙일 한 줄 안내. 정상 종료면 빈 문자열.
* (output-limit 은 Astra 가 먼저 자동 이어쓰기를 시도하므로, 이 안내는 그래도 다 못 채웠을 때만 보입니다.
* 그래서 "이어서 작성해줘" 같은 사용자 액션을 요구하지 않습니다.)
*/
export function truncationNotice(kind: GenerationStopKind): string {
switch (kind) {
case 'output-limit':
return '\n\n> ⚠️ 답변이 출력 토큰 한계에 도달해 잘렸습니다. "이어서 작성해줘" 라고 요청하면 계속 생성합니다.';
return '\n\n> ⚠️ 답변이 길어 자동으로 이어 정리했지만 여전히 길이 한계에 닿았습니다. 더 좁은 주제로 나눠 질문하시면 완전한 답변을 받을 수 있어요.';
case 'context-overflow':
return '\n\n> ⚠️ 입력 컨텍스트가 모델의 context window 를 초과했습니다. 대화를 새로 시작하거나(`/newChat`) Settings 에서 `g1nation.contextLength` 를 모델 실제 값으로 맞추고, Brain/Skill 컨텍스트를 줄여보세요.';
case 'error':