chore: version up to 2.80.37 and package with response recovery
This commit is contained in:
@@ -0,0 +1,193 @@
|
||||
/**
|
||||
* ============================================================
|
||||
* Response Recovery — Thought Quarantine + Final-only Retry + Auto-Continuation
|
||||
*
|
||||
* The user already asked their question; they're waiting for an answer, not for a chance to
|
||||
* babysit the generation engine. So:
|
||||
* - Hidden reasoning (Harmony `<|channel|>thought/analysis`, `<think>…</think>`, leading
|
||||
* "Thinking Process:" blocks — closed *or* unclosed) never reaches the screen.
|
||||
* - If the model emitted only hidden reasoning and no visible answer → retry, final-answer-only.
|
||||
* - If the answer was cut off at the output-token limit → continue it internally (compressed
|
||||
* request — original question + the visible answer so far, not the whole context/RAG again),
|
||||
* up to N times, then show one merged answer.
|
||||
*
|
||||
* This module is pure (no vscode / fs). `AgentExecutor` orchestrates the retries/continuations.
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import { estimateTokens, type GenerationStopKind } from '../lib/contextManager';
|
||||
|
||||
/**
 * Result of separating a raw model reply into what the user may see and what must stay hidden.
 */
export interface CleanedAssistantOutput {
  /** The model output exactly as received (coerced to a string). */
  raw: string;

  /** The user-facing answer, with every piece of hidden reasoning removed. */
  visible: string;

  /** Everything that was stripped out — intended for logs, never for display. */
  hiddenReasoning: string;

  /** True when at least one piece of hidden reasoning was found and removed. */
  hadHiddenReasoning: boolean;

  /** True when the reply contained an explicit Harmony `final` channel marker. */
  hadFinalChannel: boolean;

  /** True when reasoning was found but no visible answer remained — the caller should retry. */
  wasThoughtOnly: boolean;
}
|
||||
|
||||
// Regex alternation (as source text) of Harmony channel names whose content is treated as hidden
// reasoning. Order is significant for the alternation, so longer/shorter variants keep their
// original relative positions.
const HIDDEN_CHANNEL_NAMES = `(?:${[
  'thought',
  'analysis',
  'analyze',
  'commentary',
  'reasoning',
  'reason',
  'critic',
  'reflection',
  'plan',
  'planning',
].join('|')})`;
|
||||
// Leading bare chain-of-thought marker ("Thinking Process:", "사고 과정:", …) on a line of its own at
// the very start of the text. The colon is mandatory so that a legitimate "## Thinking Process"
// section heading is not removed. Both the ASCII colon and the full-width colon (U+FF1A, common
// in CJK text) are accepted; the latter is written as an escape so it survives text normalization.
const LEADING_THOUGHT_HEADER_RE =
  /^\s*(?:thinking\s*process|thought\s*process|chain[- ]of[- ]thought|reasoning\s*steps?|내부\s*사고|사고\s*과정|생각\s*과정|추론\s*과정)\s*[:\uFF1A]\s*(?:\r?\n|$)/i;
|
||||
|
||||
/**
 * Remove Harmony / gpt-oss control tokens (`<|channel|>analysis`, `<|start|>assistant`,
 * `<|message|>`, `<|end|>`, …) from `s`, collapse runs of three or more newlines into a single
 * blank line, and trim the result.
 *
 * @param s Text that may contain control tokens.
 * @returns The text with control tokens removed, whitespace-normalised and trimmed.
 */
function dropControlTokens(s: string): string {
  // Applied in order; each entry is [pattern, replacement].
  const passes: ReadonlyArray<readonly [RegExp, string]> = [
    // `<|channel|>NAME` / `<|start|>NAME` — the name sits after the tag, outside the pipes.
    [/<\|?(?:channel|start)\|?>\s*[A-Za-z_]*/gi, ''],
    // Any remaining fully-piped token: `<|message|>`, `<|end|>`, `<|return|>`, `<|assistant|>`, …
    [/<\|[^>]{0,40}\|>/g, ''],
    // Single-pipe / pipe-less spellings of the tokens that carry no name.
    [/<\|?(?:end|return|message)\|?>/gi, ''],
    // Removing tokens can leave tall gaps; squeeze them to one blank line.
    [/\n{3,}/g, '\n\n'],
  ];

  const stripped = passes.reduce(
    (acc, [pattern, replacement]) => acc.replace(pattern, replacement),
    s
  );
  return stripped.trim();
}
|
||||
|
||||
/**
 * Split raw model output into the user-visible final answer and the hidden reasoning.
 *
 * Handling, in order:
 *  - (A) If a Harmony `final` channel marker is present, the answer is the text after the LAST
 *    such marker, up to the next channel/start/end/return token or end of string. Everything
 *    before the marker is recorded as reasoning.
 *  - (B) Otherwise, hidden Harmony channels and `<think>`-style blocks are removed. Both closed
 *    and unclosed forms are handled: an unclosed marker hides everything up to end of string
 *    (e.g. when the model ran out of tokens mid-thought).
 *  - (C) Still in the no-final case, a leading bare "Thinking Process:"-style header hides
 *    everything up to the first plausible answer boundary on a LATER line; if there is no such
 *    boundary the whole remaining text is treated as reasoning.
 *
 * @param raw Raw model output; `null`/`undefined` are treated as the empty string.
 * @returns The cleaned output. `wasThoughtOnly` is true when reasoning was found but no visible
 *          text remained.
 */
export function extractVisibleFinal(raw: string): CleanedAssistantOutput {
  const text = raw == null ? '' : String(raw);
  const out: CleanedAssistantOutput = {
    raw: text,
    visible: text.trim(),
    hiddenReasoning: '',
    hadHiddenReasoning: false,
    hadFinalChannel: false,
    wasThoughtOnly: false,
  };
  // Empty / whitespace-only input: nothing to clean (visible is already '').
  if (!out.visible) return out;

  const hidden: string[] = [];
  // `replace` callback: remember the removed span (if non-blank) and substitute nothing.
  const capture = (m: string): string => {
    const t = (m || '').trim();
    if (t) hidden.push(t);
    return '';
  };

  let s = text;

  // (A) Explicit `final` channel: keep only what follows the last marker.
  const finalMatches = [...s.matchAll(/<\|?channel\|?>\s*final\b\s*(?:<\|?message\|?>)?/gi)];
  if (finalMatches.length > 0) {
    out.hadFinalChannel = true;
    const fm = finalMatches[finalMatches.length - 1];
    const markerAt = fm.index ?? 0;
    const start = markerAt + fm[0].length;

    const before = dropControlTokens(s.slice(0, markerAt));
    if (before) {
      hidden.push(before);
      out.hadHiddenReasoning = true;
    }

    const after = s.slice(start);
    const cut = after.search(/<\|?(?:channel|start|end|return)\|?>/i);
    s = cut >= 0 ? after.slice(0, cut) : after;
  } else {
    // (B) No final channel. Strip hidden channels — terminated by the next channel/start token,
    //     or unclosed and running to end of string.
    s = s.replace(
      new RegExp(`<\\|?channel\\|?>\\s*${HIDDEN_CHANNEL_NAMES}\\b[\\s\\S]*?(?=<\\|?(?:channel|start)\\|?>|$)`, 'gi'),
      capture
    );
    // <think>/<thinking>/<analysis>/<reasoning>/<scratchpad>/<reflection> blocks — closed ones
    // first, then an unclosed one running to end of string.
    s = s.replace(/<(think(?:ing)?|analysis|reasoning|scratchpad|reflection)>[\s\S]*?<\/\1>/gi, capture);
    s = s.replace(/<(?:think(?:ing)?|analysis|reasoning|scratchpad|reflection)>[\s\S]*$/gi, capture);

    // (C) Leading bare "Thinking Process:" block — only when it is at the very top. Cut up to the
    //     first plausible answer boundary (a heading, a bold label, "---", "답변:", …). The
    //     boundary must be preceded by a newline inside the remaining text, so the line directly
    //     after the header is always treated as reasoning. Label colons accept both the ASCII
    //     and the full-width (U+FF1A) form.
    const lead = s.match(LEADING_THOUGHT_HEADER_RE);
    if (lead && (lead.index ?? 0) === 0) {
      const rest = s.slice(lead[0].length);
      const boundary = rest.search(
        /\n(?:#{1,6}\s|\*\*[^*\n]{1,40}\*\*\s*[:\uFF1A]|---\s*\r?\n|##?\s*(?:요약|결론|답변|정리|제안)|답변\s*[:\uFF1A]|결론\s*[:\uFF1A]|최종\s*답변|🔎|✅)/
      );
      if (boundary >= 0) {
        hidden.push((lead[0] + rest.slice(0, boundary)).trim());
        s = rest.slice(boundary + 1); // skip the newline that introduced the boundary
      } else {
        // No boundary at all: the whole thing was reasoning.
        hidden.push(s.trim());
        s = '';
      }
    }
  }

  s = dropControlTokens(s);
  // Drop a now-leading bare marker line that survived (its content was already removed).
  s = s.replace(LEADING_THOUGHT_HEADER_RE, '').trim();

  out.visible = s;
  out.hiddenReasoning = hidden.filter(Boolean).join('\n\n---\n\n');
  out.hadHiddenReasoning = out.hadHiddenReasoning || hidden.some((p) => p.trim().length > 0);
  out.wasThoughtOnly = !out.visible && out.hadHiddenReasoning;
  return out;
}
|
||||
|
||||
/**
 * Decide whether to silently re-ask the model for a final answer only.
 *
 * @param cleaned Result of cleaning the last reply.
 * @returns The reply's `wasThoughtOnly` flag — true when it contained reasoning but no visible answer.
 */
export function shouldFinalOnlyRetry(cleaned: CleanedAssistantOutput): boolean {
  const { wasThoughtOnly } = cleaned;
  return wasThoughtOnly;
}
|
||||
|
||||
/**
 * Decide whether to silently continue an answer that was cut off.
 *
 * Continuation is attempted only when generation stopped with kind `'output-limit'` and the
 * visible answer has at least 40 non-padding characters to continue from. When a positive,
 * finite `maxOutputTokens` is known, the reply must additionally have used at least 80 % of it
 * (rounded down); when the limit is unknown the stop kind alone is trusted.
 *
 * @param stopKind        Why generation stopped.
 * @param visibleAnswer   The cleaned, user-visible answer so far.
 * @param outputTokens    Number of tokens the model produced.
 * @param maxOutputTokens Configured output-token ceiling (may be non-finite or non-positive if unknown).
 * @returns True when an automatic continuation should be requested.
 */
export function shouldAutoContinue(
  stopKind: GenerationStopKind,
  visibleAnswer: string,
  outputTokens: number,
  maxOutputTokens: number
): boolean {
  const stoppedElsewhere = stopKind !== 'output-limit';
  if (stoppedElsewhere || !visibleAnswer || visibleAnswer.trim().length < 40) {
    return false;
  }

  const ceilingKnown = Number.isFinite(maxOutputTokens) && maxOutputTokens > 0;
  if (!ceilingKnown) {
    return true;
  }

  const threshold = Math.floor(maxOutputTokens * 0.8);
  return outputTokens >= threshold;
}
|
||||
|
||||
/**
 * Text appended to the system prompt when re-asking after a reasoning-only reply.
 * Starts with a newline so it can be concatenated directly onto an existing prompt.
 */
export const FINAL_ONLY_DIRECTIVE =
  '\n' +
  '[FINAL ANSWER ONLY]' +
  '\n' +
  'Your previous reply contained only hidden reasoning (thought / analysis / channel markers) and no user-visible answer.' +
  '\n' +
  'Reply again with the FINAL ANSWER only — directly answer the user, in Korean.' +
  '\n' +
  'Do NOT include: <think>, <analysis>, <|channel|> markers, "Thinking Process:", planning notes, or any hidden reasoning.';
|
||||
|
||||
/**
 * Compact, self-contained system prompt used for a continuation request. It intentionally
 * carries none of the original large context.
 */
export const CONTINUATION_SYSTEM_PROMPT =
  'You are continuing a user-visible final answer that was cut off mid-way because it hit the output limit.' +
  '\n' +
  'Output the FINAL ANSWER continuation only — in Korean. Do NOT repeat what was already written.' +
  '\n' +
  'Do NOT include <think>, <analysis>, <|channel|> markers, "Thinking Process:", or any hidden reasoning.' +
  '\n' +
  'Use the same assumptions and context as the answer so far; do not restart.';
|
||||
|
||||
/** Default number of trailing characters of the answer included in a continuation request. */
const DEFAULT_CONTINUATION_TAIL_CHARS = 1400;

/**
 * Build the user message for a continuation request: the original question plus the END of the
 * answer produced so far.
 *
 * @param originalUserPrompt The user's original request; blank/missing becomes "(unavailable)".
 * @param visibleSoFar       The visible answer so far; missing is treated as empty.
 * @param tailChars          Maximum number of trailing UTF-16 code units of the answer to include.
 *                           Zero, negative or NaN values fall back to the default (1400);
 *                           `Infinity` disables truncation.
 * @returns The prompt text. When the answer is truncated, the tail is prefixed with "…".
 */
export function buildContinuationUserPrompt(
  originalUserPrompt: string,
  visibleSoFar: string,
  tailChars = DEFAULT_CONTINUATION_TAIL_CHARS
): string {
  const question = (originalUserPrompt || '').trim() || '(unavailable)';
  const answer = visibleSoFar || '';

  // `slice(-0)` would return the WHOLE string, so a non-positive / NaN budget must never reach
  // `slice`; fall back to the default instead. Fractions are truncated but kept at >= 1.
  const budget = tailChars > 0 ? Math.max(1, Math.trunc(tailChars)) : DEFAULT_CONTINUATION_TAIL_CHARS;

  let tail = answer;
  if (answer.length > budget) {
    tail = answer.slice(-budget);
    // The cut is in UTF-16 code units and may land inside a surrogate pair; drop an orphaned
    // low surrogate so the prompt never starts with half a character.
    const first = tail.charCodeAt(0);
    if (first >= 0xdc00 && first <= 0xdfff) tail = tail.slice(1);
    tail = '…' + tail;
  }

  return [
    'Original user request:',
    question,
    '',
    'The answer so far (end of it — continue directly from here, do not repeat it):',
    '"""',
    tail.trim(),
    '"""',
    '',
    'Continue the answer from exactly where it stopped. Korean. Final answer only.',
  ].join('\n');
}
|
||||
|
||||
/**
 * Pick the separator to re-insert when splicing mid-sentence, based on the whitespace that was
 * originally present at the seam: none → '', a paragraph gap → '\n\n', a line break → '\n',
 * anything else → a single space.
 */
function seamSeparator(seamWhitespace: string): string {
  if (!seamWhitespace) return '';
  const newlines = seamWhitespace.split('\n').length - 1;
  if (newlines >= 2) return '\n\n';
  if (newlines === 1) return '\n';
  return ' ';
}

/**
 * Join a continuation onto the previous visible answer, removing any verbatim overlap.
 *
 * - A leading chunk of `next` (16–400 characters) that repeats the end of `prev` is dropped.
 * - If `prev` ends with terminal punctuation / a closing bracket or quote, the parts are joined
 *   with a paragraph break.
 * - Otherwise the parts are spliced. Whitespace that existed at the seam (end of `prev`, start of
 *   `next`, or right after the removed overlap) is preserved in collapsed form so that words are
 *   not glued together; with no whitespace at the seam the splice is direct (mid-word cut).
 *
 * @param prev The visible answer so far.
 * @param next The continuation text.
 * @returns The merged answer. If `next` contributes nothing new, `prev` (right-trimmed) is returned.
 */
export function mergeContinuationParts(prev: string, next: string): string {
  const prevText = prev || '';
  const nextText = next || '';

  const head = prevText.replace(/\s+$/, '');
  let tail = nextText.replace(/^\s+/, '');
  if (!tail) return head;
  if (!head) return tail;

  // Whitespace that was trimmed away on either side of the join point.
  let seam = prevText.slice(head.length) + nextText.slice(0, nextText.length - tail.length);

  // Drop a leading chunk of `tail` that the model re-stated verbatim from the end of `head`.
  const maxOverlap = Math.min(400, head.length, tail.length);
  for (let len = maxOverlap; len >= 16; len--) {
    if (head.endsWith(tail.slice(0, len))) {
      const remainder = tail.slice(len);
      tail = remainder.replace(/^\s+/, '');
      seam += remainder.slice(0, remainder.length - tail.length);
      break;
    }
  }
  // The continuation was nothing but a repeat of what we already have.
  if (!tail) return head;

  // `head` has no trailing whitespace, so only its last character needs checking. Includes the
  // ideographic full stop and the full-width "!" / "?" (U+FF01 / U+FF1F).
  const headEndsClean = /[.!?。\uFF01\uFF1F)\]”"'`]$/.test(head);
  if (headEndsClean) return head + '\n\n' + tail;

  return head + seamSeparator(seam) + tail;
}
|
||||
|
||||
/**
 * Alias of `estimateTokens`, re-exported under a local name so callers of this module do not
 * have to import the context manager themselves.
 */
export const countTokens: typeof estimateTokens = estimateTokens;
|
||||
Reference in New Issue
Block a user