import { logInfo, logError } from '../../utils';
import type { ChatMessage } from '../../agent';
import {
    estimateTokens,
    estimateMessagesTokens,
    computeOutputBudget,
    trimHistoryToBudget,
    truncateSystemPromptContext,
    estimateModelParamsB,
    type ContextLimits,
} from '../../lib/contextManager';
import { buildDroppedHistorySummary } from '../../lib/contextBuilders/droppedHistorySummary';

/** Inputs to {@link computeBudgetedRequest}. */
export interface ComputeBudgetedRequestInput {
    /** Full, untrimmed system prompt; its context portion may be truncated to fit. */
    fullSystemPrompt: string;
    /** Caller is expected to have run `capChatHistory` on this already. */
    reqMessages: ChatMessage[];
    /** Model identifier; used for parameter-size estimation and in log payloads. */
    actualModel: string;
    /** Result of `getConfig()` — reads contextLength, maxOutputTokens, contextSafetyMargin, smallModelContextCap, autoCompactHistory. */
    config: any;
    /** Number of images attached to the request; each one reserves a fixed token allowance. */
    imageCount: number;
    /**
     * The model's *actually-loaded* context window (LM Studio `getContextLength()`),
     * when known. Budgeting uses the smaller of this and `config.contextLength` so we
     * never overflow a model loaded with a smaller window than the user's setting.
     * Omit (undefined) to budget against the configured value alone (prior behavior).
     */
    actualContextLength?: number;
}

/** Output of {@link computeBudgetedRequest}. */
export interface ComputeBudgetedRequestResult {
    /** System prompt message followed by the (possibly compacted) history. */
    messagesForRequest: ChatMessage[];
    /** Limits the budget was computed against. */
    ctxLimits: ContextLimits;
    /** Estimated tokens of `messagesForRequest` plus the image reserve. */
    inputTokens: number;
    /** Output cap to send with the request (same value as `outputBudget.maxOutputTokens`). */
    maxOutputTokens: number;
    /** Estimated tokens of the budgeted system prompt, plus a small fixed overhead. */
    systemTokens: number;
    /** True when the system prompt's context had to be truncated. */
    systemTruncated: boolean;
    /** Number of messages dropped by history compaction (0 when compaction is off). */
    droppedHistoryCount: number;
    /** Length of the history array placed after the system prompt in `messagesForRequest`. */
    budgetedHistoryLength: number;
    /** Exact return shape of `computeOutputBudget`. */
    outputBudget: { maxOutputTokens: number; available: number; tight: boolean };
    /** Estimated parameter count of the model in billions, or null when it cannot be estimated. */
    modelParamB: number | null;
    /** True when the opt-in small-model cap shrank the effective window. */
    cappedForSmallModel: boolean;
    /** True when the model's real loaded window is smaller than `config.contextLength` (we clamped to the real one). */
    windowMismatch: boolean;
    /** The window actually used for budgeting (after real-window clamp + small-model cap). */
    effectiveContextLength: number;
}

/** Type guard: true only for finite numbers strictly greater than zero. */
function isPositiveFiniteNumber(value: unknown): value is number {
    return typeof value === 'number' && Number.isFinite(value) && value > 0;
}

/**
 * Fits the input (system prompt + conversation history + images) into the
 * context-window budget and computes the final request message array together
 * with the dynamic output-token cap.
 *
 * Assumes the caller has already capped the number of messages with
 * `capChatHistory` (static limits such as AgentExecutor.MAX_RETAINED_MESSAGES
 * are not this function's concern).
 *
 * @param input - Prompt, history, model name, config and image count; see
 *   {@link ComputeBudgetedRequestInput}.
 * @returns The messages to send plus every intermediate budgeting figure; see
 *   {@link ComputeBudgetedRequestResult}.
 */
export function computeBudgetedRequest(input: ComputeBudgetedRequestInput): ComputeBudgetedRequestResult {
    const { fullSystemPrompt, reqMessages, actualModel, config, imageCount } = input;

    // ──────────────────────────────────────────────────────────────────
    // [Context Limit Manager] The context length does NOT mean "the answer may
    // be that long": system prompt + history + input + generated answer +
    // margin ≤ context length. Before sending the request we estimate the
    // input tokens and then
    //  (1) if the system prompt is too large, shrink its [CONTEXT] block as a last resort,
    //  (2) compact the conversation history to the remaining budget (the UI-facing
    //      chatHistory is left untouched),
    //  (3) compute the dynamic output cap (maxOutputTokens).
    // ──────────────────────────────────────────────────────────────────

    // Optional opt-in guard (g1nation.smallModelContextCap, OFF/0 by default): some very small
    // models (≤3B) emit EOS as the first token when the prompt is near their context window
    // even though it nominally fits. If the user opted in, budget ≤3B models against that
    // smaller effective window. Never applied to 4B+ models, and never when the setting is 0 —
    // capping squeezes the output-token budget, so it's a knob, not a default.
    const modelParamB = estimateModelParamsB(actualModel);

    // The real ceiling is whatever window the model was actually loaded with — the
    // server truncates anything past it. When known, clamp the configured setting
    // down to it so we budget against the smaller of the two. (When unknown, keep
    // the configured value — prior behavior.)
    const actualWindow = isPositiveFiniteNumber(input.actualContextLength)
        ? input.actualContextLength
        : undefined;
    const configuredWindow = config.contextLength;
    const windowMismatch = actualWindow !== undefined && actualWindow < configuredWindow;
    // `config` is untyped: if the configured window is missing or not a positive
    // finite number, Math.min would yield NaN (or a non-positive window) and
    // poison every budget below. In that case trust the known real window.
    const realWindow: number =
        actualWindow === undefined
            ? configuredWindow
            : isPositiveFiniteNumber(configuredWindow)
                ? Math.min(configuredWindow, actualWindow)
                : actualWindow;
    if (windowMismatch) {
        logInfo('Model loaded with a smaller context window than the setting — clamping budget to the real window.', {
            model: actualModel,
            configuredWindow,
            actualWindow,
        });
    }

    const smallModelCap = config.smallModelContextCap; // 0 = disabled (default)
    const cappedForSmallModel =
        smallModelCap > 0 && modelParamB !== null && modelParamB <= 3 && realWindow > smallModelCap;
    const effectiveContextLength = cappedForSmallModel ? smallModelCap : realWindow;
    if (cappedForSmallModel) {
        logInfo('Small model detected — capping effective context window for budgeting.', {
            model: actualModel,
            paramB: modelParamB,
            nominalContext: realWindow,
            effectiveContext: effectiveContextLength,
        });
    }

    const ctxLimits: ContextLimits = {
        contextLength: effectiveContextLength,
        maxOutputTokens: config.maxOutputTokens,
        safetyMargin: config.contextSafetyMargin,
        minOutputTokens: 512,
    };
    // Flat per-image token allowance.
    const imageTokenReserve = imageCount * 1024;

    // Output budget we ACTUALLY reserve before trimming — not the bare
    // minOutputTokens floor (512). If we only reserve 512, a long session
    // is allowed to grow the prompt until ~512-1k tokens remain for the
    // answer; small/MoE local models (e.g. gemma 4B-active) then emit EOS
    // as the first token and return an empty response. Reserving ~10% of
    // the window (>=2048) forces history/system trimming to keep a real
    // answer-sized hole open. Capped at maxOutputTokens.
    const preferredOutputReserve = Math.min(
        ctxLimits.maxOutputTokens,
        Math.max(2048, Math.floor(ctxLimits.contextLength * 0.1))
    );

    // (1) The system prompt may take at most ~65% of the budget — beyond that,
    //     the [CONTEXT] block is cut first.
    const systemCapTokens = Math.max(
        1024,
        Math.floor(
            (ctxLimits.contextLength - ctxLimits.safetyMargin - preferredOutputReserve - imageTokenReserve) * 0.65
        )
    );
    const { prompt: budgetedSystemPrompt, truncated: systemTruncated } =
        truncateSystemPromptContext(fullSystemPrompt, systemCapTokens);
    if (systemTruncated) {
        logInfo('System prompt context truncated to fit the context window.', { model: actualModel, systemCapTokens });
    }
    const systemTokens = estimateTokens(budgetedSystemPrompt) + 4;

    // (2) Conversation-history compaction.
    const historyBudget = Math.max(
        256,
        ctxLimits.contextLength - systemTokens - ctxLimits.safetyMargin - preferredOutputReserve - imageTokenReserve
    );
    let budgetedHistory: ChatMessage[] = reqMessages;
    // Tracked from the trimmer's own count rather than derived from array
    // lengths: the summary message supplied below presumably occupies a slot in
    // `trim.messages`, which would make a length difference under-count by one.
    let droppedHistoryCount = 0;
    if (config.autoCompactHistory) {
        // v2.2.69 — take the dropped messages, build a heuristic summary from them and
        // prepend it as a single system message. A bare count marker told the model
        // nothing about what had been discussed, which caused a regression where the
        // model lost context in follow-up turns. Now the gist of U1/A1/U2/A2 survives,
        // so the sliding window works.
        const trim = trimHistoryToBudget(reqMessages, historyBudget, (_n, dropped) => ({
            role: 'system',
            content: buildDroppedHistorySummary(dropped),
            internal: true,
        }));
        budgetedHistory = trim.messages;
        droppedHistoryCount = trim.droppedCount;
        if (trim.droppedCount > 0) {
            logInfo('Conversation history compacted to fit the context window (with summary).', {
                model: actualModel,
                droppedCount: trim.droppedCount,
                historyBudget,
            });
        }
    }

    const messagesForRequest: ChatMessage[] = [
        { role: 'system', content: budgetedSystemPrompt, internal: true },
        ...budgetedHistory
    ];

    // (3) Dynamic output cap.
    const inputTokens = estimateMessagesTokens(messagesForRequest) + imageTokenReserve;
    const outputBudget = computeOutputBudget(inputTokens, ctxLimits);
    const maxOutputTokens = outputBudget.maxOutputTokens;
    if (outputBudget.tight) {
        logError('Prompt nearly fills the context window — output budget is at the minimum.', {
            model: actualModel,
            contextLength: ctxLimits.contextLength,
            inputTokens,
            maxOutputTokens,
        });
    }
    logInfo('Context budget computed.', {
        model: actualModel,
        contextLength: ctxLimits.contextLength,
        inputTokens,
        maxOutputTokens,
        droppedHistory: droppedHistoryCount,
    });

    return {
        messagesForRequest,
        ctxLimits,
        inputTokens,
        maxOutputTokens,
        systemTokens,
        systemTruncated,
        droppedHistoryCount,
        budgetedHistoryLength: budgetedHistory.length,
        outputBudget,
        modelParamB,
        cappedForSmallModel,
        windowMismatch,
        effectiveContextLength,
    };
}