connectai/src/agent/handlePrompt/computeBudgetedRequest.ts

import { logInfo, logError } from '../../utils';
import type { ChatMessage } from '../../agent';
import {
    estimateTokens,
    estimateMessagesTokens,
    computeOutputBudget,
    trimHistoryToBudget,
    truncateSystemPromptContext,
    estimateModelParamsB,
    type ContextLimits,
} from '../../lib/contextManager';
import { buildDroppedHistorySummary } from '../../lib/contextBuilders/droppedHistorySummary';

/** Everything `computeBudgetedRequest` needs to fit one request into the context window. */
export interface ComputeBudgetedRequestInput {
    /**
     * Complete system prompt before budgeting. It is passed through
     * `truncateSystemPromptContext` and the result becomes the first message of the request.
     */
    fullSystemPrompt: string;
    /** Caller is expected to have run `capChatHistory` on this already. */
    reqMessages: ChatMessage[];
    /** Model identifier; fed to `estimateModelParamsB` and attached to every log entry. */
    actualModel: string;
    /** Result of `getConfig()` — reads contextLength, maxOutputTokens, contextSafetyMargin, smallModelContextCap, autoCompactHistory. */
    config: any;
    /** Number of images in the request; 1024 tokens of input budget are reserved per image. */
    imageCount: number;
    /**
     * The model's *actually-loaded* context window (LM Studio `getContextLength()`),
     * when known. Budgeting uses the smaller of this and `config.contextLength` so we
     * never overflow a model loaded with a smaller window than the user's setting.
     * Omit (undefined) to budget against the configured value alone (prior behavior).
     * Values that are not finite positive numbers are treated the same as undefined.
     */
    actualContextLength?: number;
}

/** Outcome of `computeBudgetedRequest`: the messages to send plus the numbers behind the budget. */
export interface ComputeBudgetedRequestResult {
    /** The (possibly truncated) system prompt as a system message, followed by the budgeted history. */
    messagesForRequest: ChatMessage[];
    /** Limits used for budgeting: effective window, configured max output, safety margin, and a 512-token output floor. */
    ctxLimits: ContextLimits;
    /** `estimateMessagesTokens(messagesForRequest)` plus the per-image token reserve. */
    inputTokens: number;
    /** Output-token cap to send with the request; same value as `outputBudget.maxOutputTokens`. */
    maxOutputTokens: number;
    /** `estimateTokens` of the budgeted system prompt plus a fixed 4-token allowance. */
    systemTokens: number;
    /** The `truncated` flag returned by `truncateSystemPromptContext`. */
    systemTruncated: boolean;
    /** History messages dropped while fitting the budget, as reported by `computeBudgetedRequest`. */
    droppedHistoryCount: number;
    /** Length of the history array placed after the system message in `messagesForRequest`. */
    budgetedHistoryLength: number;
    /** Exact return shape of `computeOutputBudget`. */
    outputBudget: { maxOutputTokens: number; available: number; tight: boolean };
    /** Return value of `estimateModelParamsB(actualModel)`; `null` disables the small-model cap. */
    modelParamB: number | null;
    /**
     * True when `config.smallModelContextCap` is > 0, `modelParamB` is non-null and <= 3,
     * and the window would otherwise exceed the cap.
     */
    cappedForSmallModel: boolean;
    /** True when the model's real loaded window is smaller than `config.contextLength` (we clamped to the real one). */
    windowMismatch: boolean;
    /** The window actually used for budgeting (after real-window clamp + small-model cap). */
    effectiveContextLength: number;
}

/**
 * Fits the request input (system prompt + conversation history + images) into the
 * context-window budget and computes the final request message array together with
 * the dynamic output-token cap.
 *
 * The caller is expected to have capped the message count with `capChatHistory`
 * beforehand (static limits such as AgentExecutor.MAX_RETAINED_MESSAGES are not this
 * function's concern).
 *
 * @param input - System prompt, history, model name, config and image count; see
 *   `ComputeBudgetedRequestInput`.
 * @returns The messages to send plus every intermediate budgeting figure. Note that
 *   `droppedHistoryCount` is the number of original history messages removed by
 *   compaction; the summary message inserted in their place is not subtracted from it.
 */
export function computeBudgetedRequest(input: ComputeBudgetedRequestInput): ComputeBudgetedRequestResult {
    const { fullSystemPrompt, reqMessages, actualModel, config, imageCount } = input;

    // ──────────────────────────────────────────────────────────────────
    // [Context Limit Manager] The context length does NOT mean "the answer may be
    // that long": system prompt + history + input + generated answer + margin
    // must all fit within the context length.
    // Before sending the request we estimate the input tokens and then
    //   (1) if the system prompt is oversized, shrink its [CONTEXT] block as a last resort,
    //   (2) compact the conversation history to the remaining budget
    //       (the chatHistory used for UI display is left untouched),
    //   (3) compute the output cap (maxOutputTokens) dynamically.
    // ──────────────────────────────────────────────────────────────────
    // Optional opt-in guard (g1nation.smallModelContextCap, OFF/0 by default): some very small
    // models (≤3B) emit EOS as the first token when the prompt is near their context window
    // even though it nominally fits. If the user opted in, budget ≤3B models against that
    // smaller effective window. Never applied to 4B+ models, and never when the setting is 0 —
    // capping squeezes the output-token budget, so it's a knob, not a default.
    const modelParamB = estimateModelParamsB(actualModel);

    // The real ceiling is whatever window the model was actually loaded with — the
    // server truncates anything past it. When known, clamp the configured setting
    // down to it so we budget against the smaller of the two. (When unknown, keep
    // the configured value — prior behavior.)
    const actualWindow = (typeof input.actualContextLength === 'number'
        && Number.isFinite(input.actualContextLength)
        && input.actualContextLength > 0)
        ? input.actualContextLength
        : undefined;
    const configuredWindow = config.contextLength;
    const windowMismatch = actualWindow !== undefined && actualWindow < configuredWindow;
    const realWindow = actualWindow !== undefined ? Math.min(configuredWindow, actualWindow) : configuredWindow;
    if (windowMismatch) {
        logInfo('Model loaded with a smaller context window than the setting — clamping budget to the real window.', {
            model: actualModel, configuredWindow, actualWindow,
        });
    }

    const smallModelCap = config.smallModelContextCap; // 0 = disabled (default)
    const cappedForSmallModel = smallModelCap > 0
        && modelParamB !== null && modelParamB <= 3
        && realWindow > smallModelCap;
    const effectiveContextLength = cappedForSmallModel ? smallModelCap : realWindow;
    if (cappedForSmallModel) {
        logInfo('Small model detected — capping effective context window for budgeting.', {
            model: actualModel, paramB: modelParamB,
            nominalContext: realWindow, effectiveContext: effectiveContextLength,
        });
    }
    const ctxLimits: ContextLimits = {
        contextLength: effectiveContextLength,
        maxOutputTokens: config.maxOutputTokens,
        safetyMargin: config.contextSafetyMargin,
        minOutputTokens: 512,
    };

    // Flat per-image allowance added to the input estimate.
    const tokensReservedPerImage = 1024;
    const imageTokenReserve = imageCount * tokensReservedPerImage;

    // Output budget we ACTUALLY reserve before trimming — not the bare
    // minOutputTokens floor (512). If we only reserve 512, a long session
    // is allowed to grow the prompt until ~512-1k tokens remain for the
    // answer; small/MoE local models (e.g. gemma 4B-active) then emit EOS
    // as the first token and return an empty response. Reserving ~10% of
    // the window (>=2048) forces history/system trimming to keep a real
    // answer-sized hole open. Capped at maxOutputTokens.
    const preferredOutputReserve = Math.min(
        ctxLimits.maxOutputTokens,
        Math.max(2048, Math.floor(ctxLimits.contextLength * 0.1))
    );

    // (1) The system prompt may use at most ~65% of the budget — beyond that the
    //     [CONTEXT] block is cut first.
    const systemCapTokens = Math.max(
        1024,
        Math.floor((ctxLimits.contextLength - ctxLimits.safetyMargin - preferredOutputReserve - imageTokenReserve) * 0.65)
    );
    const { prompt: budgetedSystemPrompt, truncated: systemTruncated } =
        truncateSystemPromptContext(fullSystemPrompt, systemCapTokens);
    if (systemTruncated) {
        logInfo('System prompt context truncated to fit the context window.', { model: actualModel, systemCapTokens });
    }
    const systemTokens = estimateTokens(budgetedSystemPrompt) + 4;

    // (2) Compact the conversation history.
    const historyBudget = Math.max(
        256,
        ctxLimits.contextLength - systemTokens - ctxLimits.safetyMargin - preferredOutputReserve - imageTokenReserve
    );
    let budgetedHistory: ChatMessage[] = reqMessages;
    // Number of original messages removed by compaction. Taken from the trimmer's own
    // count rather than from a length difference: the summary message that replaces the
    // dropped turns is part of the trimmed array, so subtracting lengths under-reports.
    let droppedHistoryCount = 0;
    if (config.autoCompactHistory) {
        // v2.2.69 — receive the dropped messages, build a heuristic summary, and prepend it
        // as a single system message. A bare count marker told the model nothing about what
        // had been discussed earlier, which caused a regression where the model lost context
        // on follow-up turns. Now the gist of U1/A1/U2/A2 survives, so the sliding window works.
        const trim = trimHistoryToBudget<ChatMessage>(reqMessages, historyBudget, (_n, dropped) => ({
            role: 'system',
            content: buildDroppedHistorySummary(dropped),
            internal: true,
        }));
        budgetedHistory = trim.messages;
        droppedHistoryCount = trim.droppedCount;
        if (droppedHistoryCount > 0) {
            logInfo('Conversation history compacted to fit the context window (with summary).', {
                model: actualModel, droppedCount: droppedHistoryCount, historyBudget,
            });
        }
    }

    const messagesForRequest: ChatMessage[] = [
        { role: 'system', content: budgetedSystemPrompt, internal: true },
        ...budgetedHistory
    ];

    // (3) Dynamic output cap.
    const inputTokens = estimateMessagesTokens(messagesForRequest) + imageTokenReserve;
    const outputBudget = computeOutputBudget(inputTokens, ctxLimits);
    const maxOutputTokens = outputBudget.maxOutputTokens;
    if (outputBudget.tight) {
        logError('Prompt nearly fills the context window — output budget is at the minimum.', {
            model: actualModel, contextLength: ctxLimits.contextLength, inputTokens, maxOutputTokens,
        });
    }
    logInfo('Context budget computed.', {
        model: actualModel, contextLength: ctxLimits.contextLength,
        inputTokens, maxOutputTokens, droppedHistory: droppedHistoryCount,
    });

    return {
        messagesForRequest,
        ctxLimits,
        inputTokens,
        maxOutputTokens,
        systemTokens,
        systemTruncated,
        droppedHistoryCount,
        budgetedHistoryLength: budgetedHistory.length,
        outputBudget,
        modelParamB,
        cappedForSmallModel,
        windowMismatch,
        effectiveContextLength,
    };
}