Update project files

2026-05-22 15:00:14 +09:00
parent 132d130ff1
commit 8016ef18fa
29 changed files with 1353 additions and 804 deletions
@@ -183,6 +183,25 @@ export class AgentExecutor {
    static readonly ABS_PATH_RE = new RegExp(POSIX_ABS_PATH_SRC, 'i');
    static readonly WIN_ABS_PATH_RE = new RegExp(WIN_ABS_PATH_SRC, 'i');

+    /**
+     * Hard cap on retained in-memory chat messages. When `chatHistory` exceeds
+     * this, `capChatHistory` drops the oldest entries; a leading message with
+     * role 'system' is kept, any other first message is dropped with the rest.
+     * Intended to leave normal sessions untouched and only stop unbounded growth
+     * in very long-running ones. The per-request context budgeter
+     * (`trimHistoryToBudget`) still does the real fitting.
+     */
+    private static readonly MAX_RETAINED_MESSAGES = 40;
+    /**
+     * Thresholds for shrinking old tool-result messages (read_file / list_files /
+     * list_brain / read_brain dumps), the bulkiest part of history.
+     * `RECENT_FULL_MESSAGES` is how many of the most recent messages are always
+     * kept in full; internal tool results older than that are truncated to
+     * `OLD_TOOL_RESULT_CAP` characters (plus a short truncation notice).
+     */
+    private static readonly RECENT_FULL_MESSAGES = 16;
+    private static readonly OLD_TOOL_RESULT_CAP = 600;
+
    private chatHistory: ChatMessage[] = [];
    private abortController: AbortController | null = null;
    private webview: vscode.Webview | undefined;
@@ -225,9 +244,10 @@ export class AgentExecutor {

        // Initialize 5-Layer Cognitive Memory System
        const activeBrain = getActiveBrainProfile();
+        const initConfig = getConfig();
        this.memoryManager = new MemoryManager(activeBrain.localBrainPath, {
-            enabled: getConfig().memoryEnabled,
-            shortTermLimit: getConfig().memoryShortTermMessages,
+            enabled: initConfig.memoryEnabled,
+            shortTermLimit: initConfig.memoryShortTermMessages,
        });

        // Initialize RAG Pipeline Orchestrator
@@ -495,6 +515,9 @@ export class AgentExecutor {

            // 3. API Request Setup (라인 229에서 이미 추출한 ollamaUrl, configDefaultModel 재사용)
            const actualModel = (modelName && modelName.trim()) || configDefaultModel;
+            // Bound the in-memory history before building the request — shrinks bulky
+            // older tool-result bodies and drops the oldest messages past the cap.
+            this.capChatHistory();
            const reqMessages = this.buildRequestHistory(this.chatHistory);

            // Handle Vision Content Injection
@@ -666,10 +689,22 @@ export class AgentExecutor {
                .reduce((n, m) => n + (Array.isArray(m?.images) ? m.images.length : 0), 0);
            const imageTokenReserve = imageCount * 1024;

+            // Output budget we ACTUALLY reserve before trimming — not the bare
+            // minOutputTokens floor (512). If we only reserve 512, a long session
+            // is allowed to grow the prompt until ~512-1k tokens remain for the
+            // answer; small/MoE local models (e.g. gemma 4B-active) then emit EOS
+            // as the first token and return an empty response. Reserving ~10% of
+            // the window (>=2048) forces history/system trimming to keep a real
+            // answer-sized hole open. Capped at maxOutputTokens.
+            const preferredOutputReserve = Math.min(
+                ctxLimits.maxOutputTokens,
+                Math.max(2048, Math.floor(ctxLimits.contextLength * 0.1))
+            );
+
            // (1) 시스템 프롬프트는 예산의 ~65%까지만 허용 — 그 이상이면 [CONTEXT] 블록부터 잘라낸다.
            const systemCapTokens = Math.max(
                1024,
-                Math.floor((ctxLimits.contextLength - ctxLimits.safetyMargin - ctxLimits.minOutputTokens - imageTokenReserve) * 0.65)
+                Math.floor((ctxLimits.contextLength - ctxLimits.safetyMargin - preferredOutputReserve - imageTokenReserve) * 0.65)
            );
            const { prompt: budgetedSystemPrompt, truncated: systemTruncated } =
                truncateSystemPromptContext(fullSystemPrompt, systemCapTokens);
@@ -681,7 +716,7 @@ export class AgentExecutor {
            // (2) 대화 기록 압축.
            const historyBudget = Math.max(
                256,
-                ctxLimits.contextLength - systemTokens - ctxLimits.safetyMargin - ctxLimits.minOutputTokens - imageTokenReserve
+                ctxLimits.contextLength - systemTokens - ctxLimits.safetyMargin - preferredOutputReserve - imageTokenReserve
            );
            let budgetedHistory: ChatMessage[] = reqMessages;
            if (config.autoCompactHistory) {
@@ -1977,6 +2012,50 @@ export class AgentExecutor {
        ].join('\n');
    }

+    /**
+     * Bounds the in-memory `chatHistory` in place so a long-running session
+     * cannot grow it without limit. Two passes:
+     *   1. Internal system messages older than the last `RECENT_FULL_MESSAGES`
+     *      whose content starts with `[Result of read_file|list_files|
+     *      list_brain|read_brain` are cut to `OLD_TOOL_RESULT_CAP` characters
+     *      plus a truncation notice. Already-shrunk messages are skipped, and
+     *      the cut never lands inside a UTF-16 surrogate pair.
+     *   2. If more than `MAX_RETAINED_MESSAGES` remain, the oldest are spliced
+     *      out; a leading message with role 'system' is kept in slot 0.
+     * User/assistant messages are never rewritten, only dropped by pass 2.
+     */
+    private capChatHistory(): void {
+        const history = this.chatHistory;
+        if (history.length === 0) return;
+        const cap = AgentExecutor.OLD_TOOL_RESULT_CAP;
+        const notice = '\n…[이전 도구 결과는 컨텍스트 절약을 위해 축약되었습니다]';
+
+        // (1) Shrink bulky tool-result bodies outside the recent window.
+        const recentStart = Math.max(0, history.length - AgentExecutor.RECENT_FULL_MESSAGES);
+        for (let i = 0; i < recentStart; i++) {
+            const msg = history[i];
+            if (msg.role !== 'system' || !msg.internal || typeof msg.content !== 'string') continue;
+            // Only the bulky tool-result dumps — leave compaction notices etc. alone.
+            if (!/^\[Result of (read_file|list_files|list_brain|read_brain)\b/.test(msg.content)) continue;
+            const body = msg.content;
+            if (body.length <= cap) continue;
+            // Shrunk on an earlier call: re-slicing would only rebuild the same string.
+            if (body.length <= cap + notice.length && body.endsWith(notice)) continue;
+            // Back off one code unit when the cut would strand a high surrogate.
+            const last = body.charCodeAt(cap - 1);
+            const cut = last >= 0xd800 && last <= 0xdbff ? cap - 1 : cap;
+            msg.content = body.slice(0, cut) + notice;
+        }
+
+        // (2) Drop the oldest messages once over the hard cap. Slot 0 survives only
+        //     when it is a system message; NOTE(review): an internal tool-result
+        //     that ends up in slot 0 also has role 'system' and would be pinned.
+        const overflow = history.length - AgentExecutor.MAX_RETAINED_MESSAGES;
+        if (overflow > 0) {
+            history.splice(history[0].role === 'system' ? 1 : 0, overflow);
+        }
+    }
+
    private buildRequestHistory(history: ChatMessage[]): ChatMessage[] {
        return history.map((message) => {
            if (message.role !== 'assistant' || typeof message.content !== 'string') {