Update project files
This commit is contained in:
+83
-4
@@ -183,6 +183,25 @@ export class AgentExecutor {
|
||||
static readonly ABS_PATH_RE = new RegExp(POSIX_ABS_PATH_SRC, 'i');
|
||||
static readonly WIN_ABS_PATH_RE = new RegExp(WIN_ABS_PATH_SRC, 'i');
|
||||
|
||||
/**
|
||||
* Hard cap on retained in-memory chat messages. Older messages beyond this
|
||||
* are dropped (a leading system-role message, if present, is preserved). Generous so a
|
||||
* normal session is untouched — this only fights unbounded growth in very
|
||||
* long-running sessions. The per-request context budgeter
|
||||
* (`trimHistoryToBudget`) still does the real fitting; this just stops the
|
||||
* array itself from leaking memory across hundreds of turns.
|
||||
*/
|
||||
private static readonly MAX_RETAINED_MESSAGES = 40;
|
||||
/**
|
||||
* Older internal tool-result messages (read_file / list_files / list_brain /
|
||||
* read_brain dumps) are the bulkiest part of history and add little once the
|
||||
* conversation has moved on. Anything older than the most recent
|
||||
* `RECENT_FULL_MESSAGES` gets its bulky tool-result content shrunk to this
|
||||
* many characters (`OLD_TOOL_RESULT_CAP`). Recent messages are kept full for conversation continuity.
|
||||
*/
|
||||
private static readonly RECENT_FULL_MESSAGES = 16;
|
||||
private static readonly OLD_TOOL_RESULT_CAP = 600;
|
||||
|
||||
private chatHistory: ChatMessage[] = [];
|
||||
private abortController: AbortController | null = null;
|
||||
private webview: vscode.Webview | undefined;
|
||||
@@ -225,9 +244,10 @@ export class AgentExecutor {
|
||||
|
||||
// Initialize 5-Layer Cognitive Memory System
|
||||
const activeBrain = getActiveBrainProfile();
|
||||
const initConfig = getConfig();
|
||||
this.memoryManager = new MemoryManager(activeBrain.localBrainPath, {
|
||||
enabled: getConfig().memoryEnabled,
|
||||
shortTermLimit: getConfig().memoryShortTermMessages,
|
||||
enabled: initConfig.memoryEnabled,
|
||||
shortTermLimit: initConfig.memoryShortTermMessages,
|
||||
});
|
||||
|
||||
// Initialize RAG Pipeline Orchestrator
|
||||
@@ -495,6 +515,9 @@ export class AgentExecutor {
|
||||
|
||||
// 3. API Request Setup (라인 229에서 이미 추출한 ollamaUrl, configDefaultModel 재사용)
|
||||
const actualModel = (modelName && modelName.trim()) || configDefaultModel;
|
||||
// Bound the in-memory history before building the request — shrinks bulky
|
||||
// older tool-result bodies and drops the oldest messages past the cap.
|
||||
this.capChatHistory();
|
||||
const reqMessages = this.buildRequestHistory(this.chatHistory);
|
||||
|
||||
// Handle Vision Content Injection
|
||||
@@ -666,10 +689,22 @@ export class AgentExecutor {
|
||||
.reduce((n, m) => n + (Array.isArray(m?.images) ? m.images.length : 0), 0);
|
||||
const imageTokenReserve = imageCount * 1024;
|
||||
|
||||
// Output budget we ACTUALLY reserve before trimming — not the bare
|
||||
// minOutputTokens floor (512). If we only reserve 512, a long session
|
||||
// is allowed to grow the prompt until ~512-1k tokens remain for the
|
||||
// answer; small/MoE local models (e.g. gemma 4B-active) then emit EOS
|
||||
// as the first token and return an empty response. Reserving ~10% of
|
||||
// the window (>=2048) forces history/system trimming to keep a real
|
||||
// answer-sized hole open. Capped at maxOutputTokens.
|
||||
const preferredOutputReserve = Math.min(
|
||||
ctxLimits.maxOutputTokens,
|
||||
Math.max(2048, Math.floor(ctxLimits.contextLength * 0.1))
|
||||
);
|
||||
|
||||
// (1) 시스템 프롬프트는 예산의 ~65%까지만 허용 — 그 이상이면 [CONTEXT] 블록부터 잘라낸다.
|
||||
const systemCapTokens = Math.max(
|
||||
1024,
|
||||
Math.floor((ctxLimits.contextLength - ctxLimits.safetyMargin - ctxLimits.minOutputTokens - imageTokenReserve) * 0.65)
|
||||
Math.floor((ctxLimits.contextLength - ctxLimits.safetyMargin - preferredOutputReserve - imageTokenReserve) * 0.65)
|
||||
);
|
||||
const { prompt: budgetedSystemPrompt, truncated: systemTruncated } =
|
||||
truncateSystemPromptContext(fullSystemPrompt, systemCapTokens);
|
||||
@@ -681,7 +716,7 @@ export class AgentExecutor {
|
||||
// (2) 대화 기록 압축.
|
||||
const historyBudget = Math.max(
|
||||
256,
|
||||
ctxLimits.contextLength - systemTokens - ctxLimits.safetyMargin - ctxLimits.minOutputTokens - imageTokenReserve
|
||||
ctxLimits.contextLength - systemTokens - ctxLimits.safetyMargin - preferredOutputReserve - imageTokenReserve
|
||||
);
|
||||
let budgetedHistory: ChatMessage[] = reqMessages;
|
||||
if (config.autoCompactHistory) {
|
||||
@@ -1977,6 +2012,50 @@ export class AgentExecutor {
|
||||
].join('\n');
|
||||
}
|
||||
|
||||
/**
 * Bound the in-memory `chatHistory` so a very long-running session does not
 * grow it without limit. Mutates `this.chatHistory` (and the affected message
 * objects) in place; returns nothing.
 *
 *  1. Shrink: every internal system message older than the most recent
 *     `RECENT_FULL_MESSAGES` whose content starts with a
 *     `[Result of read_file|list_files|list_brain|read_brain` header is cut to
 *     at most `OLD_TOOL_RESULT_CAP` characters, followed by a truncation
 *     notice. Other internal messages (compaction notices etc.) and all
 *     messages in the recent window are left untouched.
 *  2. Drop: if the array is still longer than `MAX_RETAINED_MESSAGES`, the
 *     oldest entries are removed. A leading system-role message is kept in
 *     place, unless it is itself one of the bulky tool-result dumps above.
 *
 * The shrink step is idempotent: a message that already carries the
 * truncation notice is skipped, so repeated calls do no further work on it.
 */
private capChatHistory(): void {
    const history = this.chatHistory;
    if (history.length === 0) return;

    const cap = AgentExecutor.OLD_TOOL_RESULT_CAP;
    // Runtime marker appended to shrunk bodies (Korean: "earlier tool results
    // were abbreviated to save context").
    const truncationNotice = '\n…[이전 도구 결과는 컨텍스트 절약을 위해 축약되었습니다]';
    // Hoisted so the pattern is not rebuilt on every loop iteration.
    const bulkyToolResultRe = /^\[Result of (read_file|list_files|list_brain|read_brain)\b/;

    // (1) Shrink bulky tool-result bodies of older internal messages.
    const recentStart = Math.max(0, history.length - AgentExecutor.RECENT_FULL_MESSAGES);
    for (let i = 0; i < recentStart; i++) {
        const msg = history[i];
        if (!msg) continue;
        const content = msg.content;
        if (msg.role !== 'system' || !msg.internal || typeof content !== 'string') continue;
        // Only the bulky tool-result dumps — leave compaction notices etc. alone.
        if (!bulkyToolResultRe.test(content)) continue;
        if (content.length <= cap) continue;
        // Already shrunk by an earlier call: nothing left to do.
        if (content.endsWith(truncationNotice) && content.length <= cap + truncationNotice.length) {
            continue;
        }

        // Never cut between the two halves of a UTF-16 surrogate pair; a lone
        // high surrogate would leave a malformed string in the history.
        let cut = cap;
        if (cut > 0) {
            const lastUnit = content.charCodeAt(cut - 1);
            if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
        }
        msg.content = content.slice(0, cut) + truncationNotice;
    }

    // (2) Drop the oldest messages once over the hard cap.
    const overflow = history.length - AgentExecutor.MAX_RETAINED_MESSAGES;
    if (overflow <= 0) return;

    const first = history[0];
    // A tool-result dump that merely ended up at index 0 after an earlier drop
    // is not conversation framing and must not be pinned forever.
    const firstIsToolDump =
        first !== undefined &&
        first.role === 'system' &&
        !!first.internal &&
        typeof first.content === 'string' &&
        bulkyToolResultRe.test(first.content);
    const preserveFirst = first !== undefined && first.role === 'system' && !firstIsToolDump;

    // Skip index 0 when it is preserved; otherwise drop from the very front.
    history.splice(preserveFirst ? 1 : 0, overflow);
}
|
||||
|
||||
private buildRequestHistory(history: ChatMessage[]): ChatMessage[] {
|
||||
return history.map((message) => {
|
||||
if (message.role !== 'assistant' || typeof message.content !== 'string') {
|
||||
|
||||
Reference in New Issue
Block a user