fix: handle the n_ctx overflow retry inside createStreamingRequest - compress messages at the API-request level instead of re-invoking handlePrompt
This commit is contained in:
+71
-37
@@ -526,38 +526,7 @@ export class AgentExecutor {
|
||||
logInfo('Generation aborted by user.');
|
||||
} else {
|
||||
logError('Stream reading error.', { engine, apiUrl, error: err?.message || String(err) });
|
||||
// LM Studio llama.cpp n_keep > n_ctx 에러 감지 → 자동 압축 재시도
|
||||
const errMsg = String(err?.message || err || '');
|
||||
const nCtxMatch = errMsg.match(/n_keep\s*:\s*(\d+)\s*>=?\s*n_ctx\s*:\s*(\d+)/);
|
||||
if (nCtxMatch && loopDepth < 1) {
|
||||
// loopDepth < 1: 재시도는 최초 1회만 (무한루프 방지)
|
||||
const nCtx = parseInt(nCtxMatch[2], 10);
|
||||
const nKeep = parseInt(nCtxMatch[1], 10);
|
||||
logInfo('n_ctx overflow → auto-retrying with compressed system prompt.', { nKeep, nCtx });
|
||||
// 시스템 프롬프트를 n_ctx - 응답여유(512토큰) 범위로 강제 압축
|
||||
const maxSysChars = Math.max(800, (nCtx - 512)) * 4;
|
||||
const compressedSysPrompt = systemPrompt.length > maxSysChars
|
||||
? systemPrompt.slice(0, maxSysChars) + `\n[System prompt compressed: n_ctx=${nCtx}]`
|
||||
: systemPrompt;
|
||||
this.webview?.postMessage({
|
||||
type: 'streamChunk',
|
||||
value: `\n⚠️ *LM Studio n_ctx=${nCtx} 감지. 컨텍스트 압축 후 재시도 중...*\n`
|
||||
});
|
||||
await this.handlePrompt(prompt, modelName, {
|
||||
...options,
|
||||
loopDepth: loopDepth + 1, // 재시도 플래그
|
||||
runId,
|
||||
systemPrompt: compressedSysPrompt
|
||||
});
|
||||
} else if (nCtxMatch) {
|
||||
// 재시도 이미 했는데도 실패한 경우
|
||||
this.webview?.postMessage({
|
||||
type: 'error',
|
||||
value: `LM Studio n_ctx(${nCtxMatch[2]}) 초과: 압축 재시도도 실패했습니다. LM Studio에서 모델을 재로드하세요.`
|
||||
});
|
||||
} else {
|
||||
this.webview?.postMessage({ type: 'error', value: `Connection lost: ${err.message}` });
|
||||
}
|
||||
this.webview?.postMessage({ type: 'error', value: `Connection lost: ${err.message}` });
|
||||
|
||||
}
|
||||
}
|
||||
@@ -2044,6 +2013,7 @@ export class AgentExecutor {
|
||||
const primaryEngine = resolveEngine(baseUrl);
|
||||
const engines = primaryEngine === 'lmstudio' ? ['lmstudio', 'ollama'] as const : ['ollama', 'lmstudio'] as const;
|
||||
let lastError: Error | null = null;
|
||||
let nCtxRetried = false; // n_ctx 재시도 1회 제한
|
||||
|
||||
for (const engine of engines) {
|
||||
const apiUrl = buildApiUrl(baseUrl, engine, 'chat');
|
||||
@@ -2052,11 +2022,13 @@ export class AgentExecutor {
|
||||
|
||||
for (const candidateModel of modelCandidates) {
|
||||
for (const variant of messageVariants) {
|
||||
const totalChars = variant.messages.reduce((acc, m) => acc + String(m.content || '').length, 0);
|
||||
// 실제 전송할 메시지 (n_ctx 재시도 시 수정됨)
|
||||
let finalMessages = variant.messages;
|
||||
const totalChars = finalMessages.reduce((acc, m) => acc + String(m.content || '').length, 0);
|
||||
const estimatedTokens = Math.ceil(totalChars / 4);
|
||||
const streamBody = {
|
||||
model: candidateModel,
|
||||
messages: variant.messages,
|
||||
messages: finalMessages,
|
||||
stream: true,
|
||||
...(engine === 'lmstudio'
|
||||
? { max_tokens: 4096, temperature }
|
||||
@@ -2065,10 +2037,10 @@ export class AgentExecutor {
|
||||
logInfo('AI streaming request started.', {
|
||||
engine, apiUrl, model: candidateModel,
|
||||
variant: variant.name,
|
||||
messageCount: variant.messages.length,
|
||||
messageCount: finalMessages.length,
|
||||
estimatedTokens,
|
||||
roles: variant.messages.map(message => message.role),
|
||||
firstUserPreview: summarizeText(String(variant.messages.find(message => message.role === 'user')?.content || ''), 300)
|
||||
roles: finalMessages.map(message => message.role),
|
||||
firstUserPreview: summarizeText(String(finalMessages.find(message => message.role === 'user')?.content || ''), 300)
|
||||
});
|
||||
|
||||
try {
|
||||
@@ -2086,6 +2058,68 @@ export class AgentExecutor {
|
||||
|
||||
if (!response.ok) {
|
||||
const errText = await response.text();
|
||||
|
||||
// ── LM Studio n_keep >= n_ctx 에러 감지 및 자동 재시도 ──
|
||||
const nCtxMatch = errText.match(/n_keep\s*:\s*(\d+)\s*>=?\s*n_ctx\s*:\s*(\d+)/);
|
||||
if (nCtxMatch && engine === 'lmstudio' && !nCtxRetried) {
|
||||
nCtxRetried = true;
|
||||
const nCtx = parseInt(nCtxMatch[2], 10);
|
||||
logInfo(`n_ctx overflow detected (n_ctx=${nCtx}). Compressing messages and retrying...`);
|
||||
|
||||
// Truncate the system message so it fits within n_ctx minus the response-token budget
|
||||
const maxResponseTokens = 512;
|
||||
const maxSysTokens = Math.max(500, nCtx - maxResponseTokens);
|
||||
const maxSysChars = maxSysTokens * 4;
|
||||
|
||||
// 히스토리는 마지막 user 메시지만 유지
|
||||
const sysMsg = finalMessages.find(m => m.role === 'system');
|
||||
const lastUserMsg = [...finalMessages].reverse().find(m => m.role === 'user');
|
||||
|
||||
const compressedMessages: ChatMessage[] = [];
|
||||
if (sysMsg) {
|
||||
const sysContent = String(sysMsg.content || '');
|
||||
compressedMessages.push({
|
||||
role: 'system',
|
||||
content: sysContent.length > maxSysChars
|
||||
? sysContent.slice(0, maxSysChars) + `\n[Compressed for n_ctx=${nCtx}]`
|
||||
: sysContent,
|
||||
internal: true
|
||||
});
|
||||
}
|
||||
if (lastUserMsg) {
|
||||
compressedMessages.push(lastUserMsg);
|
||||
}
|
||||
|
||||
// 압축된 메시지로 즉시 재요청
|
||||
const retryBody = {
|
||||
model: candidateModel,
|
||||
messages: compressedMessages.map(m => ({ role: m.role, content: m.content })),
|
||||
stream: true,
|
||||
max_tokens: Math.min(1024, maxResponseTokens),
|
||||
temperature,
|
||||
};
|
||||
|
||||
logInfo('Retrying with compressed context.', {
|
||||
originalTokens: estimatedTokens,
|
||||
compressedTokens: Math.ceil(compressedMessages.reduce((a, m) => a + String(m.content || '').length, 0) / 4),
|
||||
nCtx,
|
||||
messageCount: compressedMessages.length
|
||||
});
|
||||
|
||||
const retryResponse = await fetch(apiUrl, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json', 'Accept': 'text/event-stream', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive' },
|
||||
body: JSON.stringify(retryBody),
|
||||
signal: this.abortController?.signal
|
||||
});
|
||||
|
||||
if (retryResponse.ok) {
|
||||
logInfo('n_ctx retry succeeded.', { apiUrl });
|
||||
return { response: retryResponse, engine, apiUrl };
|
||||
}
|
||||
logError('n_ctx retry also failed.', { status: retryResponse.status });
|
||||
}
|
||||
|
||||
lastError = new Error(`AI Engine error (${engine}/${variant.name}): ${response.status} - ${summarizeText(errText, 300)}`);
|
||||
logError('AI streaming request returned non-OK status.', { engine, variant: variant.name, apiUrl, status: response.status, body: summarizeText(errText, 500) });
|
||||
continue;
|
||||
|
||||
Reference in New Issue
Block a user