chore: bump version to 2.80.17 and refine agent streaming logic
This commit is contained in:
+1
-1
@@ -2,7 +2,7 @@
|
|||||||
"name": "astra",
|
"name": "astra",
|
||||||
"displayName": "Astra",
|
"displayName": "Astra",
|
||||||
"description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
|
"description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
|
||||||
"version": "2.80.16",
|
"version": "2.80.17",
|
||||||
"publisher": "g1nation",
|
"publisher": "g1nation",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"icon": "assets/icon.png",
|
"icon": "assets/icon.png",
|
||||||
|
|||||||
+48
-309
@@ -451,74 +451,35 @@ export class AgentExecutor {
|
|||||||
if (this.isStaleRun(runId)) return;
|
if (this.isStaleRun(runId)) return;
|
||||||
|
|
||||||
let aiResponseText = '';
|
let aiResponseText = '';
|
||||||
const body = response.body as any;
|
const reader = response.body?.getReader();
|
||||||
if (!body) throw new Error("Response body is null.");
|
if (!reader) throw new Error("Response body is not readable.");
|
||||||
|
|
||||||
if (loopDepth === 0) this.webview?.postMessage({ type: 'streamStart' });
|
if (loopDepth === 0) this.webview.postMessage({ type: 'streamStart' });
|
||||||
|
|
||||||
let buffer = '';
|
let buffer = '';
|
||||||
const decoder = new TextDecoder();
|
const decoder = new TextDecoder();
|
||||||
|
|
||||||
const processChunk = (value: any) => {
|
|
||||||
if (this.isStaleRun(runId)) return false;
|
|
||||||
|
|
||||||
buffer += decoder.decode(value, { stream: true });
|
|
||||||
const lines = buffer.split('\n');
|
|
||||||
buffer = lines.pop() || '';
|
|
||||||
|
|
||||||
for (const line of lines) {
|
|
||||||
const trimmed = line.trim();
|
|
||||||
if (!trimmed || trimmed === 'data: [DONE]') continue;
|
|
||||||
|
|
||||||
try {
|
|
||||||
let raw = trimmed;
|
|
||||||
if (trimmed.startsWith('data:')) {
|
|
||||||
raw = trimmed.replace(/^data:\s*/, '');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!raw || raw === '[DONE]') continue;
|
|
||||||
|
|
||||||
const json = JSON.parse(raw);
|
|
||||||
if (json.error) {
|
|
||||||
const errMsg = typeof json.error === 'string' ? json.error : (json.error.message || JSON.stringify(json.error));
|
|
||||||
throw new Error(`AI Engine Error: ${errMsg}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
let token = '';
|
|
||||||
if (json.choices?.[0]) {
|
|
||||||
const choice = json.choices[0];
|
|
||||||
token = choice.delta?.content || choice.message?.content || choice.text || '';
|
|
||||||
} else if (json.message?.content) {
|
|
||||||
token = json.message.content;
|
|
||||||
} else if (json.response) {
|
|
||||||
token = json.response;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (token) {
|
|
||||||
aiResponseText += token;
|
|
||||||
if (loopDepth === 0) {
|
|
||||||
this.webview?.postMessage({ type: 'streamUpdate', value: token });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (e: any) {
|
|
||||||
// Silent fail for non-JSON lines unless it's an AI Engine Error
|
|
||||||
if (e.message.startsWith('AI Engine Error:')) throw e;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
};
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (typeof body[Symbol.asyncIterator] === 'function') {
|
while (true) {
|
||||||
for await (const chunk of body) {
|
const { done, value } = await reader.read();
|
||||||
if (!processChunk(chunk)) break;
|
if (done) break;
|
||||||
}
|
if (this.isStaleRun(runId)) return;
|
||||||
} else {
|
|
||||||
const reader = body.getReader();
|
buffer += decoder.decode(value, { stream: true });
|
||||||
while (true) {
|
const lines = buffer.split('\n');
|
||||||
const { done, value } = await reader.read();
|
buffer = lines.pop() || '';
|
||||||
if (done) break;
|
for (const line of lines) {
|
||||||
if (!processChunk(value)) break;
|
const trimmed = line.trim();
|
||||||
|
if (!trimmed || trimmed === 'data: [DONE]') continue;
|
||||||
|
try {
|
||||||
|
const raw = trimmed.startsWith('data: ') ? trimmed.slice(6) : trimmed;
|
||||||
|
const json = JSON.parse(raw);
|
||||||
|
const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
|
||||||
|
if (token) {
|
||||||
|
aiResponseText += token;
|
||||||
|
}
|
||||||
|
} catch (e: any) {
|
||||||
|
logError('Failed to parse streaming chunk.', { engine, apiUrl, chunk: summarizeText(trimmed, 300), error: e?.message || String(e) });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
@@ -527,7 +488,6 @@ export class AgentExecutor {
|
|||||||
} else {
|
} else {
|
||||||
logError('Stream reading error.', { engine, apiUrl, error: err?.message || String(err) });
|
logError('Stream reading error.', { engine, apiUrl, error: err?.message || String(err) });
|
||||||
this.webview?.postMessage({ type: 'error', value: `Connection lost: ${err.message}` });
|
this.webview?.postMessage({ type: 'error', value: `Connection lost: ${err.message}` });
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -535,87 +495,17 @@ export class AgentExecutor {
|
|||||||
if (buffer.trim() && buffer.trim() !== 'data: [DONE]') {
|
if (buffer.trim() && buffer.trim() !== 'data: [DONE]') {
|
||||||
try {
|
try {
|
||||||
const trimmed = buffer.trim();
|
const trimmed = buffer.trim();
|
||||||
let raw = trimmed;
|
const raw = trimmed.startsWith('data: ') ? trimmed.slice(6) : trimmed;
|
||||||
if (trimmed.startsWith('data:')) {
|
const json = JSON.parse(raw);
|
||||||
raw = trimmed.replace(/^data:\s*/, '');
|
const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
|
||||||
}
|
if (token) {
|
||||||
|
aiResponseText += token;
|
||||||
if (raw && raw !== '[DONE]') {
|
|
||||||
const json = JSON.parse(raw);
|
|
||||||
if (json.error) {
|
|
||||||
const errMsg = typeof json.error === 'string' ? json.error : (json.error.message || JSON.stringify(json.error));
|
|
||||||
throw new Error(`AI Engine Error: ${errMsg}`);
|
|
||||||
}
|
|
||||||
let token = '';
|
|
||||||
if (json.choices?.[0]) {
|
|
||||||
const choice = json.choices[0];
|
|
||||||
token = choice.delta?.content || choice.message?.content || choice.text || '';
|
|
||||||
} else if (json.message?.content) {
|
|
||||||
token = json.message.content;
|
|
||||||
} else if (json.response) {
|
|
||||||
token = json.response;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (token) {
|
|
||||||
aiResponseText += token;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} catch (e: any) {
|
} catch (e: any) {
|
||||||
logError('Failed to parse final streaming buffer.', { engine, apiUrl, buffer: summarizeText(buffer, 300), error: e?.message || String(e) });
|
logError('Failed to parse final streaming buffer.', { engine, apiUrl, buffer: summarizeText(buffer, 300), error: e?.message || String(e) });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 4.1 Check for Ghost Response (Empty response from LM Studio/Ollama despite 200 OK)
|
|
||||||
if (!aiResponseText.trim() && request.engine === 'lmstudio' && loopDepth === 0) {
|
|
||||||
logInfo('Empty response detected from LM Studio. Retrying with extreme compression...', { model: actualModel });
|
|
||||||
|
|
||||||
// Force extreme compression: system + last user only
|
|
||||||
const sysMsg = messagesForRequest.find(m => m.role === 'system');
|
|
||||||
const lastUserMsg = [...messagesForRequest].reverse().find(m => m.role === 'user');
|
|
||||||
const extremeMessages = [
|
|
||||||
...(sysMsg ? [sysMsg] : []),
|
|
||||||
...(lastUserMsg ? [lastUserMsg] : [])
|
|
||||||
];
|
|
||||||
|
|
||||||
const retryRequest = await this.createStreamingRequest({
|
|
||||||
baseUrl: ollamaUrl,
|
|
||||||
modelName: actualModel,
|
|
||||||
reqMessages: extremeMessages,
|
|
||||||
temperature
|
|
||||||
});
|
|
||||||
|
|
||||||
if (retryRequest.response.ok) {
|
|
||||||
const retryBody = retryRequest.response.body as any;
|
|
||||||
const retryDecoder = new TextDecoder();
|
|
||||||
let retryBuffer = '';
|
|
||||||
|
|
||||||
// Simple stream reader for retry
|
|
||||||
const reader = retryBody.getReader();
|
|
||||||
while (true) {
|
|
||||||
const { done, value } = await reader.read();
|
|
||||||
if (done) break;
|
|
||||||
retryBuffer += retryDecoder.decode(value, { stream: true });
|
|
||||||
// ... simplified parsing for retry ...
|
|
||||||
const lines = retryBuffer.split('\n');
|
|
||||||
retryBuffer = lines.pop() || '';
|
|
||||||
for (const line of lines) {
|
|
||||||
const trimmed = line.trim();
|
|
||||||
if (!trimmed || trimmed === 'data: [DONE]') continue;
|
|
||||||
try {
|
|
||||||
const raw = trimmed.startsWith('data:') ? trimmed.replace(/^data:\s*/, '') : trimmed;
|
|
||||||
if (!raw || raw === '[DONE]') continue;
|
|
||||||
const json = JSON.parse(raw);
|
|
||||||
const token = json.choices?.[0]?.delta?.content || json.message?.content || json.response || '';
|
|
||||||
if (token) {
|
|
||||||
aiResponseText += token;
|
|
||||||
this.webview?.postMessage({ type: 'streamUpdate', value: token });
|
|
||||||
}
|
|
||||||
} catch {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.isStaleRun(runId)) return;
|
if (this.isStaleRun(runId)) return;
|
||||||
if (requestTimeoutHandle) {
|
if (requestTimeoutHandle) {
|
||||||
clearTimeout(requestTimeoutHandle);
|
clearTimeout(requestTimeoutHandle);
|
||||||
@@ -673,33 +563,14 @@ export class AgentExecutor {
|
|||||||
this.statusBarManager.updateStatus(AgentStatus.Executing);
|
this.statusBarManager.updateStatus(AgentStatus.Executing);
|
||||||
const report = await this.executeActions(aiResponseText, rootPath, activeBrain);
|
const report = await this.executeActions(aiResponseText, rootPath, activeBrain);
|
||||||
if (!assistantContent.trim() && report.length === 0) {
|
if (!assistantContent.trim() && report.length === 0) {
|
||||||
// 실제 전송에 사용된 메시지(request.finalMessages)를 기준으로 토큰 재계산
|
logError('Model returned an empty response without actions.', { model: actualModel, engine, apiUrl, loopDepth });
|
||||||
const usedMessages = request.finalMessages || messagesForRequest;
|
|
||||||
const totalChars2 = usedMessages.reduce((acc, m) => acc + String(m.content || '').length, 0);
|
|
||||||
const estimatedTokens2 = Math.ceil(totalChars2 / 4);
|
|
||||||
const isContextOverflow = estimatedTokens2 > 2500; // 3000 한도에 근접하면 오버플로우로 간주
|
|
||||||
|
|
||||||
logError('Model returned an empty response without actions.', {
|
|
||||||
model: actualModel,
|
|
||||||
engine: request.engine,
|
|
||||||
apiUrl: request.apiUrl,
|
|
||||||
loopDepth,
|
|
||||||
estimatedTokens: estimatedTokens2,
|
|
||||||
wasCompressed: usedMessages.length !== messagesForRequest.length || totalChars2 !== (messagesForRequest.reduce((a, m) => a + String(m.content || '').length, 0))
|
|
||||||
});
|
|
||||||
this.webview.postMessage({
|
this.webview.postMessage({
|
||||||
type: 'error',
|
type: 'error',
|
||||||
value: [
|
value: [
|
||||||
'AI engine returned an empty response.',
|
'AI engine returned an empty response.',
|
||||||
`Engine: ${request.engine} | Model: ${actualModel}`,
|
`Engine: ${engine}`,
|
||||||
isContextOverflow
|
`Model: ${actualModel}`,
|
||||||
? `Context overflow: ~${estimatedTokens2.toLocaleString()} tokens (actually sent). The model context window was likely exceeded even after compression.`
|
'The request reached the local LLM server, but no usable content was returned. Try another model, restart the local server, or reduce the prompt/context size.'
|
||||||
: 'The request reached the LLM server, but no content was returned.',
|
|
||||||
'',
|
|
||||||
'**해결 방법:**',
|
|
||||||
isContextOverflow
|
|
||||||
? '1. Brain 비활성화 후 재시도 2. 더 큰 모델(7B+) 사용 3. 대화 기록 초기화 후 재시도'
|
|
||||||
: '1. LM Studio에서 해당 모델이 로드되어 있는지 확인 2. 모델 재시작 후 재시도 3. 다른 모델로 전환'
|
|
||||||
].join('\n')
|
].join('\n')
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
@@ -2069,12 +1940,11 @@ export class AgentExecutor {
|
|||||||
modelName: string;
|
modelName: string;
|
||||||
reqMessages: ChatMessage[];
|
reqMessages: ChatMessage[];
|
||||||
temperature: number;
|
temperature: number;
|
||||||
}): Promise<{ response: Response; engine: 'lmstudio' | 'ollama'; apiUrl: string; finalMessages: ChatMessage[] }> {
|
}): Promise<{ response: Response; engine: 'lmstudio' | 'ollama'; apiUrl: string }> {
|
||||||
const { baseUrl, modelName, reqMessages, temperature } = params;
|
const { baseUrl, modelName, reqMessages, temperature } = params;
|
||||||
const primaryEngine = resolveEngine(baseUrl);
|
const primaryEngine = resolveEngine(baseUrl);
|
||||||
const engines = primaryEngine === 'lmstudio' ? ['lmstudio', 'ollama'] as const : ['ollama', 'lmstudio'] as const;
|
const engines = primaryEngine === 'lmstudio' ? ['lmstudio', 'ollama'] as const : ['ollama', 'lmstudio'] as const;
|
||||||
let lastError: Error | null = null;
|
let lastError: Error | null = null;
|
||||||
let nCtxRetried = false; // n_ctx 재시도 1회 제한
|
|
||||||
|
|
||||||
for (const engine of engines) {
|
for (const engine of engines) {
|
||||||
const apiUrl = buildApiUrl(baseUrl, engine, 'chat');
|
const apiUrl = buildApiUrl(baseUrl, engine, 'chat');
|
||||||
@@ -2083,95 +1953,25 @@ export class AgentExecutor {
|
|||||||
|
|
||||||
for (const candidateModel of modelCandidates) {
|
for (const candidateModel of modelCandidates) {
|
||||||
for (const variant of messageVariants) {
|
for (const variant of messageVariants) {
|
||||||
// 실제 전송할 메시지
|
|
||||||
let finalMessages = variant.messages;
|
|
||||||
|
|
||||||
// ── LM Studio 선제적 컨텍스트 압축 ──
|
|
||||||
// 소형 모델(4B 등)은 GPU 메모리 부족으로 n_ctx가 설정값보다 크게 줄어들 수 있고,
|
|
||||||
// 이때 LM Studio는 에러 대신 200 OK + 빈 스트림을 반환하여 재시도 불가.
|
|
||||||
// 따라서 전송 전에 선제적으로 메시지를 n_ctx에 맞게 압축합니다.
|
|
||||||
if (engine === 'lmstudio') {
|
|
||||||
const totalCharsRaw = finalMessages.reduce((acc, m) => acc + String(m.content || '').length, 0);
|
|
||||||
const estimatedTokensRaw = Math.ceil(totalCharsRaw / 4);
|
|
||||||
const LM_CTX_SAFE_LIMIT = 3000; // 4096 n_ctx 기준 더 보수적인 안전 마진
|
|
||||||
|
|
||||||
if (estimatedTokensRaw > LM_CTX_SAFE_LIMIT) {
|
|
||||||
logInfo('LM Studio proactive compression triggered.', {
|
|
||||||
estimatedTokens: estimatedTokensRaw,
|
|
||||||
limit: LM_CTX_SAFE_LIMIT,
|
|
||||||
originalMessageCount: finalMessages.length
|
|
||||||
});
|
|
||||||
|
|
||||||
// 1. system 메시지에서 [CONTEXT] 이후 부분을 우선 제거
|
|
||||||
const sysIdx = finalMessages.findIndex(m => m.role === 'system');
|
|
||||||
if (sysIdx >= 0) {
|
|
||||||
const sysContent = String(finalMessages[sysIdx].content || '');
|
|
||||||
const contextSplit = sysContent.indexOf('[CONTEXT]');
|
|
||||||
if (contextSplit > 0) {
|
|
||||||
// [CONTEXT] 이전까지만 유지 (기본 시스템 프롬프트 + 핵심 지시)
|
|
||||||
const trimmedSys = sysContent.slice(0, contextSplit).trimEnd();
|
|
||||||
finalMessages = finalMessages.map((m, i) =>
|
|
||||||
i === sysIdx ? { ...m, content: trimmedSys + '\n[Context omitted: model context limit]' } : m
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. 그래도 크면 시스템 프롬프트를 max 글자로 강제 잘라냄
|
|
||||||
const afterTrimChars = finalMessages.reduce((acc, m) => acc + String(m.content || '').length, 0);
|
|
||||||
const afterTrimTokens = Math.ceil(afterTrimChars / 4);
|
|
||||||
if (afterTrimTokens > LM_CTX_SAFE_LIMIT && sysIdx >= 0) {
|
|
||||||
// 유저 메시지 토큰 계산
|
|
||||||
const nonSysTokens = finalMessages
|
|
||||||
.filter((_, i) => i !== sysIdx)
|
|
||||||
.reduce((acc, m) => acc + String(m.content || '').length, 0) / 4;
|
|
||||||
const maxSysChars = Math.max(2000, (LM_CTX_SAFE_LIMIT - Math.ceil(nonSysTokens) - 512)) * 4;
|
|
||||||
const sysContent = String(finalMessages[sysIdx].content || '');
|
|
||||||
if (sysContent.length > maxSysChars) {
|
|
||||||
finalMessages = finalMessages.map((m, i) =>
|
|
||||||
i === sysIdx ? { ...m, content: sysContent.slice(0, maxSysChars) + '\n[Truncated for model context limit]' } : m
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. 히스토리 메시지 정리: system + 마지막 user만 유지
|
|
||||||
const finalCheck = finalMessages.reduce((acc, m) => acc + String(m.content || '').length, 0) / 4;
|
|
||||||
if (finalCheck > LM_CTX_SAFE_LIMIT) {
|
|
||||||
const sysMsg = finalMessages.find(m => m.role === 'system');
|
|
||||||
const lastUserMsg = [...finalMessages].reverse().find(m => m.role === 'user');
|
|
||||||
finalMessages = [
|
|
||||||
...(sysMsg ? [sysMsg] : []),
|
|
||||||
...(lastUserMsg ? [lastUserMsg] : [])
|
|
||||||
];
|
|
||||||
}
|
|
||||||
|
|
||||||
logInfo('LM Studio compression result.', {
|
|
||||||
originalTokens: estimatedTokensRaw,
|
|
||||||
compressedTokens: Math.ceil(finalMessages.reduce((a, m) => a + String(m.content || '').length, 0) / 4),
|
|
||||||
messageCount: finalMessages.length
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const totalChars = finalMessages.reduce((acc, m) => acc + String(m.content || '').length, 0);
|
|
||||||
const estimatedTokens = Math.ceil(totalChars / 4);
|
|
||||||
const streamBody = {
|
const streamBody = {
|
||||||
model: candidateModel,
|
model: candidateModel,
|
||||||
messages: finalMessages.map(m => ({ role: m.role, content: m.content })),
|
messages: variant.messages,
|
||||||
stream: true,
|
stream: true,
|
||||||
...(engine === 'lmstudio'
|
...(engine === 'lmstudio'
|
||||||
? { max_tokens: Math.min(4096, Math.max(256, 3000 - estimatedTokens)), temperature }
|
? { max_tokens: 4096, temperature }
|
||||||
: { options: { num_ctx: 32768, num_predict: 4096, temperature } }),
|
: { options: { num_ctx: 32768, num_predict: 4096, temperature } }),
|
||||||
};
|
};
|
||||||
logInfo('AI streaming request started.', {
|
|
||||||
engine, apiUrl, model: candidateModel,
|
|
||||||
variant: variant.name,
|
|
||||||
messageCount: finalMessages.length,
|
|
||||||
estimatedTokens,
|
|
||||||
roles: finalMessages.map(message => message.role),
|
|
||||||
firstUserPreview: summarizeText(String(finalMessages.find(message => message.role === 'user')?.content || ''), 300)
|
|
||||||
});
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
logInfo('AI streaming request started.', {
|
||||||
|
engine,
|
||||||
|
apiUrl,
|
||||||
|
model: candidateModel,
|
||||||
|
variant: variant.name,
|
||||||
|
messageCount: variant.messages.length,
|
||||||
|
roles: variant.messages.map(message => message.role),
|
||||||
|
firstUserPreview: summarizeText(String(variant.messages.find(message => message.role === 'user')?.content || ''), 300)
|
||||||
|
});
|
||||||
const response = await fetch(apiUrl, {
|
const response = await fetch(apiUrl, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {
|
headers: {
|
||||||
@@ -2181,80 +1981,19 @@ export class AgentExecutor {
|
|||||||
'Connection': 'keep-alive'
|
'Connection': 'keep-alive'
|
||||||
},
|
},
|
||||||
body: JSON.stringify(streamBody),
|
body: JSON.stringify(streamBody),
|
||||||
signal: this.abortController?.signal
|
signal: this.abortController?.signal,
|
||||||
|
keepalive: true
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!response.ok) {
|
if (!response.ok) {
|
||||||
const errText = await response.text();
|
const errText = await response.text();
|
||||||
|
|
||||||
// ── LM Studio n_keep >= n_ctx 에러 감지 및 자동 재시도 ──
|
|
||||||
const nCtxMatch = errText.match(/n_keep\s*:\s*(\d+)\s*>=?\s*n_ctx\s*:\s*(\d+)/);
|
|
||||||
if (nCtxMatch && engine === 'lmstudio' && !nCtxRetried) {
|
|
||||||
nCtxRetried = true;
|
|
||||||
const nCtx = parseInt(nCtxMatch[2], 10);
|
|
||||||
logInfo(`n_ctx overflow detected (n_ctx=${nCtx}). Compressing messages and retrying...`);
|
|
||||||
|
|
||||||
// system 메시지를 n_ctx 크기에 맞게 강제 압축
|
|
||||||
const maxResponseTokens = 512;
|
|
||||||
const maxSysTokens = Math.max(500, nCtx - maxResponseTokens);
|
|
||||||
const maxSysChars = maxSysTokens * 4;
|
|
||||||
|
|
||||||
// 히스토리는 마지막 user 메시지만 유지
|
|
||||||
const sysMsg = finalMessages.find(m => m.role === 'system');
|
|
||||||
const lastUserMsg = [...finalMessages].reverse().find(m => m.role === 'user');
|
|
||||||
|
|
||||||
const compressedMessages: ChatMessage[] = [];
|
|
||||||
if (sysMsg) {
|
|
||||||
const sysContent = String(sysMsg.content || '');
|
|
||||||
compressedMessages.push({
|
|
||||||
role: 'system',
|
|
||||||
content: sysContent.length > maxSysChars
|
|
||||||
? sysContent.slice(0, maxSysChars) + `\n[Compressed for n_ctx=${nCtx}]`
|
|
||||||
: sysContent,
|
|
||||||
internal: true
|
|
||||||
});
|
|
||||||
}
|
|
||||||
if (lastUserMsg) {
|
|
||||||
compressedMessages.push(lastUserMsg);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 압축된 메시지로 즉시 재요청
|
|
||||||
const retryBody = {
|
|
||||||
model: candidateModel,
|
|
||||||
messages: compressedMessages.map(m => ({ role: m.role, content: m.content })),
|
|
||||||
stream: true,
|
|
||||||
max_tokens: Math.min(1024, maxResponseTokens),
|
|
||||||
temperature,
|
|
||||||
};
|
|
||||||
|
|
||||||
logInfo('Retrying with compressed context.', {
|
|
||||||
originalTokens: estimatedTokens,
|
|
||||||
compressedTokens: Math.ceil(compressedMessages.reduce((a, m) => a + String(m.content || '').length, 0) / 4),
|
|
||||||
nCtx,
|
|
||||||
messageCount: compressedMessages.length
|
|
||||||
});
|
|
||||||
|
|
||||||
const retryResponse = await fetch(apiUrl, {
|
|
||||||
method: 'POST',
|
|
||||||
headers: { 'Content-Type': 'application/json', 'Accept': 'text/event-stream', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive' },
|
|
||||||
body: JSON.stringify(retryBody),
|
|
||||||
signal: this.abortController?.signal
|
|
||||||
});
|
|
||||||
|
|
||||||
if (retryResponse.ok) {
|
|
||||||
logInfo('n_ctx retry succeeded.', { apiUrl });
|
|
||||||
return { response: retryResponse, engine, apiUrl, finalMessages: compressedMessages };
|
|
||||||
}
|
|
||||||
logError('n_ctx retry also failed.', { status: retryResponse.status });
|
|
||||||
}
|
|
||||||
|
|
||||||
lastError = new Error(`AI Engine error (${engine}/${variant.name}): ${response.status} - ${summarizeText(errText, 300)}`);
|
lastError = new Error(`AI Engine error (${engine}/${variant.name}): ${response.status} - ${summarizeText(errText, 300)}`);
|
||||||
logError('AI streaming request returned non-OK status.', { engine, variant: variant.name, apiUrl, status: response.status, body: summarizeText(errText, 500) });
|
logError('AI streaming request returned non-OK status.', { engine, variant: variant.name, apiUrl, status: response.status, body: summarizeText(errText, 500) });
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
logInfo('AI streaming request connected.', { engine, variant: variant.name, apiUrl });
|
logInfo('AI streaming request connected.', { engine, variant: variant.name, apiUrl });
|
||||||
return { response, engine, apiUrl, finalMessages };
|
return { response, engine, apiUrl };
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
lastError = error instanceof Error ? error : new Error(String(error));
|
lastError = error instanceof Error ? error : new Error(String(error));
|
||||||
logError('AI streaming request failed.', { engine, variant: variant.name, apiUrl, model: candidateModel, error: lastError.message });
|
logError('AI streaming request failed.', { engine, variant: variant.name, apiUrl, model: candidateModel, error: lastError.message });
|
||||||
|
|||||||
Reference in New Issue
Block a user