From 68941528920c916eeb98257beb7b186cfcf3ff92 Mon Sep 17 00:00:00 2001 From: g1nation Date: Thu, 7 May 2026 18:12:57 +0900 Subject: [PATCH] chore: bump version to 2.80.17 and refine agent streaming logic --- package.json | 2 +- src/agent.ts | 357 +++++++-------------------------------------------- 2 files changed, 49 insertions(+), 310 deletions(-) diff --git a/package.json b/package.json index 9c39d10..af9de03 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "astra", "displayName": "Astra", "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.", - "version": "2.80.16", + "version": "2.80.17", "publisher": "g1nation", "license": "MIT", "icon": "assets/icon.png", diff --git a/src/agent.ts b/src/agent.ts index d09277e..6a930de 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -451,74 +451,35 @@ export class AgentExecutor { if (this.isStaleRun(runId)) return; let aiResponseText = ''; - const body = response.body as any; - if (!body) throw new Error("Response body is null."); + const reader = response.body?.getReader(); + if (!reader) throw new Error("Response body is not readable."); - if (loopDepth === 0) this.webview?.postMessage({ type: 'streamStart' }); + if (loopDepth === 0) this.webview.postMessage({ type: 'streamStart' }); let buffer = ''; const decoder = new TextDecoder(); - - const processChunk = (value: any) => { - if (this.isStaleRun(runId)) return false; - - buffer += decoder.decode(value, { stream: true }); - const lines = buffer.split('\n'); - buffer = lines.pop() || ''; - - for (const line of lines) { - const trimmed = line.trim(); - if (!trimmed || trimmed === 'data: [DONE]') continue; - - try { - let raw = trimmed; - if (trimmed.startsWith('data:')) { - raw = trimmed.replace(/^data:\s*/, ''); - } - - if (!raw || raw === '[DONE]') continue; - - const json = JSON.parse(raw); - if (json.error) { - const errMsg = typeof json.error === 'string' ? json.error : (json.error.message || JSON.stringify(json.error)); - throw new Error(`AI Engine Error: ${errMsg}`); - } - - let token = ''; - if (json.choices?.[0]) { - const choice = json.choices[0]; - token = choice.delta?.content || choice.message?.content || choice.text || ''; - } else if (json.message?.content) { - token = json.message.content; - } else if (json.response) { - token = json.response; - } - - if (token) { - aiResponseText += token; - if (loopDepth === 0) { - this.webview?.postMessage({ type: 'streamUpdate', value: token }); - } - } - } catch (e: any) { - // Silent fail for non-JSON lines unless it's an AI Engine Error - if (e.message.startsWith('AI Engine Error:')) throw e; - } - } - return true; - }; - try { - if (typeof body[Symbol.asyncIterator] === 'function') { - for await (const chunk of body) { - if (!processChunk(chunk)) break; - } - } else { - const reader = body.getReader(); - while (true) { - const { done, value } = await reader.read(); - if (done) break; - if (!processChunk(value)) break; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + if (this.isStaleRun(runId)) return; + + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split('\n'); + buffer = lines.pop() || ''; + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed || trimmed === 'data: [DONE]') continue; + try { + const raw = trimmed.startsWith('data: ') ? trimmed.slice(6) : trimmed; + const json = JSON.parse(raw); + const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || ''; + if (token) { + aiResponseText += token; + } + } catch (e: any) { + logError('Failed to parse streaming chunk.', { engine, apiUrl, chunk: summarizeText(trimmed, 300), error: e?.message || String(e) }); + } } } } catch (err: any) { @@ -527,7 +488,6 @@ export class AgentExecutor { } else { logError('Stream reading error.', { engine, apiUrl, error: err?.message || String(err) }); this.webview?.postMessage({ type: 'error', value: `Connection lost: ${err.message}` }); - } } @@ -535,87 +495,17 @@ export class AgentExecutor { if (buffer.trim() && buffer.trim() !== 'data: [DONE]') { try { const trimmed = buffer.trim(); - let raw = trimmed; - if (trimmed.startsWith('data:')) { - raw = trimmed.replace(/^data:\s*/, ''); - } - - if (raw && raw !== '[DONE]') { - const json = JSON.parse(raw); - if (json.error) { - const errMsg = typeof json.error === 'string' ? json.error : (json.error.message || JSON.stringify(json.error)); - throw new Error(`AI Engine Error: ${errMsg}`); - } - let token = ''; - if (json.choices?.[0]) { - const choice = json.choices[0]; - token = choice.delta?.content || choice.message?.content || choice.text || ''; - } else if (json.message?.content) { - token = json.message.content; - } else if (json.response) { - token = json.response; - } - - if (token) { - aiResponseText += token; - } + const raw = trimmed.startsWith('data: ') ? trimmed.slice(6) : trimmed; + const json = JSON.parse(raw); + const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || ''; + if (token) { + aiResponseText += token; } } catch (e: any) { logError('Failed to parse final streaming buffer.', { engine, apiUrl, buffer: summarizeText(buffer, 300), error: e?.message || String(e) }); } } - // 4.1 Check for Ghost Response (Empty response from LM Studio/Ollama despite 200 OK) - if (!aiResponseText.trim() && request.engine === 'lmstudio' && loopDepth === 0) { - logInfo('Empty response detected from LM Studio. Retrying with extreme compression...', { model: actualModel }); - - // Force extreme compression: system + last user only - const sysMsg = messagesForRequest.find(m => m.role === 'system'); - const lastUserMsg = [...messagesForRequest].reverse().find(m => m.role === 'user'); - const extremeMessages = [ - ...(sysMsg ? [sysMsg] : []), - ...(lastUserMsg ? [lastUserMsg] : []) - ]; - - const retryRequest = await this.createStreamingRequest({ - baseUrl: ollamaUrl, - modelName: actualModel, - reqMessages: extremeMessages, - temperature - }); - - if (retryRequest.response.ok) { - const retryBody = retryRequest.response.body as any; - const retryDecoder = new TextDecoder(); - let retryBuffer = ''; - - // Simple stream reader for retry - const reader = retryBody.getReader(); - while (true) { - const { done, value } = await reader.read(); - if (done) break; - retryBuffer += retryDecoder.decode(value, { stream: true }); - // ... simplified parsing for retry ... - const lines = retryBuffer.split('\n'); - retryBuffer = lines.pop() || ''; - for (const line of lines) { - const trimmed = line.trim(); - if (!trimmed || trimmed === 'data: [DONE]') continue; - try { - const raw = trimmed.startsWith('data:') ? trimmed.replace(/^data:\s*/, '') : trimmed; - if (!raw || raw === '[DONE]') continue; - const json = JSON.parse(raw); - const token = json.choices?.[0]?.delta?.content || json.message?.content || json.response || ''; - if (token) { - aiResponseText += token; - this.webview?.postMessage({ type: 'streamUpdate', value: token }); - } - } catch {} - } - } - } - } - if (this.isStaleRun(runId)) return; if (requestTimeoutHandle) { clearTimeout(requestTimeoutHandle); @@ -673,33 +563,14 @@ export class AgentExecutor { this.statusBarManager.updateStatus(AgentStatus.Executing); const report = await this.executeActions(aiResponseText, rootPath, activeBrain); if (!assistantContent.trim() && report.length === 0) { - // 실제 전송에 사용된 메시지(request.finalMessages)를 기준으로 토큰 재계산 - const usedMessages = request.finalMessages || messagesForRequest; - const totalChars2 = usedMessages.reduce((acc, m) => acc + String(m.content || '').length, 0); - const estimatedTokens2 = Math.ceil(totalChars2 / 4); - const isContextOverflow = estimatedTokens2 > 2500; // 3000 한도에 근접하면 오버플로우로 간주 - - logError('Model returned an empty response without actions.', { - model: actualModel, - engine: request.engine, - apiUrl: request.apiUrl, - loopDepth, - estimatedTokens: estimatedTokens2, - wasCompressed: usedMessages.length !== messagesForRequest.length || totalChars2 !== (messagesForRequest.reduce((a, m) => a + String(m.content || '').length, 0)) - }); + logError('Model returned an empty response without actions.', { model: actualModel, engine, apiUrl, loopDepth }); this.webview.postMessage({ type: 'error', value: [ 'AI engine returned an empty response.', - `Engine: ${request.engine} | Model: ${actualModel}`, - isContextOverflow - ? `Context overflow: ~${estimatedTokens2.toLocaleString()} tokens (actually sent). The model context window was likely exceeded even after compression.` - : 'The request reached the LLM server, but no content was returned.', - '', - '**해결 방법:**', - isContextOverflow - ? '1. Brain 비활성화 후 재시도 2. 더 큰 모델(7B+) 사용 3. 대화 기록 초기화 후 재시도' - : '1. LM Studio에서 해당 모델이 로드되어 있는지 확인 2. 모델 재시작 후 재시도 3. 다른 모델로 전환' + `Engine: ${engine}`, + `Model: ${actualModel}`, + 'The request reached the local LLM server, but no usable content was returned. Try another model, restart the local server, or reduce the prompt/context size.' ].join('\n') }); return; @@ -2069,12 +1940,11 @@ export class AgentExecutor { modelName: string; reqMessages: ChatMessage[]; temperature: number; - }): Promise<{ response: Response; engine: 'lmstudio' | 'ollama'; apiUrl: string; finalMessages: ChatMessage[] }> { + }): Promise<{ response: Response; engine: 'lmstudio' | 'ollama'; apiUrl: string }> { const { baseUrl, modelName, reqMessages, temperature } = params; const primaryEngine = resolveEngine(baseUrl); const engines = primaryEngine === 'lmstudio' ? ['lmstudio', 'ollama'] as const : ['ollama', 'lmstudio'] as const; let lastError: Error | null = null; - let nCtxRetried = false; // n_ctx 재시도 1회 제한 for (const engine of engines) { const apiUrl = buildApiUrl(baseUrl, engine, 'chat'); @@ -2083,95 +1953,25 @@ export class AgentExecutor { for (const candidateModel of modelCandidates) { for (const variant of messageVariants) { - // 실제 전송할 메시지 - let finalMessages = variant.messages; - - // ── LM Studio 선제적 컨텍스트 압축 ── - // 소형 모델(4B 등)은 GPU 메모리 부족으로 n_ctx가 설정값보다 크게 줄어들 수 있고, - // 이때 LM Studio는 에러 대신 200 OK + 빈 스트림을 반환하여 재시도 불가. - // 따라서 전송 전에 선제적으로 메시지를 n_ctx에 맞게 압축합니다. - if (engine === 'lmstudio') { - const totalCharsRaw = finalMessages.reduce((acc, m) => acc + String(m.content || '').length, 0); - const estimatedTokensRaw = Math.ceil(totalCharsRaw / 4); - const LM_CTX_SAFE_LIMIT = 3000; // 4096 n_ctx 기준 더 보수적인 안전 마진 - - if (estimatedTokensRaw > LM_CTX_SAFE_LIMIT) { - logInfo('LM Studio proactive compression triggered.', { - estimatedTokens: estimatedTokensRaw, - limit: LM_CTX_SAFE_LIMIT, - originalMessageCount: finalMessages.length - }); - - // 1. system 메시지에서 [CONTEXT] 이후 부분을 우선 제거 - const sysIdx = finalMessages.findIndex(m => m.role === 'system'); - if (sysIdx >= 0) { - const sysContent = String(finalMessages[sysIdx].content || ''); - const contextSplit = sysContent.indexOf('[CONTEXT]'); - if (contextSplit > 0) { - // [CONTEXT] 이전까지만 유지 (기본 시스템 프롬프트 + 핵심 지시) - const trimmedSys = sysContent.slice(0, contextSplit).trimEnd(); - finalMessages = finalMessages.map((m, i) => - i === sysIdx ? { ...m, content: trimmedSys + '\n[Context omitted: model context limit]' } : m - ); - } - } - - // 2. 그래도 크면 시스템 프롬프트를 max 글자로 강제 잘라냄 - const afterTrimChars = finalMessages.reduce((acc, m) => acc + String(m.content || '').length, 0); - const afterTrimTokens = Math.ceil(afterTrimChars / 4); - if (afterTrimTokens > LM_CTX_SAFE_LIMIT && sysIdx >= 0) { - // 유저 메시지 토큰 계산 - const nonSysTokens = finalMessages - .filter((_, i) => i !== sysIdx) - .reduce((acc, m) => acc + String(m.content || '').length, 0) / 4; - const maxSysChars = Math.max(2000, (LM_CTX_SAFE_LIMIT - Math.ceil(nonSysTokens) - 512)) * 4; - const sysContent = String(finalMessages[sysIdx].content || ''); - if (sysContent.length > maxSysChars) { - finalMessages = finalMessages.map((m, i) => - i === sysIdx ? { ...m, content: sysContent.slice(0, maxSysChars) + '\n[Truncated for model context limit]' } : m - ); - } - } - - // 3. 히스토리 메시지 정리: system + 마지막 user만 유지 - const finalCheck = finalMessages.reduce((acc, m) => acc + String(m.content || '').length, 0) / 4; - if (finalCheck > LM_CTX_SAFE_LIMIT) { - const sysMsg = finalMessages.find(m => m.role === 'system'); - const lastUserMsg = [...finalMessages].reverse().find(m => m.role === 'user'); - finalMessages = [ - ...(sysMsg ? [sysMsg] : []), - ...(lastUserMsg ? [lastUserMsg] : []) - ]; - } - - logInfo('LM Studio compression result.', { - originalTokens: estimatedTokensRaw, - compressedTokens: Math.ceil(finalMessages.reduce((a, m) => a + String(m.content || '').length, 0) / 4), - messageCount: finalMessages.length - }); - } - } - - const totalChars = finalMessages.reduce((acc, m) => acc + String(m.content || '').length, 0); - const estimatedTokens = Math.ceil(totalChars / 4); const streamBody = { model: candidateModel, - messages: finalMessages.map(m => ({ role: m.role, content: m.content })), + messages: variant.messages, stream: true, ...(engine === 'lmstudio' - ? { max_tokens: Math.min(4096, Math.max(256, 3000 - estimatedTokens)), temperature } + ? { max_tokens: 4096, temperature } : { options: { num_ctx: 32768, num_predict: 4096, temperature } }), }; - logInfo('AI streaming request started.', { - engine, apiUrl, model: candidateModel, - variant: variant.name, - messageCount: finalMessages.length, - estimatedTokens, - roles: finalMessages.map(message => message.role), - firstUserPreview: summarizeText(String(finalMessages.find(message => message.role === 'user')?.content || ''), 300) - }); try { + logInfo('AI streaming request started.', { + engine, + apiUrl, + model: candidateModel, + variant: variant.name, + messageCount: variant.messages.length, + roles: variant.messages.map(message => message.role), + firstUserPreview: summarizeText(String(variant.messages.find(message => message.role === 'user')?.content || ''), 300) + }); const response = await fetch(apiUrl, { method: 'POST', headers: { @@ -2181,80 +1981,19 @@ export class AgentExecutor { 'Connection': 'keep-alive' }, body: JSON.stringify(streamBody), - signal: this.abortController?.signal + signal: this.abortController?.signal, + keepalive: true }); if (!response.ok) { const errText = await response.text(); - - // ── LM Studio n_keep >= n_ctx 에러 감지 및 자동 재시도 ── - const nCtxMatch = errText.match(/n_keep\s*:\s*(\d+)\s*>=?\s*n_ctx\s*:\s*(\d+)/); - if (nCtxMatch && engine === 'lmstudio' && !nCtxRetried) { - nCtxRetried = true; - const nCtx = parseInt(nCtxMatch[2], 10); - logInfo(`n_ctx overflow detected (n_ctx=${nCtx}). Compressing messages and retrying...`); - - // system 메시지를 n_ctx 크기에 맞게 강제 압축 - const maxResponseTokens = 512; - const maxSysTokens = Math.max(500, nCtx - maxResponseTokens); - const maxSysChars = maxSysTokens * 4; - - // 히스토리는 마지막 user 메시지만 유지 - const sysMsg = finalMessages.find(m => m.role === 'system'); - const lastUserMsg = [...finalMessages].reverse().find(m => m.role === 'user'); - - const compressedMessages: ChatMessage[] = []; - if (sysMsg) { - const sysContent = String(sysMsg.content || ''); - compressedMessages.push({ - role: 'system', - content: sysContent.length > maxSysChars - ? sysContent.slice(0, maxSysChars) + `\n[Compressed for n_ctx=${nCtx}]` - : sysContent, - internal: true - }); - } - if (lastUserMsg) { - compressedMessages.push(lastUserMsg); - } - - // 압축된 메시지로 즉시 재요청 - const retryBody = { - model: candidateModel, - messages: compressedMessages.map(m => ({ role: m.role, content: m.content })), - stream: true, - max_tokens: Math.min(1024, maxResponseTokens), - temperature, - }; - - logInfo('Retrying with compressed context.', { - originalTokens: estimatedTokens, - compressedTokens: Math.ceil(compressedMessages.reduce((a, m) => a + String(m.content || '').length, 0) / 4), - nCtx, - messageCount: compressedMessages.length - }); - - const retryResponse = await fetch(apiUrl, { - method: 'POST', - headers: { 'Content-Type': 'application/json', 'Accept': 'text/event-stream', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive' }, - body: JSON.stringify(retryBody), - signal: this.abortController?.signal - }); - - if (retryResponse.ok) { - logInfo('n_ctx retry succeeded.', { apiUrl }); - return { response: retryResponse, engine, apiUrl, finalMessages: compressedMessages }; - } - logError('n_ctx retry also failed.', { status: retryResponse.status }); - } - lastError = new Error(`AI Engine error (${engine}/${variant.name}): ${response.status} - ${summarizeText(errText, 300)}`); logError('AI streaming request returned non-OK status.', { engine, variant: variant.name, apiUrl, status: response.status, body: summarizeText(errText, 500) }); continue; } logInfo('AI streaming request connected.', { engine, variant: variant.name, apiUrl }); - return { response, engine, apiUrl, finalMessages }; + return { response, engine, apiUrl }; } catch (error: any) { lastError = error instanceof Error ? error : new Error(String(error)); logError('AI streaming request failed.', { engine, variant: variant.name, apiUrl, model: candidateModel, error: lastError.message });