From 1efbe2ec0f4e39eefa13190b44cdf14691157ec6 Mon Sep 17 00:00:00 2001
From: g1nation <koriweb@gmail.com>
Date: Thu, 18 Jun 2026 17:08:49 +0900
Subject: [PATCH] =?UTF-8?q?fix(agent):=20=EB=B9=88=20=EC=9D=91=EB=8B=B5=20?=
 =?UTF-8?q?=EC=A7=84=EB=8B=A8=20=EC=A0=95=ED=99=95=EB=8F=84=20=E2=80=94=20?=
 =?UTF-8?q?MoE=20=ED=99=9C=EC=84=B1=20=ED=8C=8C=EB=9D=BC=EB=AF=B8=ED=84=B0?=
 =?UTF-8?q?=20=EC=9D=B8=EC=8B=9D=20(v2.2.254)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

약한 모델이 큰 입력에 첫 토큰 EOS로 빈 응답을 낼 때, 모델명 파서가
gemma-4-26b-a4b를 "26B 큰 모델"로 오판하던 문제 수정.

- estimateActiveParamsB 추가: MoE 활성 파라미터 추정(a4b→4, A3B→3, e2b→2)
- 빈 응답 에러 메시지 개선: 원인이 답변 길이가 아니라 입력 크기임을 명시,
  MoE 총/활성 파라미터 표기, LM Studio 로드 context length 불일치 1순위 점검 안내
- 테스트 +6건(전체 662 통과)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 PATCHNOTES.md                |  6 ++++++
 package.json                 |  2 +-
 src/agent.ts                 | 41 ++++++++++++++++++------------------
 src/lib/contextManager.ts    | 18 ++++++++++++++++
 tests/contextManager.test.ts | 17 +++++++++++++++
 5 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/PATCHNOTES.md b/PATCHNOTES.md
index f798842..bb42a8b 100644
--- a/PATCHNOTES.md
+++ b/PATCHNOTES.md
@@ -1,5 +1,11 @@
 # Astra Patch Notes
 
+## v2.2.254 (2026-06-18)
+### 🔎 빈 응답(empty response) 진단 정확도 — MoE 활성 파라미터 인식
+- 일반 에이전트 채팅에서 약한 모델이 큰 입력에 첫 토큰 EOS 로 무너져 **빈 응답**이 날 때, 모델명 파서가 `gemma-4-26b-a4b` 를 "26B 큰 모델"로 오판해 엉뚱한 안내를 하던 문제. **활성 파라미터 추정**(`estimateActiveParamsB`: `a4b`→4, `A3B`→3, `e2b`→2) 추가 → MoE 를 정확히 식별. ([contextManager.ts](src/lib/contextManager.ts))
+- 빈 응답 에러 메시지 개선: (1) "**답변이 길어서가 아니라 입력이 모델 용량 대비 커서**" 발생함을 명시, (2) MoE 면 `총 ~26B / 활성 ~4B` 표기 + 활성 7B+ 권장, (3) **LM Studio 로드 context length 와 `g1nation.contextLength` 불일치** 가능성을 1순위 점검 항목으로 안내. ([agent.ts](src/agent.ts))
+- 참고: `/meet` 의 map-reduce 청킹은 그 명령 전용이며, 일반 채팅(코드 리뷰 등)에는 적용되지 않는다(단일 예산 호출). 테스트 +6건(전체 662 통과).
+
 ## v2.2.253 (2026-06-17)
 ### 🪓 /meet 조각 실패 시 절반 분할 재시도 (약한 모델 성공률↑)
 - v2.2.252 의 재시도(반복 억제 강화)에도 조각이 계속 붕괴하면, 그 조각을 **줄 경계로 절반씩 쪼개 재귀 재시도**한다(12K→6K→3.5K). 입력이 작아질수록 약한 모델의 출력 붕괴 확률이 떨어지므로, **모델 교체 없이도** 추출 성공률이 오른다. 최소 크기(3.5K) 이하인데도 실패하는 구간만 건너뛴다. ([handlers.ts](src/features/datacollect/handlers.ts))
diff --git a/package.json b/package.json
index d83d5b0..6e8e981 100644
--- a/package.json
+++ b/package.json
@@ -2,7 +2,7 @@
   "name": "astra",
   "displayName": "Astra",
   "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.",
-  "version": "2.2.253",
+  "version": "2.2.254",
   "publisher": "g1nation",
   "license": "MIT",
   "icon": "assets/icon.png",
diff --git a/src/agent.ts b/src/agent.ts
index dd748c1..b533959 100644
--- a/src/agent.ts
+++ b/src/agent.ts
@@ -125,6 +125,7 @@ import {
     truncationNotice,
     shouldShowTruncationNotice,
     estimateModelParamsB,
+    estimateActiveParamsB,
     type ContextLimits,
 } from './lib/contextManager';
 import { samplingToRestBody, type ChatStreamStats } from './lmstudio/streamer';
@@ -1215,19 +1216,18 @@ export class AgentExecutor {
                     messageCount: messagesForRequest.length,
                     fallbackTried: loopDepth === 0 ? 'yes' : 'no',
                 });
-                // Cheap heuristic: parse a parameter-count hint out of the
-                // model identifier (e.g. "google/gemma-4-e2b", "qwen2-1.5b").
-                // Anything <= 3B is small enough that long-context generation
-                // commonly fails by emitting EOS as the first token even though
-                // the server log shows prompt-eval succeeded with truncated=0.
-                const smallModelMatch = actualModel.match(/(?<![0-9.])((?:[0-9]+\.)?[0-9]+)\s*[bB](?![a-zA-Z0-9])|[-_/]e?([0-9]+)b\b/i);
-                const paramB = smallModelMatch
-                    ? Number(smallModelMatch[1] ?? smallModelMatch[2])
-                    : Number.NaN;
-                const looksSmall = Number.isFinite(paramB) && paramB <= 3;
-                const promptIsLarge = promptCharCount > 60000; // ~15k tokens of English/code
-                const contextLimitHint =
-                    'LM Studio 로그에 `n_tokens = N, truncated = 0` 인데 `eval time` 이 0ms 라면 모델이 첫 토큰부터 EOS 를 뱉은 것입니다. 보통 컨텍스트 한계 초과 또는 모델 용량 부족입니다. 더 큰 모델(7B+)로 교체하거나 컨텍스트를 줄여 보세요.';
+                // 모델 식별자에서 "활성(active) 파라미터" 규모를 추정한다. MoE 모델은
+                // 총 파라미터(예: 26b)가 커도 활성 파라미터(예: a4b=4)가 작아 긴 프롬프트에서
+                // 첫 토큰부터 EOS 를 뱉는다(빈 응답). 총 파라미터만 보면 "26b → 큰 모델"로
+                // 오판하므로 활성 파라미터로 판정한다.
+                const activeB = estimateActiveParamsB(actualModel);
+                const totalB = estimateModelParamsB(actualModel);
+                const isMoE = activeB !== null && totalB !== null && activeB < totalB;
+                const capacityHint = isMoE
+                    ? `이 모델은 MoE 로 추정됩니다 (총 ~${totalB}B, **활성 ~${activeB}B**). 활성 파라미터가 작아 긴 입력(현재 ~${inputTokens.toLocaleString()} tokens)에서 첫 토큰부터 EOS 를 뱉어 빈 응답이 되기 쉽습니다. 코드 리뷰처럼 입력이 큰 작업은 **활성 7B+ 또는 한국어 특화 모델(EXAONE/Qwen 등)** 을 권장합니다.`
+                    : '입력이 큰 작업에서 모델이 첫 토큰부터 EOS 를 뱉으면 보통 모델 용량 부족 또는 컨텍스트 초과입니다. 더 큰 모델(7B+)로 교체하거나 입력을 줄여 보세요.';
+                const ctxMismatchHint =
+                    '**LM Studio 에 로드된 실제 context length 가 Astra 설정(`g1nation.contextLength`)보다 작은지** 확인하세요. 예: 설정은 32768 인데 모델은 8192/16384 로 로드돼 있으면, Astra 가 그 한도를 넘겨 보내 서버가 잘라내거나 EOS 를 뱉습니다. (LM Studio 모델 로드 옵션의 Context Length 와 설정값을 일치)';
 
                 const looksOverflow = outputBudget.tight || inputTokens > ctxLimits.contextLength - ctxLimits.safetyMargin;
                 this.webview.postMessage({
@@ -1235,18 +1235,19 @@ export class AgentExecutor {
                     value: [
                         'AI 엔진이 빈 응답을 반환했습니다 (스트리밍 + non-streaming 폴백 모두 실패).',
                         `Engine: ${engine}`,
-                        `Model: ${actualModel}`,
+                        `Model: ${actualModel}${isMoE ? ` (MoE: 총 ~${totalB}B / 활성 ~${activeB}B)` : ''}`,
                         `Prompt: ~${inputTokens.toLocaleString()} tokens (${promptCharCount.toLocaleString()} chars, ${messagesForRequest.length} messages) / context window ${ctxLimits.contextLength.toLocaleString()} tokens`,
                         `Output budget: ${maxOutputTokens.toLocaleString()} tokens`,
                         ...(finishStopReason ? [`Stop reason: ${finishStopReason}`] : []),
                         '',
+                        '⚠️ 빈 응답은 *답변이 길어서*가 아니라 *입력이 모델 용량에 비해 커서* 발생하는 경우가 대부분입니다 (출력은 어차피 위 budget 으로 제한됨).',
+                        '',
                         '다음을 시도해보세요:',
-                        '  • LM Studio에서 모델이 실제로 로드되어 있는지 확인',
-                        looksOverflow
-                            ? '  • 입력이 모델 context window 에 가깝습니다. `/newChat` 으로 대화를 새로 시작하거나, Skill/Brain 컨텍스트를 줄이거나, Settings 의 `g1nation.contextLength` 를 모델 실제 값으로 맞추세요.'
-                            : '  • 다른 모델로 전환하거나 LM Studio 서버를 재시작',
-                        '  • Settings에서 maxContextSize / memoryLongTermFiles 줄이기',
-                        ...(looksSmall || promptIsLarge ? ['  • ' + contextLimitHint] : []),
+                        '  • ' + ctxMismatchHint,
+                        '  • ' + capacityHint,
+                        '  • `/newChat` 으로 대화를 새로 시작하거나, Settings 에서 memoryLongTermFiles / Brain·Skill 컨텍스트를 줄여 입력을 축소',
+                        '  • LM Studio 에서 모델이 실제로 로드돼 있는지 / 서버 재시작',
+                        ...(looksOverflow ? ['  • 입력이 context window 에 매우 가깝습니다 — 위 컨텍스트 일치 확인이 특히 중요합니다.'] : []),
                     ].join('\n')
                 });
                 return;
diff --git a/src/lib/contextManager.ts b/src/lib/contextManager.ts
index 460c6a9..965e463 100644
--- a/src/lib/contextManager.ts
+++ b/src/lib/contextManager.ts
@@ -73,6 +73,24 @@ export function estimateModelParamsB(modelId: string | null | undefined): number
     return Number.isFinite(n) && n > 0 && n < 2000 ? n : null;
 }
 
+/**
+ * Estimates a model's "active" parameter count (in billions) from its identifier. MoE models
+ * activate fewer parameters than their total, so on long prompts they tend to collapse into a
+ * first-token EOS (empty response). When the id carries an active-size tag (e.g. "...-a4b",
+ * "...-A22B", "gemma-3n-e2b") that value is returned; otherwise the result of estimateModelParamsB.
+ * Examples: "gemma-4-26b-a4b-it" → 4, "qwen3-30b-a3b" → 3, "llama-3.1-8b" → 8. Empty/nullish id → null.
+ */
+export function estimateActiveParamsB(modelId: string | null | undefined): number | null {
+    if (!modelId) return null;
+    // Active-size tag: a separator followed by a/e + digits + b, case-insensitive (e.g. -a4b, _A22B, .e2b)
+    const moe = String(modelId).match(/[-_/:.\s][ae](\d+(?:\.\d+)?)\s*b(?![a-z0-9])/i);
+    if (moe) {
+        const n = Number(moe[1]);
+        if (Number.isFinite(n) && n > 0 && n < 2000) return n; // accept only 0 < n < 2000; else fall through
+    }
+    return estimateModelParamsB(modelId);
+}
+
 /** role/구분자 등 메시지 1개당 발생하는 고정 오버헤드(대략). */
 const PER_MESSAGE_TOKEN_OVERHEAD = 4;
 
diff --git a/tests/contextManager.test.ts b/tests/contextManager.test.ts
index ac3875d..7a7e8c1 100644
--- a/tests/contextManager.test.ts
+++ b/tests/contextManager.test.ts
@@ -7,6 +7,7 @@ import {
     classifyStopReason,
     shouldShowTruncationNotice,
     estimateModelParamsB,
+    estimateActiveParamsB,
     CONTEXT_OPEN_MARKER,
     CONTEXT_CLOSE_MARKER,
     type BudgetMessage,
@@ -29,6 +30,22 @@ describe('contextManager.estimateModelParamsB', () => {
     });
 });
 
+describe('contextManager.estimateActiveParamsB', () => {
+    it('prefers active params for MoE naming (a/e prefix)', () => {
+        expect(estimateActiveParamsB('gemma-4-26b-a4b-it')).toBe(4);   // 4B active (not the 26B total)
+        expect(estimateActiveParamsB('Qwen3-30B-A3B')).toBe(3);        // 3B active; uppercase tag also matches
+        expect(estimateActiveParamsB('google/gemma-3n-e2b-it')).toBe(2); // "e" prefix tag
+    });
+    it('falls back to total params when no active hint', () => {
+        expect(estimateActiveParamsB('llama-3.1-8b')).toBe(8);
+        expect(estimateActiveParamsB('qwen2.5-7b-instruct')).toBe(7);
+    });
+    it('returns null when there is no parameter hint', () => {
+        expect(estimateActiveParamsB('phi-3-mini')).toBeNull();
+        expect(estimateActiveParamsB('')).toBeNull(); // empty id short-circuits before any matching
+    });
+});
+
 describe('contextManager.computeOutputBudget', () => {
     const limits = { contextLength: 32768, maxOutputTokens: 4096, safetyMargin: 2048, minOutputTokens: 512 };
     it('caps at maxOutputTokens when there is plenty of room', () => {