chore: v2.2.73 — ASTRA-DEBUG 로그 레벨 + webview CSP font-src 보강

- ASTRA-DEBUG 정상 흐름 로그를 console.error → logInfo/console.log 로 강등 (chatHandlers, extension, slashRouter): DevTools에 ERR로 찍히던 오탐 제거
- sidebar webview에 명시적 CSP meta 추가 + font-src에 data: 허용 (sidebar.html, sidebarProvider._getHtml): VS Code outer iframe이 codicon.ttf를 data:font/ttf 로 inject하면서 기본 CSP에 막혀 매 prompt 마다 violation 경고가 찍히던 문제 해소
- 누적된 LM Studio / agent / 컨텍스트 매니저 / 테스트 갱신 동반

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 15:52:19 +09:00
parent 36db170844
commit 0712014fcb
43 changed files with 2417 additions and 977 deletions
@@ -53,6 +53,7 @@ import {
 } from './retrieval/knowledgeMix';
 import {
    extractVisibleFinal,
+    stripMarkdownFormatting,
    shouldFinalOnlyRetry,
    shouldAutoContinue,
    looksCutOff,
@@ -73,6 +74,7 @@ import {
    estimateModelParamsB,
    type ContextLimits,
 } from './lib/contextManager';
+import { samplingToRestBody, type LmStudioSampling, type ChatStreamStats } from './lmstudio/streamer';

 export interface ChatMessage {
    role: 'user' | 'assistant' | 'system';
@@ -208,6 +210,10 @@ export class AgentExecutor {
    private historyChangeListener: HistoryChangeListener | undefined;
    private runSerial = 0;
    private activeRunId = 0;
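+    // v2.2.69 — Mode-switch detection. On handlePrompt entry the current mode signature is computed and,
+    // if it differs from the previous value, a one-off "mode transition" bridge note is added to the system prompt.
+    // The signature is a `|`-joined key=value string (agent skill, company mode, multiAgent, active brain), not a hash.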
+    private _lastModeSignature: string | null = null;
    private transactionManager: TransactionManager;
    private sessionManager: SessionManager;
    private statusBarManager: StatusBarManager;
@@ -369,6 +375,9 @@ export class AgentExecutor {
            this.onSessionEnd();
        }
        this.chatHistory = [];
+        // v2.2.69 — 새 세션엔 "이전 모드" 가 없음. mode signature 초기화하지 않으면 첫 메시지에서
+        // 직전 세션의 mode 와 비교돼 잘못된 bridge 가 끼는 회귀가 생긴다.
+        this._lastModeSignature = null;
        this.emitHistoryChanged();
    }

@@ -387,6 +396,7 @@ export class AgentExecutor {
            this.onSessionEnd();
        }
        this.chatHistory = [];
+        this._lastModeSignature = null;
        this.emitHistoryChanged();
    }

@@ -633,6 +643,39 @@ export class AgentExecutor {
            // 제거하고 에이전트 프롬프트를 최후단에 배치하여 절대 우선 적용.
            // ──────────────────────────────────────────────────────────────────
            const isAgentMode = !!options.agentSkillContext;
+
+            // v2.2.69 — 모드 전환 bridge. 현재 mode signature 를 직전 값과 비교해 바뀌었으면
+            // "이전 대화는 X 모드에서 Y 주제로 진행됨 / 지금부터 Z 모드" 한 줄을 system prompt 에 끼운다.
+            // chatHistory 자체는 손대지 않으므로 사용자 입장에선 대화가 연속되어 보이면서도
+            // 모델은 "모드가 바뀐 직후" 임을 인지한다.
+            let modeBridgeCtx = '';
+            try {
+                const agentSkillName = options.agentSkillContext
+                    ? (options.agentSkillContext.split('\n')[0] || '').slice(0, 60).replace(/^#\s*/, '').trim()
+                    : '';
+                const currentSig = this.computeModeSignature({
+                    agentSkillName: agentSkillName || undefined,
+                    companyMode: !!(options as any).companyMode,
+                    multiAgent: !!(options as any).multiAgent,
+                    brainName: getActiveBrainProfile()?.name,
+                });
+                if (this._lastModeSignature !== null && this._lastModeSignature !== currentSig) {
+                    const topic = this.buildLastTopicLine();
+                    const bridgeLines = [
+                        '',
+                        '[MODE TRANSITION BRIDGE]',
+                        `이전 모드: ${this._lastModeSignature}`,
+                        `현재 모드: ${currentSig}`,
+                    ];
+                    if (topic) bridgeLines.push(`직전 대화 주제(한 줄): ${topic}`);
+                    bridgeLines.push('대화 history 는 그대로 이어진다. 새 모드의 페르소나/포맷을 따르되, 직전까지 사용자가 다루던 맥락을 잊지 말 것.');
+                    modeBridgeCtx = bridgeLines.join('\n');
+                }
+                this._lastModeSignature = currentSig;
+            } catch (e: any) {
+                logError('Mode-bridge computation failed (non-fatal).', { error: e?.message || String(e) });
+            }
+
            let fullSystemPrompt: string;

            if (isAgentMode) {
@@ -665,7 +708,7 @@ export class AgentExecutor {

                // [CONTEXT] … [/CONTEXT] 사이만 컨텍스트 초과 시 trim 대상 — agentBlock(앞)·reminder(뒤)·negative 는 보호.
                // memoryCtx(RAG/메모리/lessons)도 [CONTEXT] 안에 넣어 토큰이 빡빡할 때 대화 기록보다 먼저 잘리게 한다.
-                fullSystemPrompt = `${agentBlock}\n\n${strippedSystemPrompt}${designerCtx}${secondBrainTraceCtx}\n\n[CONTEXT]\n${memoryCtx}\n${knowledgeContextForPrompt}\n${contextBlock}\n[/CONTEXT]\n${negativeCtx}${agentTailReminder}`;
+                fullSystemPrompt = `${agentBlock}${modeBridgeCtx ? '\n\n' + modeBridgeCtx : ''}\n\n${strippedSystemPrompt}${designerCtx}${secondBrainTraceCtx}\n\n[CONTEXT]\n${memoryCtx}\n${knowledgeContextForPrompt}\n${contextBlock}\n[/CONTEXT]\n${negativeCtx}${agentTailReminder}`;
            } else {
                // 기존 Astra 모드 (에이전트 미선택)
                const localProjectKnowledgeCtx = prompt && localPathContext && this.isProjectKnowledgeCreationRequest(prompt)
@@ -700,7 +743,7 @@ export class AgentExecutor {
                    })()
                    : '';
                // memoryCtx(RAG/메모리/lessons)는 [CONTEXT] 안에 — 토큰이 빡빡하면 대화 기록보다 먼저 잘림.
-                fullSystemPrompt = `${systemPrompt}${designerCtx}${projectArchitectureCtx}${localProjectKnowledgeCtx}${thinkingPartnerCtx}${astraStanceCtx}${secondBrainTraceCtx}${v4PolicyCtx}${knowledgeMixCtx}${casualCtx}\n\n[CONTEXT]\n${memoryCtx}\n${knowledgeContextForPrompt}\n${contextBlock}\n[/CONTEXT]\n${negativeCtx}`;
+                fullSystemPrompt = `${systemPrompt}${modeBridgeCtx ? '\n\n' + modeBridgeCtx : ''}${designerCtx}${projectArchitectureCtx}${localProjectKnowledgeCtx}${thinkingPartnerCtx}${astraStanceCtx}${secondBrainTraceCtx}${v4PolicyCtx}${knowledgeMixCtx}${casualCtx}\n\n[CONTEXT]\n${memoryCtx}\n${knowledgeContextForPrompt}\n${contextBlock}\n[/CONTEXT]\n${negativeCtx}`;
            }
            // ──────────────────────────────────────────────────────────────────
            // [Context Limit Manager] context length 는 "답변을 그만큼 길게 써도 된다"
@@ -768,14 +811,17 @@ export class AgentExecutor {
            );
            let budgetedHistory: ChatMessage[] = reqMessages;
            if (config.autoCompactHistory) {
-                const trim = trimHistoryToBudget<ChatMessage>(reqMessages, historyBudget, (n) => ({
+                // v2.2.69 — dropped 메시지를 받아 heuristic 요약을 만든 뒤 한 system 메시지로 prepend.
+                // 단순 count 마커는 "이전에 무슨 얘기를 했는지" 를 전혀 알려주지 않아 후속 턴에서 모델이
+                // 맥락을 잃어버리는 회귀를 낳았다. 이제는 U1/A1/U2/A2 골자가 남아 sliding window 가 동작.
+                const trim = trimHistoryToBudget<ChatMessage>(reqMessages, historyBudget, (_n, dropped) => ({
                    role: 'system',
-                    content: `[이전 대화 ${n}개 메시지는 컨텍스트 한계 때문에 이번 요청에서 생략되었습니다. 필요하면 사용자에게 다시 확인하세요.]`,
+                    content: this.buildDroppedHistorySummary(dropped),
                    internal: true,
                }));
                budgetedHistory = trim.messages;
                if (trim.droppedCount > 0) {
-                    logInfo('Conversation history compacted to fit the context window.', {
+                    logInfo('Conversation history compacted to fit the context window (with summary).', {
                        model: actualModel, droppedCount: trim.droppedCount, historyBudget,
                    });
                }
@@ -864,8 +910,12 @@ export class AgentExecutor {
            // policy enforcement) emits a final `streamReplace` so the bubble
            // ends up matching the cleaned answer regardless of what slipped
            // through live.
-            const postLiveDeltas = loopDepth === 0;
+            // [Clean Stream] g1nation.liveStreamTokens=false (기본) 이면 토큰을 내부에만
+            // 누적하고 sanitize 끝난 최종 답변만 한 번에 표시 → Harmony/think 마커가 잠깐
+            // 화면에 노출되는 누설을 원천 차단한다. true 로 두면 legacy 라이브 스트리밍.
+            const postLiveDeltas = loopDepth === 0 && getConfig().liveStreamTokens === true;

+            let lmStudioStats: ChatStreamStats | undefined;
            if (useLmStudioSdk) {
                apiUrl = `${ollamaUrl} (sdk)`;
                logInfo('Streaming chat via LM Studio SDK.', { model: actualModel });
@@ -876,15 +926,35 @@ export class AgentExecutor {
                        temperature,
                        maxTokens: maxOutputTokens,
                        contextOverflowPolicy: config.contextOverflowPolicy,
+                        ...this.lmStudioSamplingFromConfig(),
+                        ...this.lmStudioRespondExtrasFromConfig(),
                        signal: this.abortController.signal,
                    });
-                    for await (const { token, stopReason } of stream) {
+                    for await (const { token, stopReason, stats } of stream) {
                        if (this.isStaleRun(runId)) return;
                        if (token) {
                            aiResponseText += token;
                            if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
                        }
                        if (stopReason) finishStopReason = stopReason;
+                        if (stats) lmStudioStats = stats;
+                    }
+                    if (lmStudioStats && getConfig().lmStudioShowStatsInBudget && loopDepth === 0) {
+                        this.webview.postMessage({
+                            type: 'lmStudioStats',
+                            value: {
+                                model: actualModel,
+                                tokensPerSecond: lmStudioStats.tokensPerSecond,
+                                timeToFirstTokenSec: lmStudioStats.timeToFirstTokenSec,
+                                predictedTokensCount: lmStudioStats.predictedTokensCount,
+                                promptTokensCount: lmStudioStats.promptTokensCount,
+                                totalTimeSec: lmStudioStats.totalTimeSec,
+                                draftModelKey: lmStudioStats.draftModelKey,
+                                draftTokensCount: lmStudioStats.draftTokensCount,
+                                acceptedDraftTokensCount: lmStudioStats.acceptedDraftTokensCount,
+                                stopReason: finishStopReason,
+                            },
+                        });
                    }
                } catch (err: any) {
                    if (err?.name === 'AbortError' || this.abortController.signal.aborted) {
@@ -1007,60 +1077,34 @@ export class AgentExecutor {
            //
            // Only attempts recovery on loopDepth === 0 — we don't want to
            // ping-pong inside the autonomous action loop.
+            //
+            // Note: the previous SDK handle-reset retry that lived here is now done
+            // inside `LMStudioStreamer.stream()` itself (it auto-recreates the SDK
+            // on attempt 2 for both dead-handle errors *and* clean-but-empty streams),
+            // so by the time we get here with `useLmStudioSdk` and no text, the SDK
+            // path has already tried twice. Go straight to the REST fallback.
            if (!aiResponseText.trim() && !this.abortController?.signal.aborted && loopDepth === 0) {
-                if (useLmStudioSdk && this.options.lmStudioStreamer?.resetHandle) {
-                    try {
-                        logInfo('Empty SDK stream — resetting LM Studio handle and retrying streaming once.', { model: actualModel });
-                        await this.options.lmStudioStreamer.resetHandle(actualModel);
-                        const retryStream = this.options.lmStudioStreamer.stream({
-                            modelName: actualModel,
-                            messages: messagesForRequest.map((m) => ({ role: m.role, content: m.content })),
-                            temperature,
-                            maxTokens: maxOutputTokens,
-                            contextOverflowPolicy: config.contextOverflowPolicy,
-                            signal: this.abortController.signal,
-                        });
-                        let retryText = '';
-                        for await (const { token, stopReason } of retryStream) {
-                            if (this.isStaleRun(runId)) return;
-                            if (token) {
-                                retryText += token;
-                                if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
-                            }
-                            if (stopReason) finishStopReason = stopReason;
-                        }
-                        if (retryText.trim()) {
-                            aiResponseText = retryText;
-                            logInfo('Handle-reset retry recovered the answer.', { model: actualModel, length: retryText.length });
-                        }
-                    } catch (retryErr: any) {
-                        logError('Handle-reset retry failed.', { model: actualModel, error: retryErr?.message ?? String(retryErr) });
-                    }
-                }
-
-                if (!aiResponseText.trim() && !this.abortController?.signal.aborted) {
-                    try {
-                        logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
-                        const fallback = await this.callNonStreaming({
-                            baseUrl: ollamaUrl,
-                            modelName: actualModel,
-                            engine,
-                            messages: messagesForRequest,
-                            temperature,
-                            maxTokens: maxOutputTokens,
-                            contextLength: ctxLimits.contextLength,
-                            signal: this.abortController?.signal,
-                        });
-                        if (fallback.stopReason) finishStopReason = fallback.stopReason;
-                        if (fallback.text && fallback.text.trim()) {
-                            aiResponseText = fallback.text;
-                            logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.text.length });
-                        }
-                    } catch (recoverErr: any) {
-                        logError('Non-streaming fallback also failed.', {
-                            engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
-                        });
+                try {
+                    logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
+                    const fallback = await this.callNonStreaming({
+                        baseUrl: ollamaUrl,
+                        modelName: actualModel,
+                        engine,
+                        messages: messagesForRequest,
+                        temperature,
+                        maxTokens: maxOutputTokens,
+                        contextLength: ctxLimits.contextLength,
+                        signal: this.abortController?.signal,
+                    });
+                    if (fallback.stopReason) finishStopReason = fallback.stopReason;
+                    if (fallback.text && fallback.text.trim()) {
+                        aiResponseText = fallback.text;
+                        logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.text.length });
                    }
+                } catch (recoverErr: any) {
+                    logError('Non-streaming fallback also failed.', {
+                        engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
+                    });
                }
            }

@@ -1183,7 +1227,12 @@ export class AgentExecutor {
                }
                if (this.isStaleRun(runId)) return;
            }
-            const cleanedVisible = cleaned.visible;
+            // [Plain Text Output] outputFormat='plain' (기본)이면 모델이 무심코 내보낸
+            // 마크다운 마커(`##`, `**`, `> `, `* ` …) 를 후처리로 모두 제거. 라벨 텍스트는 유지.
+            // markdown 모드면 legacy 그대로 통과.
+            const cleanedVisible = getConfig().outputFormat === 'plain'
+                ? stripMarkdownFormatting(cleaned.visible)
+                : cleaned.visible;

            // 5. Execute Actions
            const rationale = this.parseRationale(cleanedVisible);
@@ -1235,7 +1284,13 @@ export class AgentExecutor {
            if (notice && assistantContent.trim()) {
                assistantContent = assistantContent.trimEnd() + notice;
            }
-            const finalAssistantContent = assistantContent;
+            // [Plain Text Output — FINAL pass] enforcer 들이 `## 경로 확인 결과` 같은 하드코딩 헤더를
+            // 다시 prepend 한 후에도 마커가 남지 않도록, webview / chatHistory 에 들어가는 최종 문자열을
+            // 한 번 더 sanitize. cleanedVisible 단계의 1차 sanitize 는 model 출력 자체를 정리하고,
+            // 이 2차 sanitize 는 enforcer 출력까지 모두 청소한다.
+            const finalAssistantContent = getConfig().outputFormat === 'plain'
+                ? stripMarkdownFormatting(assistantContent)
+                : assistantContent;

            const assistantMessage: ChatMessage = { role: 'assistant', content: finalAssistantContent, internal: false, rationale };
            this.chatHistory.push(assistantMessage);
@@ -1470,21 +1525,33 @@ export class AgentExecutor {
                : '';

            // 워크플로우 매니저에게 설정 기반 실행 위임
-            const finalReport = await AgentWorkflowManager.runStrictWorkflow(
+            // [Clean Stream] 단계 진행 메시지는 채팅 본문(streamChunk) 이 아닌 사이드바
+            // 상단의 workflowStage 인디케이터로만 표시한다 → "생각 단계가 본문에 계속 보임"
+            // 답답함 제거. 채팅 버블에는 최종 답변만 한 번에 들어간다.
+            const rawFinalReport = await AgentWorkflowManager.runStrictWorkflow(
                prompt,
                modelName,
                `${brainContext}${selectedAgentContext}${designerContext}`,
                signal,
                (step, msg) => {
-                    this.webview?.postMessage({ type: 'autoContinue', value: `${step}: ${msg}` });
-                    // 각 단계별 시작을 알림
-                    this.webview?.postMessage({ type: 'streamChunk', value: `\n\n> **[${step}]** ${msg}\n\n` });
+                    this.webview?.postMessage({
+                        type: 'workflowStage',
+                        value: { step, message: msg, done: step === '완료' || step === '오류' }
+                    });
                }
            );

            if (signal.aborted || !this.webview) return;

-            this.webview.postMessage({ type: 'streamChunk', value: `\n\n--- \n\n${finalReport}` });
+            // [Plain Text Output] Synthesizer가 잘 따라줬어도 작은 모델은 `##` `**` 를 흘리는 경우가 있어
+            // 최종 후처리로 한 번 더 마커를 벗긴다. 채팅 history 에도 정제된 결과만 남겨 다음 턴 컨텍스트에서
+            // 마커가 재학습되는 일을 막는다.
+            const finalReport = getConfig().outputFormat === 'plain'
+                ? stripMarkdownFormatting(rawFinalReport)
+                : rawFinalReport;
+
+            this.webview.postMessage({ type: 'streamChunk', value: finalReport });
+            this.webview.postMessage({ type: 'workflowStage', value: { step: '완료', message: '', done: true } });
            this.webview.postMessage({ type: 'streamEnd' });

            this.chatHistory.push({ role: 'assistant', content: finalReport });
@@ -1494,6 +1561,8 @@ export class AgentExecutor {
            this.webview.postMessage({ type: 'autoContinue', value: '✅ 모든 분석이 성공적으로 완료되었습니다.' });

        } catch (error: any) {
+            // 어떤 종료 경로에서든 stage indicator 는 반드시 닫는다 — 안 닫으면 사이드바에 영원히 "③ 자기 검증..." 가 남는다.
+            this.webview?.postMessage({ type: 'workflowStage', value: { step: '완료', message: '', done: true } });
            if (error.name === 'AbortError' || error.message?.includes('cancelled')) {
                this.statusBarManager.updateStatus(AgentStatus.Idle, 'Workflow Cancelled');
                return;
@@ -1537,10 +1606,23 @@ export class AgentExecutor {
                    temperature: 0.3,
                    maxTokens: subMaxTokens,
                    contextOverflowPolicy,
+                    ...this.lmStudioSamplingFromConfig(),
+                    ...this.lmStudioRespondExtrasFromConfig(),
                    signal: this.abortController?.signal,
                });
-                for await (const { token } of stream) {
+                let subStopReason: string | undefined;
+                for await (const { token, stopReason } of stream) {
                    if (token) responseText += token;
+                    if (stopReason) subStopReason = stopReason;
+                }
+                // Sub-agent answers that got cut mid-sentence corrupt the pipeline silently
+                // (Planner produces a half-step, Writer can't recover). Surface a warn log so
+                // the operator can raise subMaxTokens or pick a less aggressive output budget.
+                if (subStopReason && /maxPredicted|context|truncat/i.test(subStopReason)) {
+                    logError('Sub-agent answer hit a generation limit.', {
+                        role, model: modelName, stopReason: subStopReason,
+                        chars: responseText.length, maxTokens: subMaxTokens,
+                    });
                }
                return responseText;
            } catch (err: any) {
@@ -1726,12 +1808,13 @@ export class AgentExecutor {
                return [
                    'Intent operating contract — Code Review:',
                    'The user wants a real review, not a meta-plan of how to review.',
-                    'Required sections in this exact order, in Korean:',
-                    '  1. ## 한 줄 판단 — one sentence: would you rely on this today, and under what constraint?',
-                    '  2. ## 잘된 점 — 2~4 concrete strengths. Each MUST cite a specific file path (and a function or section if you can name one) and explain WHY it works, not just that it exists.',
-                    '  3. ## 부족한 점 — 2~4 concrete weaknesses or risks. Same rule: cite a specific file/area, name the actual problem (race condition, missing retry, coupling, etc.), and say what breaks because of it.',
-                    '  4. ## 사용자 관점 개선 — 2~4 changes phrased from the END USER\'s perspective ("when X happens, the user currently sees Y; they should see Z"). Tie each to a code location that needs to change.',
-                    '  5. ## 다음 한 수 — exactly one next action, small enough to do this week.',
+                    'OUTPUT FORMAT: PLAIN TEXT only. Section labels are bare words on their own line (no "#", "##", "**", "__", "> "). Bullets use "- ". Long answers MUST start with a "핵심 요약" block (2~4 bullets) before any detail.',
+                    'Required sections in this exact order, in Korean (each label appears as a plain line, NOT a markdown heading):',
+                    '  1) 한 줄 판단 — one sentence: would you rely on this today, and under what constraint?',
+                    '  2) 잘된 점 — 2~4 concrete strengths. Each MUST cite a specific file path (and a function or section if you can name one) and explain WHY it works, not just that it exists.',
+                    '  3) 부족한 점 — 2~4 concrete weaknesses or risks. Same rule: cite a specific file/area, name the actual problem (race condition, missing retry, coupling, etc.), and say what breaks because of it.',
+                    '  4) 사용자 관점 개선 — 2~4 changes phrased from the END USER\'s perspective ("when X happens, the user currently sees Y; they should see Z"). Tie each to a code location that needs to change.',
+                    '  5) 다음 한 수 — exactly one next action, small enough to do this week.',
                    '',
                    'Hard rules — these are the things that made past reviews feel like a template:',
                    '- Do NOT write meta-sentences like "확인해야 합니다", "다음 리뷰에서는 ~를 보면 됩니다", "~로 보입니다", "~인지 확인하는 것이 핵심입니다". Either you observed it or you read the file with <read_file> right now.',
@@ -1998,12 +2081,53 @@ export class AgentExecutor {
            return false;
        }

-        const complexByShape = prompt.length > 180 || /(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|roadmap|research|report|deep\s*analysis|strategy|proposal)/i.test(prompt);
-        if (!complexByShape) {
+        const cfg = getConfig();
+        const mode = cfg.workflowMultiAgentMode || 'auto';
+
+        // 'off' → 기존 키워드/길이 휴리스틱만 사용 (legacy multiAgentEnabled 토글 존중).
+        if (mode === 'off') {
+            const legacyComplex = prompt.length > 180 || /(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|roadmap|research|report|deep\s*analysis|strategy|proposal)/i.test(prompt);
+            if (!legacyComplex) return false;
+            return configEnabled || /(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|research|report|deep\s*analysis|strategy|proposal)/i.test(prompt);
+        }
+
+        // 인사·잡담은 5단계 파이프라인 낭비. 짧은 casual prompt 는 제외.
+        if (this.isCasualConversationPrompt(prompt)) {
+            return false;
+        }
+        if (prompt.trim().length < 12) {
            return false;
        }

-        return configEnabled || /(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|research|report|deep\s*analysis|strategy|proposal)/i.test(prompt);
+        // 'always' → 위 가드만 통과하면 무조건 발동.
+        if (mode === 'always') return true;
+
+        // 'auto' → 다음 중 하나라도 만족하면 발동:
+        //   (1) 사용자가 multiAgentEnabled 를 명시적으로 켰다,
+        //   (2) 작은 모델 (≤4B params) 이라 한 번에 처리하기 위험,
+        //   (3) prompt 토큰이 효과적 context window 의 임계 이상을 차지한다,
+        //   (4) "보고서/리뷰/심층 분석" 같은 명백한 복합 작업 키워드 매치,
+        //   (5) prompt 길이 자체가 큼 (>240 chars).
+        if (configEnabled) return true;
+
+        const paramB = estimateModelParamsB(cfg.defaultModel);
+        if (paramB !== null && paramB <= 4) return true;
+
+        try {
+            const effectiveCtx = cfg.smallModelContextCap > 0 && paramB !== null && paramB <= 4
+                ? cfg.smallModelContextCap
+                : cfg.contextLength;
+            const promptTokens = estimateTokens(prompt);
+            const threshold = Math.floor(effectiveCtx * cfg.workflowAutoCtxFractionThreshold);
+            if (promptTokens >= threshold) return true;
+        } catch { /* 안전한 폴백: 키워드/길이 체크로 진행 */ }
+
+        if (/(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|코드\s*리뷰|리뷰|아키텍처|architecture|research|report|deep\s*analysis|strategy|proposal|review)/i.test(prompt)) {
+            return true;
+        }
+        if (prompt.length > 240) return true;
+
+        return false;
    }

    private buildAstraModeArchitectureContext(prompt: string): string {
@@ -2129,6 +2253,78 @@ export class AgentExecutor {
        }
    }

+    /**
+     * v2.2.69 — Compresses messages dropped by the sliding window into a plain-text digest, without
+     * an extra LLM call: one `U<n>:` / `A<n>:` line per non-internal user/assistant message holding
+     * its first sentence (markdown headings, `**` and fenced code are stripped first). A sentence ends
+     * at `。`, or at `.`/`!`/`?` followed by whitespace or end of text, so "agent.ts" stays intact.
+     * @param dropped Messages removed from the request by the history trimmer.
+     * @returns '' when empty; else a header plus up to 8 recent lines (older ones reduced to a count).
+     */
+    private buildDroppedHistorySummary(dropped: ChatMessage[]): string {
+        if (dropped.length === 0) return '';
+        const lines: string[] = [];
+        const firstSentence = (s: string): string => {
+            const cleaned = String(s || '')
+                .replace(/^\s{0,3}#{1,6}\s+/gm, '')
+                .replace(/\*\*/g, '')
+                .replace(/`{3}[\s\S]*?`{3}/g, '[code]')
+                .replace(/\s+/g, ' ')
+                .trim();
+            // Cap at 140 code points (Array.from), so an emoji is never cut into a lone surrogate.
+            const capped = Array.from(cleaned).slice(0, 140).join('');
+            const m = capped.match(/^.*?[^.!?。\s](?:。|[.!?]+(?=\s|$))/);
+            return (m ? m[0] : capped).trim();
+        };
+        let userTurnIdx = 0;
+        for (const msg of dropped) {
+            if (msg.internal) continue;
+            const content = typeof msg.content === 'string' ? msg.content : '';
+            if (!content.trim()) continue;
+            if (msg.role === 'user') {
+                userTurnIdx++;
+                lines.push(`U${userTurnIdx}: ${firstSentence(content)}`);
+            } else if (msg.role === 'assistant') {
+                lines.push(`A${userTurnIdx}: ${firstSentence(content)}`);
+            }
+        }
+        // When there are too many lines, keep only the newest MAX_LINES and report the rest as a count.
+        const MAX_LINES = 8;
+        if (lines.length > MAX_LINES) {
+            const tail = lines.slice(-MAX_LINES);
+            const head = lines.slice(0, lines.length - MAX_LINES);
+            return `[이전 대화 요약 — 총 ${dropped.length}개 메시지가 컨텍스트 한계로 생략됨]\n(더 오래된 ${head.length}개 턴 생략됨)\n${tail.join('\n')}`;
+        }
+        return `[이전 대화 요약 — 총 ${dropped.length}개 메시지가 컨텍스트 한계로 생략됨]\n${lines.join('\n')}`;
+    }
+
+    /**
+     * v2.2.69 — Builds the mode signature of the current request: a `|`-joined list of
+     * `agent=`, `company=`, `multi=` and `brain=` entries, with fallbacks for missing values.
+     * @param opts Mode inputs; a missing agent becomes 'none', a missing brain '?', flags 'on'/'off'.
+     * @returns The signature string, e.g. `agent=none|company=off|multi=off|brain=?`.
+     */
+    private computeModeSignature(opts: { agentSkillName?: string; companyMode?: boolean; multiAgent?: boolean; brainName?: string }): string {
+        const { agentSkillName, companyMode, multiAgent, brainName } = opts;
+        const onOff = (flag?: boolean): string => (flag ? 'on' : 'off');
+        const agent = agentSkillName || 'none';
+        const brain = brainName || '?';
+        return `agent=${agent}|company=${onOff(companyMode)}|multi=${onOff(multiAgent)}|brain=${brain}`;
+    }
+
+    /**
+     * v2.2.69 — Produces the one-line "previous topic" used by the mode-transition bridge: the text of
+     * the most recent non-internal user message in chatHistory, whitespace collapsed, at most 120
+     * characters long.
+     * @returns The topic line, or '' when no such message exists or its content is not a string.
+     */
+    private buildLastTopicLine(): string {
+        const lastUser = [...this.chatHistory].reverse().find(m => !m.internal && m.role === 'user');
+        if (!lastUser || typeof lastUser.content !== 'string') return '';
+        // Cut by code point (Array.from), not UTF-16 unit, so an emoji is never split into a lone surrogate.
+        return Array.from(lastUser.content.replace(/\s+/g, ' ').trim()).slice(0, 120).join('');
+    }
+
    private buildRequestHistory(history: ChatMessage[]): ChatMessage[] {
        return history.map((message) => {
            if (message.role !== 'assistant' || typeof message.content !== 'string') {
@@ -2957,17 +3153,23 @@ export class AgentExecutor {
        // 같은 엔진 내에서만 model candidate / message variant retry
        for (const candidateModel of modelCandidates) {
            for (const variant of messageVariants) {
+                const sampling = samplingToRestBody(this.lmStudioSamplingFromConfig());
                const streamBody = {
                    model: candidateModel,
                    messages: variant.messages,
                    stream: true,
                    ...(engine === 'lmstudio'
-                        ? { max_tokens: maxTokens, temperature }
-                        : { options: { num_ctx: numCtx, num_predict: maxTokens, temperature } }),
+                        // LM Studio's OpenAI-compatible REST extends the schema with top_k/min_p/
+                        // repeat_penalty (same names as Ollama). Spread the shared sampling block so
+                        // the REST fallback matches the SDK path — without it a fallback after a
+                        // dead handle quietly loses the glitch-suppression preset.
+                        ? { max_tokens: maxTokens, temperature, ...sampling }
+                        : { options: { num_ctx: numCtx, num_predict: maxTokens, temperature, ...sampling } }),
                };

                // 일시적 네트워크 오류용 retry (최대 2회, 지수 backoff)
                const MAX_RETRIES = 2;
+                let serviceDown = false;
                for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
                    try {
                        if (attempt > 0) {
@@ -3013,13 +3215,33 @@ export class AgentExecutor {
                        if (lastError.name === 'AbortError') {
                            throw lastError;
                        }
+                        // ECONNREFUSED / DNS-level failures mean the engine process isn't even
+                        // listening — no amount of retries or message-variant juggling will help.
+                        // Abandon the candidate/variant loops now and surface the "is X running?"
+                        // error fast instead of burning 12 fetch attempts before giving up.
+                        const errCode = (error?.cause?.code ?? error?.code ?? '').toString();
+                        const errMsg = lastError.message;
+                        if (
+                            errCode === 'ECONNREFUSED' || errCode === 'ENOTFOUND' || errCode === 'EAI_AGAIN'
+                            || /ECONNREFUSED|ENOTFOUND|getaddrinfo|fetch failed/i.test(errMsg)
+                        ) {
+                            serviceDown = true;
+                            logError('AI streaming request: engine appears to be down.', {
+                                engine, apiUrl, code: errCode, error: errMsg,
+                            });
+                            break; // exit retry loop
+                        }
                        logError('AI streaming request failed.', {
                            engine, variant: variant.name, apiUrl, model: candidateModel,
                            attempt, error: lastError.message
                        });
                    }
                }
+                if (serviceDown) break; // skip remaining variants
            }
+            // `serviceDown` is scoped to the variant loop, so the engine-down case is re-detected
+            // here from the last error message to leave the model-candidate loop as well.
+            if (lastError && /ECONNREFUSED|ENOTFOUND|fetch failed/i.test(lastError.message)) break;
        }

        // 명확한 에러 메시지: 어느 엔진이 실패했는지 사용자에게 알림
@@ -3151,13 +3373,14 @@ export class AgentExecutor {
        const numCtx = Math.max(2048, params.contextLength ?? 32768);
        const apiUrl = buildApiUrl(baseUrl, engine, 'chat');
        const variants = this.buildEngineMessageVariants(messages, engine);
+        const sampling = samplingToRestBody(this.lmStudioSamplingFromConfig());
        const body = {
            model: modelName,
            messages: variants[0].messages,
            stream: false,
            ...(engine === 'lmstudio'
-                ? { max_tokens: maxTokens, temperature }
-                : { options: { num_ctx: numCtx, num_predict: maxTokens, temperature } }),
+                ? { max_tokens: maxTokens, temperature, ...sampling }
+                : { options: { num_ctx: numCtx, num_predict: maxTokens, temperature, ...sampling } }),
        };
        const response = await fetch(apiUrl, {
            method: 'POST',
@@ -3231,6 +3454,8 @@ export class AgentExecutor {
                    temperature: params.temperature,
                    maxTokens: params.maxTokens,
                    contextOverflowPolicy: params.contextOverflowPolicy,
+                    ...this.lmStudioSamplingFromConfig(),
+                    ...this.lmStudioRespondExtrasFromConfig(),
                    signal: params.signal,
                });
                for await (const { token, stopReason } of stream) {
@@ -3356,6 +3581,29 @@ export class AgentExecutor {
        ];
    }

+    /**
+     * Reads the LM Studio sampling knobs (top-p, top-k, min-p, repeat penalty) from the current
+     * config as one shared block. The SDK streaming call spreads this block directly and the REST
+     * request bodies pass it through `samplingToRestBody`, so both paths send the same settings;
+     * otherwise a REST fallback after a dead SDK handle would silently drop the glitch
+     * suppression (e.g. broken Hangul tokens). The Ollama body spreads the block into `options`.
+     */
+    private lmStudioSamplingFromConfig(): LmStudioSampling {
+        const {
+            lmStudioTopP: topP,
+            lmStudioTopK: topK,
+            lmStudioMinP: minP,
+            lmStudioRepeatPenalty: repeatPenalty,
+        } = getConfig();
+        return { topP, topK, minP, repeatPenalty };
+    }
+
+    /** Extras passed only to the SDK `respond()` call: the speculative-decoding draft model, when one is configured. */
+    private lmStudioRespondExtrasFromConfig(): { draftModel?: string } {
+        const { lmStudioDraftModel: draftModel } = getConfig();
+        return draftModel ? { draftModel } : {};
+    }
+
    private buildModelCandidates(modelName: string, engine: 'lmstudio' | 'ollama'): string[] {
        const candidates = [modelName];
        if (engine === 'lmstudio') {