v2.2.256: 코어 채팅 큰 입력 청킹·통합 + 실제 컨텍스트 창 정렬 + 모델 핸들 race 수정

큰 입력 시 "Failed to acquire LM Studio model handle … Operation canceled" 로 턴 전체가 죽던 문제를 3계층으로 해결. 일반 채팅(코어 경로)은 그동안 단일 예산 호출이라 약한 모델·큰 입력에서 무너졌다 — 그 갭을 메움.

- 핸들 race 수정: 재시도 루프 안의 getModelHandle 획득을 별도 try/catch 로 보호. 취소/죽은-핸들 류 에러는 SDK 재생성 후 1회 자동 재시도(실제 사용자 취소는 존중). 원인: 라이프사이클 매니저의 동시 로드가 abort 되면서, SDK 가 그 로드에 coalesce 한 JIT 조회까지 함께 취소되던 것.
- Phase 1 실제 창 정렬: llm.getContextLength()(캐시)로 실측 창에 예산 클램프. 설정값보다 작은 창으로 로드된 경우 서버 truncation/빈 답변 차단. 배지에 표시.
- Phase 2 코어 Map-Reduce: 단일 입력이 (유효 창 × ratio) 초과 시 청크→질의 인지형 추출→통합. 부분/전체 폴백, 무관 시 정직 신호. 동시성 기본 2.
- Phase 3 메타 노출: 진행/결과 배지 표시, [조각 k] 출처 옵트인. 신규 설정 5종.

/meet·/review 전용 경로는 불변. 테스트 +25건, 전체 684 통과.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 18:05:44 +09:00
parent 6adbc2a6fa
commit 76d5fedfb5
13 changed files with 883 additions and 19 deletions
@@ -140,6 +140,7 @@ export { _parseTaskAttrs, _parseSheetAttrs, _parseCalEventAttrs };
 // 8 method bodies extracted to dedicated modules. AgentExecutor 의 동명 메서드는
 // 이제 thin wrapper — deps 객체를 묶어서 free function 으로 위임.
 import { callNonStreaming as callNonStreamingFn } from './agent/llm/callNonStreaming';
+import { runMapReduce, shouldMapReduce } from './agent/handlePrompt/largeInputMapReduce';
 import { createStreamingRequest as createStreamingRequestFn } from './agent/llm/createStreamingRequest';
 import { streamChatOnce as streamChatOnceFn } from './agent/llm/streamChatOnce';
 import { maybeEmitDevilRebuttal as maybeEmitDevilRebuttalFn } from './agent/llm/devilRebuttal';
@@ -768,12 +769,103 @@ export class AgentExecutor {
            // Context budget computation → src/agent/handlePrompt/computeBudgetedRequest.ts
            const imageCount = (reqMessages as any[])
                .reduce((n, m) => n + (Array.isArray(m?.images) ? m.images.length : 0), 0);
+            // Budget against the model's REAL loaded window, not just the user's
+            // contextLength setting. Best-effort + cached; only for the LM Studio
+            // SDK path (REST/Ollama/cloud expose no such query → undefined → prior behavior).
+            let actualContextLength: number | undefined;
+            try {
+                const _isCloud = (() => {
+                    try {
+                        const { parseModelPrefix } = require('./features/providers') as typeof import('./features/providers');
+                        return !!parseModelPrefix(actualModel);
+                    } catch { return false; }
+                })();
+                if (!_isCloud
+                    && resolveEngine(ollamaUrl) === 'lmstudio'
+                    && this.options.lmStudioStreamer?.getModelContextLength) {
+                    actualContextLength = await this.options.lmStudioStreamer.getModelContextLength(actualModel);
+                }
+            } catch { /* best-effort — fall back to configured window */ }
+
+            // ── Large-input Map-Reduce ────────────────────────────────────────
+            // When a SINGLE user message is too big to fit the (real) window,
+            // history-trimming can't help — you can't drop the current question.
+            // Chunk it, extract only the request-relevant facts per chunk, and
+            // integrate, then let the normal streaming path answer from the
+            // condensed context. Only the user-visible turn; casual chat skipped.
+            if (loopDepth === 0 && !isCasualConversation && config.largeInputMapReduce) {
+                try {
+                    const effWindow = (typeof actualContextLength === 'number' && actualContextLength > 0)
+                        ? Math.min(config.contextLength, actualContextLength)
+                        : config.contextLength;
+                    const lastUserIdx = reqMessages.map((m) => m.role).lastIndexOf('user');
+                    const lastUser = lastUserIdx >= 0 ? reqMessages[lastUserIdx] : undefined;
+                    const content = typeof lastUser?.content === 'string' ? lastUser.content : '';
+                    const sysTokens = estimateTokens(fullSystemPrompt) + 4;
+                    const mrCfg = {
+                        enabled: true,
+                        triggerRatio: config.mapReduceTriggerRatio,
+                        concurrency: config.mapReduceConcurrency,
+                        maxDepth: config.mapReduceMaxDepth,
+                        showProvenance: config.mapReduceShowProvenance,
+                    };
+                    if (lastUser && shouldMapReduce(estimateTokens(content), effWindow, mrCfg)) {
+                        const intent = content.length > 1400
+                            ? `${content.slice(0, 800)}\n…\n${content.slice(-400)}`
+                            : content;
+                        const mrEngine = resolveEngine(ollamaUrl);
+                        this.webview?.postMessage({ type: 'mapReduceStatus', value: { phase: 'start' } });
+                        const mr = await runMapReduce(
+                            {
+                                callLLM: async (messages, maxTokens) => {
+                                    const r = await this.callNonStreaming({
+                                        baseUrl: ollamaUrl,
+                                        modelName: actualModel,
+                                        engine: mrEngine,
+                                        messages,
+                                        temperature: 0.1,
+                                        maxTokens,
+                                        contextLength: effWindow,
+                                        signal: this.abortController?.signal,
+                                    });
+                                    return r.text;
+                                },
+                                estimateTokens,
+                                log: (msg, meta) => logInfo(msg, meta),
+                                signal: this.abortController?.signal,
+                            },
+                            { intent, largeContent: content, windowTokens: effWindow, systemTokens: sysTokens, safetyMargin: config.contextSafetyMargin, cfg: mrCfg },
+                        );
+                        // allIrrelevant → keep original (budgeter truncates) rather than forcing an empty context.
+                        if (!mr.allIrrelevant && mr.condensedContext.trim()) {
+                            reqMessages[lastUserIdx] = {
+                                ...lastUser,
+                                content: `${intent}\n\n──────── 추출된 관련 자료 (원본 ${mr.chunkCount}조각 중 ${mr.relevantCount}조각, 통합 ${mr.reduceDepth}단계) ────────\n${mr.condensedContext}`,
+                            } as any;
+                            logInfo('Large input condensed via map-reduce.', {
+                                model: actualModel, chunkCount: mr.chunkCount, relevantCount: mr.relevantCount, reduceDepth: mr.reduceDepth,
+                            });
+                        }
+                        this.webview?.postMessage({
+                            type: 'mapReduceStatus',
+                            value: { phase: 'done', chunkCount: mr.chunkCount, relevantCount: mr.relevantCount, allIrrelevant: mr.allIrrelevant },
+                        });
+                    }
+                } catch (e: any) {
+                    // Any failure → fall through to the normal (single-shot) path. Worst case the
+                    // budgeter truncates the oversized input, which is the prior behavior.
+                    logError('Large-input map-reduce failed — falling back to single-shot path.', { error: e?.message ?? String(e) });
+                    this.webview?.postMessage({ type: 'mapReduceStatus', value: { phase: 'error' } });
+                }
+            }
+
            const _budget = computeBudgetedRequest({
                fullSystemPrompt,
                reqMessages,
                actualModel,
                config,
                imageCount,
+                actualContextLength,
            });
            const messagesForRequest = _budget.messagesForRequest;
            const ctxLimits = _budget.ctxLimits;
@@ -819,6 +911,8 @@ export class AgentExecutor {
                        paramB: modelParamB,
                        contextLength: ctxLimits.contextLength,
                        nominalContextLength: config.contextLength,
+                        actualContextLength,
+                        windowMismatch: _budget.windowMismatch,
                        cappedForSmallModel,
                        inputTokens,
                        maxOutputTokens,
@@ -19,6 +19,13 @@ export interface ComputeBudgetedRequestInput {
    /** Result of `getConfig()` — reads contextLength, maxOutputTokens, contextSafetyMargin, smallModelContextCap, autoCompactHistory. */
    config: any;
    imageCount: number;
+    /**
+     * The model's *actually-loaded* context window (LM Studio `getContextLength()`),
+     * when known. Budgeting uses the smaller of this and `config.contextLength` so we
+     * never overflow a model loaded with a smaller window than the user's setting.
+     * Omit (undefined) to budget against the configured value alone (prior behavior).
+     */
+    actualContextLength?: number;
 }

 export interface ComputeBudgetedRequestResult {
@@ -34,6 +41,10 @@ export interface ComputeBudgetedRequestResult {
    outputBudget: { maxOutputTokens: number; available: number; tight: boolean };
    modelParamB: number | null;
    cappedForSmallModel: boolean;
+    /** True when the model's real loaded window is smaller than `config.contextLength` (we clamped to the real one). */
+    windowMismatch: boolean;
+    /** The window actually used for budgeting (after real-window clamp + small-model cap). */
+    effectiveContextLength: number;
 }

 /**
@@ -60,15 +71,34 @@ export function computeBudgetedRequest(input: ComputeBudgetedRequestInput): Comp
    // smaller effective window. Never applied to 4B+ models, and never when the setting is 0 —
    // capping squeezes the output-token budget, so it's a knob, not a default.
    const modelParamB = estimateModelParamsB(actualModel);
+
+    // The real ceiling is whatever window the model was actually loaded with — the
+    // server truncates anything past it. When known, clamp the configured setting
+    // down to it so we budget against the smaller of the two. (When unknown, keep
+    // the configured value — prior behavior.)
+    const actualWindow = (typeof input.actualContextLength === 'number'
+        && Number.isFinite(input.actualContextLength)
+        && input.actualContextLength > 0)
+        ? input.actualContextLength
+        : undefined;
+    const configuredWindow = config.contextLength;
+    const windowMismatch = actualWindow !== undefined && actualWindow < configuredWindow;
+    const realWindow = actualWindow !== undefined ? Math.min(configuredWindow, actualWindow) : configuredWindow;
+    if (windowMismatch) {
+        logInfo('Model loaded with a smaller context window than the setting — clamping budget to the real window.', {
+            model: actualModel, configuredWindow, actualWindow,
+        });
+    }
+
    const smallModelCap = config.smallModelContextCap; // 0 = disabled (default)
    const cappedForSmallModel = smallModelCap > 0
        && modelParamB !== null && modelParamB <= 3
-        && config.contextLength > smallModelCap;
-    const effectiveContextLength = cappedForSmallModel ? smallModelCap : config.contextLength;
+        && realWindow > smallModelCap;
+    const effectiveContextLength = cappedForSmallModel ? smallModelCap : realWindow;
    if (cappedForSmallModel) {
        logInfo('Small model detected — capping effective context window for budgeting.', {
            model: actualModel, paramB: modelParamB,
-            nominalContext: config.contextLength, effectiveContext: effectiveContextLength,
+            nominalContext: realWindow, effectiveContext: effectiveContextLength,
        });
    }
    const ctxLimits: ContextLimits = {
@@ -157,5 +187,7 @@ export function computeBudgetedRequest(input: ComputeBudgetedRequestInput): Comp
        outputBudget,
        modelParamB,
        cappedForSmallModel,
+        windowMismatch,
+        effectiveContextLength,
    };
 }
@@ -0,0 +1,265 @@
+/**
+ * ============================================================
+ * Large-Input Map-Reduce (큰 입력 청킹 + 통합)
+ *
+ * 한 번에 컨텍스트 창에 안 들어가는 단일 사용자 입력(긴 회의록·리서치 덤프 등)을
+ *   1) 청크로 분할(Map 대상)
+ *   2) 각 청크에서 "요청과 관련된 사실만" 발췌 (질의 인지형 추출 — 일반 요약 X)
+ *   3) 발췌들을 통합(Reduce). 합본이 또 창을 넘으면 계층적으로 재통합.
+ * 한 뒤, 압축된 컨텍스트를 돌려줘 정상 스트리밍 경로가 최종 답변을 생성하게 한다.
+ *
+ * 신뢰성 원칙(ASTRA): 추측·창작 금지, 원문 표현 보존, 출처(`[조각 k]`) 태깅,
+ * 전부 무관하면 정직하게 "관련 내용 없음" 신호.
+ *
+ * LLM 호출은 `callLLM` 으로 주입 → 코어 로직은 네트워크 의존 없이 단위 테스트 가능.
+ * ============================================================
+ */
+
+import type { ChatMessage } from '../../agent';
+import { splitIntoSections } from '../../retrieval/chunker';
+
+export interface MapReduceConfig { // Tuning knobs for the large-input map-reduce pass.
+    enabled: boolean; // When false, `shouldMapReduce` never triggers.
+    /** The pass triggers when the single input's token count exceeds (effective window × triggerRatio). */
+    triggerRatio: number;
+    concurrency: number; // Max extraction / reduce LLM calls in flight at once.
+    maxDepth: number; // Max number of hierarchical reduce rounds.
+    showProvenance: boolean; // When true, each kept extraction is prefixed with a `[조각 k]` source tag.
+}
+
+export interface MapReduceDeps {
+    /** Message array → model response text. (A wrapper around callNonStreaming.) */
+    callLLM: (messages: ChatMessage[], maxTokens: number) => Promise<string>;
+    estimateTokens: (text: string) => number; // Token estimator used for every size check in the reduce phase.
+    log?: (msg: string, meta?: Record<string, unknown>) => void; // Optional logger; a no-op is used when omitted.
+    signal?: AbortSignal; // Optional cancellation signal, checked before each LLM call is started.
+}
+
+export interface MapReduceParams {
+    /** Hint of the user's intent (usually a head/tail excerpt of the original input — that is where the instructions sit). */
+    intent: string;
+    /** The large body of text to be chunked. */
+    largeContent: string;
+    /** Effective context window in tokens. NOTE(review): described as Phase 1's effectiveContextLength, but the visible caller passes min(configured, actual) without the small-model cap — confirm. */
+    windowTokens: number;
+    /** Tokens already consumed by the system prompt. */
+    systemTokens: number;
+    safetyMargin: number; // Extra tokens subtracted from the window in both budget helpers.
+    cfg: MapReduceConfig;
+}
+
+export interface MapReduceResult {
+    /** The integrated relevant material. The normal path replaces the user message body with this. */
+    condensedContext: string;
+    chunkCount: number; // Number of chunks the input was split into.
+    relevantCount: number; // Number of chunks whose extraction was kept (not judged irrelevant).
+    reduceDepth: number; // Number of hierarchical reduce rounds actually run.
+    /** Every chunk was irrelevant → the caller is expected to escalate honestly. */
+    allIrrelevant: boolean;
+}
+
+const IRRELEVANT_MARKER = '(관련 없음)'; // Exact line the extractor is told to emit when a chunk has nothing relevant.
+/** Output-token caps for the extract / reduce calls — kept conservative because an excerpt is shorter than its source. */
+const EXTRACT_OUTPUT_TOKENS = 1024;
+const REDUCE_OUTPUT_TOKENS = 2048;
+/** Token→character conversion (conservative figure for Korean, ~2 chars per token). Used for sizing chunks. */
+const CHARS_PER_TOKEN = 2;
+
+/** Token budget available for input within the window. Said to mirror computeBudgetedRequest's formula (not verifiable from this view). */
+export function inputBudgetTokens(windowTokens: number, systemTokens: number, safetyMargin: number): number {
+    const outputReserve = Math.max(2048, Math.floor(windowTokens * 0.1)); // reserve max(2048, 10% of the window) for output
+    return Math.max(256, windowTokens - systemTokens - outputReserve - safetyMargin); // floor of 256 so the budget is never zero or negative
+}
+
+/** Whether a single input qualifies for map-reduce: cfg.enabled is set and the input exceeds (window × triggerRatio). */
+export function shouldMapReduce(latestUserTokens: number, windowTokens: number, cfg: MapReduceConfig): boolean {
+    if (!cfg.enabled) return false;
+    if (windowTokens <= 0) return false; // no usable window → never trigger
+    return latestUserTokens > windowTokens * cfg.triggerRatio; // strictly greater: an input exactly at the threshold does not trigger
+}
+
+/** Character cap for one chunk so that (chunk + extraction-prompt overhead + reserved output) fits in the window. */
+export function chunkCharBudget(windowTokens: number, systemTokens: number, safetyMargin: number): number {
+    // Assume the extraction prompt itself (instructions + intent) costs ~800 tokens.
+    const promptOverhead = 800;
+    const perChunkTokenBudget = Math.max(
+        512,
+        windowTokens - systemTokens - safetyMargin - EXTRACT_OUTPUT_TOKENS - promptOverhead
+    );
+    // Use only 70% of it, conservatively, to absorb token-estimation error.
+    return Math.floor(perChunkTokenBudget * CHARS_PER_TOKEN * 0.7);
+}
+
+function buildExtractPrompt(intent: string, chunkText: string, idx: number, total: number): ChatMessage[] { // Builds the [system, user] message pair for extracting request-relevant facts from chunk `idx` of `total`.
+    const system = [ // Korean extractor rules: verbatim facts only, no summarizing or inventing, emit the marker alone when nothing is relevant, terse bullets.
+        '너는 긴 자료에서 사용자 요청에 필요한 사실만 정확히 발췌하는 추출기다.',
+        '규칙:',
+        '1) 사용자 요청과 직접 관련된 사실·수치·발언·결정사항만 원문 표현 그대로 발췌한다.',
+        '2) 요약·추측·창작·일반화 금지. 자료에 없는 내용은 절대 만들지 않는다.',
+        `3) 이 조각에 관련 내용이 전혀 없으면 정확히 "${IRRELEVANT_MARKER}" 한 줄만 출력한다.`,
+        '4) 불릿(-)으로 간결하게. 각 항목은 자료에 근거해야 한다.',
+    ].join('\n');
+    const user = [ // User turn: the intent hint, then the labeled chunk, then the closing instruction.
+        `[사용자 요청 의도]\n${intent}`,
+        `\n[자료 조각 ${idx}/${total}]\n${chunkText}`,
+        `\n위 조각에서 요청 수행에 필요한 사실만 발췌하라. 없으면 "${IRRELEVANT_MARKER}".`,
+    ].join('\n');
+    return [
+        { role: 'system', content: system },
+        { role: 'user', content: user },
+    ];
+}
+
+function buildReducePrompt(intent: string, extractions: string): ChatMessage[] { // Builds the [system, user] message pair for merging several extractions into one without duplicates.
+    const system = [ // Korean integrator rules: keep only facts present in the extractions, merge duplicates, preserve any [조각 k] tags.
+        '너는 여러 발췌를 중복 없이 하나로 통합하는 통합기다.',
+        '규칙: 발췌에 있는 사실만 유지하고, 중복은 병합한다. 추측·창작 금지.',
+        '원문 사실과 (있다면) [조각 k] 출처 표기를 보존한다.',
+    ].join('\n');
+    const user = `[사용자 요청 의도]\n${intent}\n\n[발췌 모음]\n${extractions}\n\n위 발췌들을 요청 관점에서 중복 없이 통합하라.`;
+    return [
+        { role: 'system', content: system },
+        { role: 'user', content: user },
+    ];
+}
+
+/** Concurrency-limited map: at most `limit` calls of `fn` in flight, results stored at their input index. A rejection from `fn` rejects the whole call. */
+async function mapWithConcurrency<T, R>(
+    items: T[],
+    limit: number,
+    fn: (item: T, index: number) => Promise<R>,
+    signal?: AbortSignal,
+): Promise<R[]> {
+    const results: R[] = new Array(items.length);
+    let next = 0; // shared cursor: each worker claims the next unprocessed index
+    const n = Math.max(1, Math.min(limit, items.length)); // at least one worker, never more workers than items
+    const workers = Array.from({ length: n }, async () => {
+        while (true) {
+            if (signal?.aborted) return; // stop claiming work once aborted — NOTE(review): slots not yet processed stay unset in `results`
+            const i = next++;
+            if (i >= items.length) return;
+            results[i] = await fn(items[i], i);
+        }
+    });
+    await Promise.all(workers);
+    return results;
+}
+
+function isIrrelevant(text: string): boolean { // True when an extraction is empty or is just the "not relevant" marker.
+    const t = (text || '').trim(); // tolerate null/undefined results
+    return t.length === 0 || t === IRRELEVANT_MARKER || /^\(?\s*관련\s*없음\s*\)?$/.test(t); // regex also accepts the marker with missing parentheses or different spacing
+}
+
+/**
+ * Chunks a large input, extracts per chunk, then integrates. The caller is assumed to have already passed
+ * the trigger check; with a single chunk only the extraction runs and no reduce call is made.
+ */
+export async function runMapReduce(deps: MapReduceDeps, params: MapReduceParams): Promise<MapReduceResult> {
+    const { intent, largeContent, windowTokens, systemTokens, safetyMargin, cfg } = params;
+    const log = deps.log ?? (() => {});
+
+    const targetChars = chunkCharBudget(windowTokens, systemTokens, safetyMargin);
+    const sections = splitIntoSections(largeContent, {
+        targetChars,
+        maxChars: targetChars * 2, // NOTE(review): allows a chunk up to 2× the computed budget — confirm against splitIntoSections semantics
+    });
+    const chunks = sections.map((s) => s.text);
+    log('Map-reduce: split large input into chunks.', { chunkCount: chunks.length, targetChars });
+
+    // ── Map: each chunk → query-aware extraction ─────────────────────────
+    const extracted = await mapWithConcurrency(
+        chunks,
+        cfg.concurrency,
+        async (chunk, i) => {
+            if (deps.signal?.aborted) return '';
+            try {
+                const text = await deps.callLLM(
+                    buildExtractPrompt(intent, chunk, i + 1, chunks.length),
+                    EXTRACT_OUTPUT_TOKENS,
+                );
+                return text ?? '';
+            } catch (e: any) {
+                // One failed chunk must not block the rest — fall back to part of the raw text (better than nothing).
+                log('Map-reduce: chunk extraction failed — falling back to truncated raw.', { chunk: i + 1, error: e?.message ?? String(e) });
+                return chunk.slice(0, targetChars);
+            }
+        },
+        deps.signal,
+    );
+
+    const relevant: string[] = [];
+    extracted.forEach((text, i) => {
+        if (isIrrelevant(text)) return; // drops empty results and "not relevant" markers
+        relevant.push(cfg.showProvenance ? `[조각 ${i + 1}]\n${text.trim()}` : text.trim());
+    });
+
+    if (relevant.length === 0) {
+        log('Map-reduce: every chunk was irrelevant.', { chunkCount: chunks.length });
+        return { condensedContext: '', chunkCount: chunks.length, relevantCount: 0, reduceDepth: 0, allIrrelevant: true };
+    }
+
+    // ── Reduce: integrate hierarchically until the combined text fits the input budget ──
+    const budget = inputBudgetTokens(windowTokens, systemTokens, safetyMargin);
+    // Cap the context at 80% of the budget to leave room for the intent text and the header.
+    const contextCeiling = Math.floor(budget * 0.8);
+
+    let current = relevant;
+    let depth = 0;
+    while (depth < cfg.maxDepth) {
+        const joined = current.join('\n\n');
+        if (deps.estimateTokens(joined) <= contextCeiling) break;
+        // Pack items into groups and integrate each group → fewer items.
+        const groups = groupToFit(current, deps.estimateTokens, contextCeiling);
+        if (groups.length >= current.length) break; // cannot shrink further — handled by the truncation below
+        log('Map-reduce: hierarchical reduce round.', { depth: depth + 1, from: current.length, to: groups.length });
+        current = await mapWithConcurrency( // NOTE(review): on abort this may leave unset slots in `current`
+            groups,
+            cfg.concurrency,
+            async (group) => {
+                if (deps.signal?.aborted) return group.join('\n\n');
+                try {
+                    return await deps.callLLM(buildReducePrompt(intent, group.join('\n\n')), REDUCE_OUTPUT_TOKENS);
+                } catch {
+                    return group.join('\n\n'); // integration failed → keep the original group
+                }
+            },
+            deps.signal,
+        );
+        depth++;
+    }
+
+    let condensed = current.join('\n\n');
+    // Still over the ceiling after the loop → hard-truncate (prevents server overflow). NOTE(review): the author intends the caller to warn, but the result carries no truncation flag.
+    if (deps.estimateTokens(condensed) > contextCeiling) {
+        const charCeiling = contextCeiling * CHARS_PER_TOKEN;
+        condensed = condensed.slice(0, charCeiling) + '\n\n[…자료가 많아 일부 생략됨]';
+        log('Map-reduce: reduce hit max depth and was hard-truncated.', { maxDepth: cfg.maxDepth });
+    }
+
+    return {
+        condensedContext: condensed,
+        chunkCount: chunks.length,
+        relevantCount: relevant.length,
+        reduceDepth: depth,
+        allIrrelevant: false,
+    };
+}
+
+/** Accumulates items in order, closing a group just before it would exceed `ceiling` tokens; a single oversized item still gets a group of its own. */
+function groupToFit(items: string[], estimate: (s: string) => number, ceiling: number): string[][] {
+    const groups: string[][] = [];
+    let cur: string[] = [];
+    let curTokens = 0;
+    for (const item of items) {
+        const t = estimate(item);
+        if (cur.length > 0 && curTokens + t > ceiling) { // adding this item would overflow → flush the open group first
+            groups.push(cur);
+            cur = [];
+            curTokens = 0;
+        }
+        cur.push(item);
+        curTokens += t; // sums per-item estimates; the '\n\n' separators added when joining are not counted
+    }
+    if (cur.length > 0) groups.push(cur);
+    return groups;
+}
@@ -40,6 +40,17 @@ export interface IAgentConfig {
    autoCompactHistory: boolean;
    /** 작은 모델(≤4B) 감지 시 예산 계산에 쓸 유효 context window 상한. 0 = 비활성화. */
    smallModelContextCap: number;
+    // ─── 큰 입력 Map-Reduce (긴 회의록/리서치 덤프 청킹·통합) ───
+    /** 단일 사용자 입력이 창을 넘으면 청크→추출→통합으로 처리. 끄면 기존 단발 경로(잘릴 수 있음). */
+    largeInputMapReduce: boolean;
+    /** 단일 입력 토큰이 (유효 창 × 이 비율)을 넘으면 map-reduce 발동. 기본 0.6. */
+    mapReduceTriggerRatio: number;
+    /** 청크 추출 동시성. 로컬 단일 GPU 보호용으로 낮게. 기본 2. */
+    mapReduceConcurrency: number;
+    /** 추출 합본이 창을 넘을 때 계층적 통합 최대 깊이. 기본 3. */
+    mapReduceMaxDepth: number;
+    /** 최종 답변에 `[조각 k]` 출처 태그를 노출. 기본 false. */
+    mapReduceShowProvenance: boolean;
    // ─── 응답 복구 (Thought Quarantine / Auto-Continuation) ───
    /** 답변이 출력 토큰 한계에 걸리면 사용자 개입 없이 내부적으로 이어서 생성. */
    autoContinueOnOutputLimit: boolean;
@@ -500,6 +511,11 @@ export function getConfig(): IAgentConfig {
        })(),
        autoCompactHistory: cfg.get<boolean>('autoCompactHistory', true),
        smallModelContextCap: Math.max(0, cfg.get<number>('smallModelContextCap', 0)),
+        largeInputMapReduce: cfg.get<boolean>('largeInputMapReduce', true),
+        mapReduceTriggerRatio: Math.min(0.95, Math.max(0.3, cfg.get<number>('mapReduceTriggerRatio', 0.6))),
+        mapReduceConcurrency: Math.min(8, Math.max(1, cfg.get<number>('mapReduceConcurrency', 2))),
+        mapReduceMaxDepth: Math.min(6, Math.max(1, cfg.get<number>('mapReduceMaxDepth', 3))),
+        mapReduceShowProvenance: cfg.get<boolean>('mapReduceShowProvenance', false),
        autoContinueOnOutputLimit: cfg.get<boolean>('autoContinueOnOutputLimit', true),
        maxAutoContinuations: Math.max(0, Math.min(10, cfg.get<number>('maxAutoContinuations', 4))),
        finalOnlyRetryOnThoughtLeak: cfg.get<boolean>('finalOnlyRetryOnThoughtLeak', true),
@@ -39,6 +39,17 @@ export interface ILMStudioClient {
     * "Model is disposed!" or "lock() request could not be registered" error.
     */
    getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM>;
+    /**
+     * The model's *actually-loaded* context window in tokens (LM Studio's
+     * `llm.getContextLength()`), or `undefined` if it can't be determined.
+     *
+     * The user-facing `g1nation.contextLength` setting is only a budgeting
+     * intent — the real ceiling is whatever window the model was loaded with.
+     * Budgeting against the larger of the two silently overflows the server,
+     * which then truncates the prompt or emits EOS as the first token (empty
+     * answer). Cached per-key because it only changes on reload.
+     */
+    getModelContextLength(modelKey: string): Promise<number | undefined>;
    isReachable(): Promise<boolean>;
    setBaseUrl(httpBaseUrl: string): void;
 }
@@ -84,8 +95,10 @@ export class LMStudioClient implements ILMStudioClient {
    private _wsUrl: string | undefined;
    private _loadedCache: { value: string[]; expiresAt: number } | undefined;
    private _downloadedCache: { value: string[]; expiresAt: number } | undefined;
+    private _contextLengthCache = new Map<string, { value: number; expiresAt: number }>();
    private static readonly DEFAULT_LOADED_CACHE_TTL_MS = 5000;
    private static readonly DEFAULT_DOWNLOADED_CACHE_TTL_MS = 60_000;
+    private static readonly DEFAULT_CONTEXT_LENGTH_CACHE_TTL_MS = 60_000;

    constructor(httpBaseUrl: string) {
        this.setBaseUrl(httpBaseUrl);
@@ -98,6 +111,7 @@ export class LMStudioClient implements ILMStudioClient {
            this._sdk = undefined;
            this._loadedCache = undefined;
            this._downloadedCache = undefined;
+            this._contextLengthCache.clear();
        }
    }

@@ -170,6 +184,7 @@ export class LMStudioClient implements ILMStudioClient {
    invalidateCaches(): void {
        this._loadedCache = undefined;
        this._downloadedCache = undefined;
+        this._contextLengthCache.clear();
    }

    async listLoaded(): Promise<string[]> {
@@ -243,6 +258,36 @@ export class LMStudioClient implements ILMStudioClient {
        }
    }

+    async getModelContextLength(modelKey: string): Promise<number | undefined> { // Returns the loaded context window for `modelKey`, or undefined on a blank key, unusable value, or any error.
+        const key = (modelKey || '').trim();
+        if (!key) return undefined;
+        const now = Date.now();
+        const cached = this._contextLengthCache.get(key);
+        if (cached && cached.expiresAt > now) return cached.value; // serve from cache until the entry expires
+        try {
+            // Looks the model up through the SDK (presumably the same handle the stream
+            // will use — confirm against getModelHandle). If the model isn't loaded yet
+            // this forces a JIT load — acceptable since the very next step streams from
+            // it anyway. Best-effort: any failure (incl. the load-coalescing "Operation
+            // canceled" race) falls back to undefined so the caller keeps the configured window.
+            const handle: any = await this.getSdk().llm.model(key);
+            const len = typeof handle?.getContextLength === 'function'
+                ? await handle.getContextLength()
+                : undefined;
+            if (typeof len === 'number' && Number.isFinite(len) && len > 0) { // only positive finite values are cached
+                this._contextLengthCache.set(key, {
+                    value: len,
+                    expiresAt: now + LMStudioClient.DEFAULT_CONTEXT_LENGTH_CACHE_TTL_MS, // TTL counts from `now`, captured before the lookup
+                });
+                return len;
+            }
+            return undefined;
+        } catch (e: any) {
+            logError('Failed to query LM Studio model context length.', { modelKey: key, error: e?.message ?? String(e) });
+            return undefined;
+        }
+    }
+
    async isReachable(): Promise<boolean> {
        try {
            await this.getSdk().llm.listLoaded();
@@ -83,6 +83,12 @@ export interface IChatStreamer {
     * silently-disposed handle that needs a fresh WebSocket round-trip.
     */
    resetHandle?(modelName: string): Promise<void>;
+    /**
+     * The model's actually-loaded context window in tokens, or `undefined` if
+     * unavailable. Callers use this to budget against the real ceiling instead
+     * of the user's `contextLength` setting. Best-effort — never throws.
+     */
+    getModelContextLength?(modelName: string): Promise<number | undefined>;
 }

 /**
@@ -115,7 +121,28 @@ export class LMStudioStreamer implements IChatStreamer {
        // would duplicate tokens.
        for (let attempt = 1; attempt <= 2; attempt++) {
            const refresh = attempt > 1;
-            const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined);
+            // Handle acquisition is guarded on its own: it happens BEFORE the
+            // stream try/catch below, so without this an "Operation canceled"
+            // (the lifecycle manager's concurrent load for this same model was
+            // superseded/aborted and the SDK coalesced our JIT lookup into that
+            // dead load), a disposed handle, or a dropped WebSocket would crash
+            // the whole turn with no retry. Large inputs make this far more
+            // likely: loading a big model to hold a large prompt is slow, which
+            // widens the window for a concurrent switch/abort to land mid-load.
+            let model: Awaited<ReturnType<ILMStudioClient['getModelHandle']>>;
+            try {
+                model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined);
+            } catch (acqErr: any) {
+                // Genuine user cancel — don't retry, just stop quietly.
+                if (req.signal?.aborted || acqErr?.name === 'AbortError') return;
+                const acqMsg = String(acqErr?.message ?? acqErr);
+                if (this.isTransientHandleError(acqMsg) && attempt === 1) {
+                    logInfo('LM Studio model handle acquisition hit a transient error — retrying with a fresh SDK.', { model: trimmedModel, error: acqMsg });
+                    continue; // attempt 2 passes { refresh: true } → recreates the SDK client
+                }
+                logError('LM Studio model handle acquisition failed.', { model: trimmedModel, error: acqMsg, attempt });
+                throw acqErr;
+            }
            logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt });

            // Sampling defaults match the historical glitch-suppression preset for small /
@@ -216,17 +243,7 @@ export class LMStudioStreamer implements IChatStreamer {
            }

            const errMsg = String(caught?.message ?? caught);
-            // Broaden the "handle is bound to a dead WebSocket binding" detection. All of
-            // these resolve with the same fix (recreate the SDK client so the next
-            // llm.model() lookup mints a fresh handle).
-            const handleDead =
-                /\bdisposed\b/i.test(errMsg)
-                || /lock\(\) request could not be registered/i.test(errMsg)
-                || /channel\s+closed/i.test(errMsg)
-                || /WebSocket\s+(?:is\s+not\s+open|closed|disconnected)/i.test(errMsg)
-                || /Connection\s+(?:lost|reset|closed)/i.test(errMsg)
-                || /\bECONNRESET\b/i.test(errMsg)
-                || /socket\s+hang\s*up/i.test(errMsg);
+            const handleDead = this.isTransientHandleError(errMsg);

            if (handleDead && yielded === 0 && attempt === 1) {
                logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg });
@@ -238,6 +255,38 @@ export class LMStudioStreamer implements IChatStreamer {
        }
    }

+    /**
+     * True when `errMsg` looks like a dead SDK handle / WebSocket binding, or a
+     * coalesced model load that was canceled underneath us — cases the callers
+     * retry once with a refreshed handle. Matching is by case-insensitive regex on
+     * the message text only, so genuine user aborts must be filtered out by the
+     * caller first (e.g. via `req.signal.aborted` / `AbortError`).
+     */
+    private isTransientHandleError(errMsg: string): boolean {
+        return (
+            /\bdisposed\b/i.test(errMsg)
+            || /lock\(\) request could not be registered/i.test(errMsg)
+            || /channel\s+closed/i.test(errMsg)
+            || /WebSocket\s+(?:is\s+not\s+open|closed|disconnected)/i.test(errMsg)
+            || /Connection\s+(?:lost|reset|closed)/i.test(errMsg)
+            || /\bECONNRESET\b/i.test(errMsg)
+            || /socket\s+hang\s*up/i.test(errMsg)
+            // The lifecycle manager's load got superseded/aborted and the SDK
+            // coalesced our JIT model() lookup into that canceled load.
+            || /\boperation\s+cancell?ed\b/i.test(errMsg) // accepts both "canceled" and "cancelled"
+        );
+    }
+
+    async getModelContextLength(modelName: string): Promise<number | undefined> { // Delegates to the client; resolves to undefined for a blank name or if the client call throws.
+        const trimmed = (modelName || '').trim();
+        if (!trimmed) return undefined;
+        try {
+            return await this.client.getModelContextLength(trimmed);
+        } catch {
+            return undefined; // best-effort — caller falls back to the configured window
+        }
+    }
+
    async resetHandle(modelName: string): Promise<void> {
        const trimmed = (modelName || '').trim();
        if (!trimmed) return;