v2.2.256: 코어 채팅 큰 입력 청킹·통합 + 실제 컨텍스트 창 정렬 + 모델 핸들 race 수정

큰 입력 시 "Failed to acquire LM Studio model handle … Operation canceled" 로 턴 전체가 죽던 문제를 3계층으로 해결. 일반 채팅(코어 경로)은 그동안 단일 예산 호출이라 약한 모델·큰 입력에서 무너졌다 — 그 갭을 메움.

- 핸들 race 수정: getModelHandle 을 재시도 루프 안으로 이동. 취소/죽은-핸들 류 에러는 SDK 재생성 후 1회 자동 재시도(실제 사용자 취소는 존중). 원인은 라이프사이클의 동시 로드가 abort 되면서 SDK 가 coalesce 한 JIT 조회까지 함께 죽던 것.
- Phase 1 실제 창 정렬: llm.getContextLength()(캐시)로 실측 창에 예산 클램프. 설정값보다 작은 창으로 로드된 경우 서버 truncation/빈 답변 차단. 실측 창 정보를 배지에 표시.
- Phase 2 코어 Map-Reduce: 단일 입력이 (유효 창 × ratio) 초과 시 청크→질의 인지형 추출→통합. 부분/전체 폴백, 모든 조각이 무관하면 원문을 유지하고 그 사실을 그대로 신호. 동시성 기본 2.
- Phase 3 메타 노출: 진행/결과 배지 표시, [조각 k] 출처 옵트인. 신규 설정 5종.

/meet·/review 전용 경로는 불변. 테스트 +25건, 전체 684 통과.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 18:05:44 +09:00
parent 6adbc2a6fa
commit 76d5fedfb5
13 changed files with 883 additions and 19 deletions
@@ -140,6 +140,7 @@ export { _parseTaskAttrs, _parseSheetAttrs, _parseCalEventAttrs };
 // 8 method bodies extracted to dedicated modules. The same-named AgentExecutor
 // methods are now thin wrappers — each bundles a deps object and delegates to a free function.
 import { callNonStreaming as callNonStreamingFn } from './agent/llm/callNonStreaming';
+import { runMapReduce, shouldMapReduce } from './agent/handlePrompt/largeInputMapReduce';
 import { createStreamingRequest as createStreamingRequestFn } from './agent/llm/createStreamingRequest';
 import { streamChatOnce as streamChatOnceFn } from './agent/llm/streamChatOnce';
 import { maybeEmitDevilRebuttal as maybeEmitDevilRebuttalFn } from './agent/llm/devilRebuttal';
@@ -768,12 +769,103 @@ export class AgentExecutor {
            // Context budget computation → src/agent/handlePrompt/computeBudgetedRequest.ts
            const imageCount = (reqMessages as any[])
                .reduce((n, m) => n + (Array.isArray(m?.images) ? m.images.length : 0), 0);
+            // Budget against the model's REAL loaded window, not just the user's
+            // contextLength setting. Best-effort + cached; only for the LM Studio
+            // SDK path (REST/Ollama/cloud expose no such query → undefined → prior behavior).
+            let actualContextLength: number | undefined;
+            try {
+                const _isCloud = (() => {
+                    try {
+                        const { parseModelPrefix } = require('./features/providers') as typeof import('./features/providers');
+                        return !!parseModelPrefix(actualModel);
+                    } catch { return false; }
+                })();
+                if (!_isCloud
+                    && resolveEngine(ollamaUrl) === 'lmstudio'
+                    && this.options.lmStudioStreamer?.getModelContextLength) {
+                    actualContextLength = await this.options.lmStudioStreamer.getModelContextLength(actualModel);
+                }
+            } catch { /* best-effort — fall back to configured window */ }
+
+            // ── Large-input Map-Reduce ────────────────────────────────────────
+            // When a SINGLE user message is too big to fit the (real) window,
+            // history-trimming can't help — you can't drop the current question.
+            // Chunk it, extract only the request-relevant facts per chunk, and
+            // integrate, then let the normal streaming path answer from the
+            // condensed context. Only the user-visible turn; casual chat skipped.
+            if (loopDepth === 0 && !isCasualConversation && config.largeInputMapReduce) {
+                try {
+                    const effWindow = (typeof actualContextLength === 'number' && actualContextLength > 0)
+                        ? Math.min(config.contextLength, actualContextLength)
+                        : config.contextLength;
+                    const lastUserIdx = reqMessages.map((m) => m.role).lastIndexOf('user');
+                    const lastUser = lastUserIdx >= 0 ? reqMessages[lastUserIdx] : undefined;
+                    const content = typeof lastUser?.content === 'string' ? lastUser.content : '';
+                    const sysTokens = estimateTokens(fullSystemPrompt) + 4;
+                    const mrCfg = {
+                        enabled: true,
+                        triggerRatio: config.mapReduceTriggerRatio,
+                        concurrency: config.mapReduceConcurrency,
+                        maxDepth: config.mapReduceMaxDepth,
+                        showProvenance: config.mapReduceShowProvenance,
+                    };
+                    if (lastUser && shouldMapReduce(estimateTokens(content), effWindow, mrCfg)) {
+                        const intent = content.length > 1400
+                            ? `${content.slice(0, 800)}\n…\n${content.slice(-400)}`
+                            : content;
+                        const mrEngine = resolveEngine(ollamaUrl);
+                        this.webview?.postMessage({ type: 'mapReduceStatus', value: { phase: 'start' } });
+                        const mr = await runMapReduce(
+                            {
+                                callLLM: async (messages, maxTokens) => {
+                                    const r = await this.callNonStreaming({
+                                        baseUrl: ollamaUrl,
+                                        modelName: actualModel,
+                                        engine: mrEngine,
+                                        messages,
+                                        temperature: 0.1,
+                                        maxTokens,
+                                        contextLength: effWindow,
+                                        signal: this.abortController?.signal,
+                                    });
+                                    return r.text;
+                                },
+                                estimateTokens,
+                                log: (msg, meta) => logInfo(msg, meta),
+                                signal: this.abortController?.signal,
+                            },
+                            { intent, largeContent: content, windowTokens: effWindow, systemTokens: sysTokens, safetyMargin: config.contextSafetyMargin, cfg: mrCfg },
+                        );
+                        // allIrrelevant → keep original (budgeter truncates) rather than forcing an empty context.
+                        if (!mr.allIrrelevant && mr.condensedContext.trim()) {
+                            reqMessages[lastUserIdx] = {
+                                ...lastUser,
+                                content: `${intent}\n\n──────── 추출된 관련 자료 (원본 ${mr.chunkCount}조각 중 ${mr.relevantCount}조각, 통합 ${mr.reduceDepth}단계) ────────\n${mr.condensedContext}`,
+                            } as any;
+                            logInfo('Large input condensed via map-reduce.', {
+                                model: actualModel, chunkCount: mr.chunkCount, relevantCount: mr.relevantCount, reduceDepth: mr.reduceDepth,
+                            });
+                        }
+                        this.webview?.postMessage({
+                            type: 'mapReduceStatus',
+                            value: { phase: 'done', chunkCount: mr.chunkCount, relevantCount: mr.relevantCount, allIrrelevant: mr.allIrrelevant },
+                        });
+                    }
+                } catch (e: any) {
+                    // Any failure → fall through to the normal (single-shot) path. Worst case the
+                    // budgeter truncates the oversized input, which is the prior behavior.
+                    logError('Large-input map-reduce failed — falling back to single-shot path.', { error: e?.message ?? String(e) });
+                    this.webview?.postMessage({ type: 'mapReduceStatus', value: { phase: 'error' } });
+                }
+            }
+
            const _budget = computeBudgetedRequest({
                fullSystemPrompt,
                reqMessages,
                actualModel,
                config,
                imageCount,
+                actualContextLength,
            });
            const messagesForRequest = _budget.messagesForRequest;
            const ctxLimits = _budget.ctxLimits;
@@ -819,6 +911,8 @@ export class AgentExecutor {
                        paramB: modelParamB,
                        contextLength: ctxLimits.contextLength,
                        nominalContextLength: config.contextLength,
+                        actualContextLength,
+                        windowMismatch: _budget.windowMismatch,
                        cappedForSmallModel,
                        inputTokens,
                        maxOutputTokens,