v2.2.256: 코어 채팅 큰 입력 청킹·통합 + 실제 컨텍스트 창 정렬 + 모델 핸들 race 수정

큰 입력 시 "Failed to acquire LM Studio model handle … Operation canceled" 로 턴 전체가 죽던 문제를 3계층으로 해결. 일반 채팅(코어 경로)은 그동안 단일 예산 호출이라 약한 모델·큰 입력에서 무너졌다 — 그 갭을 메움.

- 핸들 race 수정: getModelHandle 을 재시도 루프 안으로 이동. 취소/죽은-핸들 류 에러는 SDK 재생성 후 1회 자동 재시도(실제 사용자 취소는 존중). 원인은 라이프사이클의 동시 로드가 abort 되면서 SDK 가 coalesce 한 JIT 조회까지 함께 죽던 것.
- Phase 1 실제 창 정렬: llm.getContextLength()(캐시)로 실측 창에 예산 클램프. 설정값보다 작은 창으로 로드된 경우 서버 truncation/빈 답변 차단. 배지에 표시.
- Phase 2 코어 Map-Reduce: 단일 입력이 (유효 창 × ratio) 초과 시 청크→질의 인지형 추출→통합. 부분/전체 폴백, 무관 시 정직 신호. 동시성 기본 2.
- Phase 3 메타 노출: 진행/결과 배지 표시, [조각 k] 출처 옵트인.

신규 설정 5종. /meet·/review 전용 경로는 불변. 테스트 +25건, 전체 684 통과.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 18:05:44 +09:00
parent 6adbc2a6fa
commit 76d5fedfb5
13 changed files with 883 additions and 19 deletions
@@ -0,0 +1,58 @@
+/**
+ * Phase 1 — context-window alignment.
+ *
+ * The budgeter must clamp to the model's ACTUALLY-loaded window when it's
+ * smaller than the user's `contextLength` setting, so a model loaded with a
+ * smaller window than the setting never silently overflows the server.
+ */
+
+import { computeBudgetedRequest } from '../src/agent/handlePrompt/computeBudgetedRequest';
+import type { ChatMessage } from '../src/agent';
+
+const baseConfig = { // defaults shared by every run(); overrides.config is spread on top
+    contextLength: 32768, // the configured window the assertions compare against
+    maxOutputTokens: 4096,
+    contextSafetyMargin: 512,
+    smallModelContextCap: 0, // disabled
+    autoCompactHistory: false,
+};
+
+function run(overrides: { actualContextLength?: number; config?: Partial<typeof baseConfig> } = {}) { // test helper: one budgeter call
+    const reqMessages: ChatMessage[] = [{ role: 'user', content: 'hello' }]; // minimal single-message request
+    return computeBudgetedRequest({
+        fullSystemPrompt: 'You are a helpful assistant.',
+        reqMessages,
+        actualModel: 'some-13b-model',
+        config: { ...baseConfig, ...overrides.config }, // per-test overrides win over baseConfig
+        imageCount: 0,
+        actualContextLength: overrides.actualContextLength, // undefined when the test supplies no actual window
+    });
+}
+
+describe('computeBudgetedRequest — real-window alignment', () => {
+    test('clamps to the actual loaded window when it is smaller than the setting', () => {
+        const r = run({ actualContextLength: 8192 }); // loaded window (8192) < setting (32768)
+        expect(r.windowMismatch).toBe(true);
+        expect(r.effectiveContextLength).toBe(8192);
+        expect(r.ctxLimits.contextLength).toBe(8192);
+    });
+
+    test('keeps the configured window when the actual window is unknown', () => {
+        const r = run({ actualContextLength: undefined }); // no measured window available
+        expect(r.windowMismatch).toBe(false);
+        expect(r.effectiveContextLength).toBe(32768);
+        expect(r.ctxLimits.contextLength).toBe(32768);
+    });
+
+    test('does not raise the window when the actual window is larger than the setting', () => {
+        const r = run({ actualContextLength: 131072 }); // loaded window (131072) > setting (32768)
+        expect(r.windowMismatch).toBe(false);
+        expect(r.effectiveContextLength).toBe(32768); // the setting is the smaller value, so it stays the effective window
+    });
+
+    test('ignores a non-positive / non-finite actual window (falls back to setting)', () => {
+        expect(run({ actualContextLength: 0 }).effectiveContextLength).toBe(32768); // zero
+        expect(run({ actualContextLength: -5 }).effectiveContextLength).toBe(32768); // negative
+        expect(run({ actualContextLength: NaN }).effectiveContextLength).toBe(32768); // not a number
+    });
+});
@@ -0,0 +1,159 @@
+/**
+ * Phase 2 — large-input map-reduce core.
+ *
+ * Pure orchestration with an injected `callLLM`, so no network / SDK is touched.
+ */
+
+import {
+    runMapReduce,
+    shouldMapReduce,
+    chunkCharBudget,
+    inputBudgetTokens,
+    type MapReduceConfig,
+    type MapReduceDeps,
+} from '../src/agent/handlePrompt/largeInputMapReduce';
+import type { ChatMessage } from '../src/agent';
+
+const estimateTokens = (s: string) => Math.ceil((s || '').length / 4); // test estimator: 4 chars per token, rounded up; falsy input counts as 0
+
+const cfg: MapReduceConfig = { // baseline config; individual tests spread overrides on top
+    enabled: true,
+    triggerRatio: 0.6, // with a 10000-token window the shouldMapReduce test expects a 6000-token threshold
+    concurrency: 2, // upper bound asserted by the concurrency test
+    maxDepth: 3,
+    showProvenance: false, // the provenance test flips this to true
+};
+
+function isExtract(messages: ChatMessage[]): boolean { // true when the first message's content contains '추출기' ("extractor")
+    return /추출기/.test(messages[0]?.content ?? ''); // a missing message/content is treated as '' → false; presumably marks the extraction prompt — confirm against largeInputMapReduce
+}
+function chunkLabel(messages: ChatMessage[]): string { // reads k out of "자료 조각 k/N" in the second message's content
+    const m = (messages[1]?.content ?? '').match(/자료 조각 (\d+)\/(\d+)/);
+    return m ? m[1] : '?'; // '?' when the second message carries no chunk marker
+}
+
+// 12 short markdown sections joined by blank lines → forces multiple chunks under a small window.
+const bigContent = Array.from({ length: 12 }, (_, i) =>
+    `## 섹션 ${i + 1}\n안건 ${i + 1}: 결정사항과 수치 ${i * 10}. ` + '내용 '.repeat(40)
+).join('\n\n');
+
+describe('shouldMapReduce', () => {
+    test('triggers only above window * triggerRatio and when enabled', () => {
+        expect(shouldMapReduce(6200, 10000, cfg)).toBe(true);  // 6200 > 10000 * 0.6 = 6000
+        expect(shouldMapReduce(5000, 10000, cfg)).toBe(false); // 5000 < 6000
+        expect(shouldMapReduce(99999, 10000, { ...cfg, enabled: false })).toBe(false); // disabled wins even far above the threshold
+        expect(shouldMapReduce(100, 0, cfg)).toBe(false);      // window of 0 (unknown) never triggers
+    });
+});
+
+describe('budget helpers', () => {
+    test('inputBudgetTokens reserves output + safety', () => {
+        // 10000 - sys(500) - max(2048, 1000)=2048 - safety(512) = 6940
+        expect(inputBudgetTokens(10000, 500, 512)).toBe(6940);
+    });
+    test('chunkCharBudget is positive and scales with the window', () => {
+        const small = chunkCharBudget(4000, 200, 512); // first argument is 4000
+        const big = chunkCharBudget(16000, 200, 512); // same other arguments, first argument 4x larger
+        expect(small).toBeGreaterThan(0);
+        expect(big).toBeGreaterThan(small); // the larger first argument must yield the larger budget
+    });
+});
+
+describe('runMapReduce', () => {
+    function deps(callLLM: MapReduceDeps['callLLM']): MapReduceDeps {
+        return { callLLM, estimateTokens }; // inject the stubbed LLM plus the char-based estimator
+    }
+    const params = {
+        intent: '회의록을 안건별로 정리해줘',
+        largeContent: bigContent,
+        windowTokens: 4000, // small window so bigContent is expected to split into several chunks
+        systemTokens: 200,
+        safetyMargin: 512,
+        cfg,
+    };
+
+    test('extracts relevant facts per chunk and condenses them', async () => {
+        const seen: string[] = []; // chunk labels in the order the stub was called
+        const r = await runMapReduce(
+            deps(async (messages) => {
+                expect(isExtract(messages)).toBe(true);
+                const k = chunkLabel(messages);
+                seen.push(k);
+                return `추출-${k}`;
+            }),
+            params,
+        );
+        expect(r.allIrrelevant).toBe(false);
+        expect(r.chunkCount).toBeGreaterThan(1);
+        expect(r.relevantCount).toBe(r.chunkCount);
+        expect(r.condensedContext).toContain('추출-1');
+        // every chunk was visited
+        expect(seen.length).toBe(r.chunkCount);
+    });
+
+    test('all-irrelevant chunks → allIrrelevant with empty context', async () => {
+        const r = await runMapReduce(
+            deps(async () => '(관련 없음)'), // every call answers "(not relevant)"
+            params,
+        );
+        expect(r.allIrrelevant).toBe(true);
+        expect(r.relevantCount).toBe(0);
+        expect(r.condensedContext).toBe('');
+    });
+
+    test('respects concurrency limit', async () => {
+        let active = 0; // calls currently in flight
+        let peak = 0; // highest value active ever reached
+        await runMapReduce(
+            deps(async (messages) => {
+                active++;
+                peak = Math.max(peak, active);
+                await new Promise((res) => setTimeout(res, 5)); // stay in flight for 5 ms so overlap is observable
+                active--;
+                return `x-${chunkLabel(messages)}`;
+            }),
+            params,
+        );
+        expect(peak).toBeLessThanOrEqual(cfg.concurrency);
+    });
+
+    test('a failing chunk extraction falls back to truncated raw (not a crash)', async () => {
+        let call = 0; // counts extraction calls only
+        const r = await runMapReduce(
+            deps(async (messages) => {
+                if (isExtract(messages) && ++call === 1) throw new Error('boom'); // fail the first extraction call only
+                return `ok-${chunkLabel(messages)}`;
+            }),
+            params,
+        );
+        expect(r.allIrrelevant).toBe(false);
+        // The failed chunk still contributed (raw fallback), so relevantCount === chunkCount.
+        expect(r.relevantCount).toBe(r.chunkCount);
+    });
+
+    test('tags provenance when showProvenance is on', async () => {
+        const r = await runMapReduce(
+            deps(async (messages) => `발췌-${chunkLabel(messages)}`),
+            { ...params, cfg: { ...cfg, showProvenance: true } },
+        );
+        expect(r.condensedContext).toMatch(/\[조각 \d+\]/); // expects a "[조각 k]" tag in the condensed context
+    });
+
+    test('hierarchical reduce kicks in when extractions overflow the context ceiling', async () => {
+        // Tiny window so even a few extractions exceed the ceiling → reduce rounds run.
+        let reduceCalls = 0; // counts calls that are not extraction calls
+        const r = await runMapReduce(
+            deps(async (messages) => {
+                if (isExtract(messages)) {
+                    return '관련 사실 '.repeat(60); // big extraction per chunk
+                }
+                reduceCalls++;
+                return '통합본'; // reduce collapses to something small
+            }),
+            { ...params, windowTokens: 2200 },
+        );
+        expect(reduceCalls).toBeGreaterThan(0);
+        expect(r.reduceDepth).toBeGreaterThan(0);
+        expect(r.allIrrelevant).toBe(false);
+    });
+});
@@ -78,6 +78,10 @@ class FakeLMStudioClient implements ILMStudioClient {
        return true;
    }

+    async getModelContextLength(_modelKey: string): Promise<number | undefined> { // fake: ignores the model key
+        return undefined; // this fake never reports a context length
+    }
+
    async listLoadedCached(): Promise<string[]> {
        return [...this.loaded];
    }
@@ -69,6 +69,9 @@ class FakeModel {
 class FakeClient implements ILMStudioClient {
    public model: FakeModel;
    public getModelHandleCalls: string[] = [];
+    public getModelHandleOpts: Array<{ refresh?: boolean } | undefined> = []; // options argument of each getModelHandle call, in call order
+    /** Errors to throw on successive getModelHandle calls before returning the model. */
+    public handleAcqFailures: Error[] = [];

    constructor(model: FakeModel = new FakeModel()) {
        this.model = model;
@@ -83,10 +86,18 @@ class FakeClient implements ILMStudioClient {
    async listDownloadedCached(): Promise<string[]> { return []; }
    async isReachable(): Promise<boolean> { return true; }

-    async getModelHandle(modelKey: string): Promise<any> {
+    async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<any> { // records the call, then throws a queued error or returns the model
        this.getModelHandleCalls.push(modelKey);
+        this.getModelHandleOpts.push(options);
+        const failure = this.handleAcqFailures.shift(); // queued errors are consumed one per call, oldest first
+        if (failure) throw failure;
        return this.model;
    }
+
+    public contextLength: number | undefined = undefined; // value getModelContextLength returns; tests assign it directly
+    async getModelContextLength(_modelKey: string): Promise<number | undefined> { // fake: ignores the model key
+        return this.contextLength;
+    }
 }

 // The streamer emits a trailing { token: '', stopReason } event on normal completion;
@@ -209,6 +220,68 @@ describe('LMStudioStreamer', () => {
        expect(out).toEqual(['a']);
    });

+    test('transient "Operation canceled" on handle acquisition is retried with a fresh SDK', async () => {
+        // The lifecycle manager's concurrent load for this model got superseded;
+        // the SDK coalesced our JIT model() lookup into that aborted load. The
+        // first getModelHandle throws — the streamer must recreate the SDK
+        // (refresh) and retry rather than crashing the whole turn.
+        const client = new FakeClient(new FakeModel({ chunks: ['ok'] }));
+        client.handleAcqFailures = [new Error('Failed to acquire LM Studio model handle "m1": Operation canceled.')]; // one queued failure → the second call succeeds
+        const streamer = new LMStudioStreamer(client);
+        const tokens = await collect(streamer.stream({
+            modelName: 'm1',
+            messages: [{ role: 'user', content: 'hi' }],
+            temperature: 0.2,
+        }));
+        expect(tokens).toEqual(['ok']);
+        expect(client.getModelHandleCalls).toEqual(['m1', 'm1']); // exactly two acquisitions: the failure plus one retry
+        // First attempt: no refresh. Retry: refresh=true so the SDK is recreated.
+        expect(client.getModelHandleOpts[0]).toBeUndefined();
+        expect(client.getModelHandleOpts[1]).toEqual({ refresh: true });
+    });
+
+    test('non-transient handle acquisition error is thrown without retry', async () => {
+        const client = new FakeClient();
+        client.handleAcqFailures = [new Error('Failed to acquire LM Studio model handle "m1": model not found')]; // one queued failure that is not a cancellation
+        const streamer = new LMStudioStreamer(client);
+        await expect(collect(streamer.stream({
+            modelName: 'm1',
+            messages: [{ role: 'user', content: 'hi' }],
+            temperature: 0.2,
+        }))).rejects.toThrow(/model not found/); // the original error must reach the caller
+        expect(client.getModelHandleCalls).toEqual(['m1']); // no retry
+    });
+
+    test('handle acquisition failure is swallowed when the user already aborted', async () => {
+        const client = new FakeClient();
+        client.handleAcqFailures = [new Error('Operation canceled')];
+        const streamer = new LMStudioStreamer(client);
+        const ac = new AbortController();
+        ac.abort(); // signal is already aborted before stream() is called
+        const out = await collect(streamer.stream({
+            modelName: 'm1',
+            messages: [{ role: 'user', content: 'hi' }],
+            temperature: 0.2,
+            signal: ac.signal,
+        }));
+        expect(out).toEqual([]); // no tokens and no rejection
+        expect(client.getModelHandleCalls).toEqual(['m1']); // no retry — genuine cancel
+    });
+
+    test('getModelContextLength delegates to the client (and survives a throwing client)', async () => {
+        const client = new FakeClient();
+        client.contextLength = 8192; // value the fake client will report
+        const streamer = new LMStudioStreamer(client);
+        expect(await streamer.getModelContextLength('m1')).toBe(8192);
+        expect(await streamer.getModelContextLength('')).toBeUndefined(); // an empty model key yields undefined
+
+        // A throwing client must degrade to undefined, never reject.
+        const throwing = new FakeClient();
+        throwing.getModelContextLength = async () => { throw new Error('ws down'); }; // replace the method on this instance only
+        const s2 = new LMStudioStreamer(throwing);
+        expect(await s2.getModelContextLength('m1')).toBeUndefined();
+    });
+
    test('passes messages through to model.respond', async () => {
        const client = new FakeClient();
        const streamer = new LMStudioStreamer(client);