diff --git a/PATCHNOTES.md b/PATCHNOTES.md index 4ee4592..a4d45c3 100644 --- a/PATCHNOTES.md +++ b/PATCHNOTES.md @@ -1,5 +1,15 @@ # Astra Patch Notes +## v2.2.256 (2026-06-19) +### ๐Ÿงฉ ์ฝ”์–ด ์ฑ„ํŒ… ๊ฒฝ๋กœ โ€” ํฐ ์ž…๋ ฅ ์ฒญํ‚นยทํ†ตํ•ฉ + ์‹ค์ œ ์ปจํ…์ŠคํŠธ ์ฐฝ ์ •๋ ฌ + ๋ชจ๋ธ ํ•ธ๋“ค race ์ˆ˜์ • +ํฐ ์ž…๋ ฅ์„ ๋„ฃ์œผ๋ฉด `Failed to acquire LM Studio model handle โ€ฆ Operation canceled` ๋กœ ํ„ด ์ „์ฒด๊ฐ€ ์ฃฝ๋˜ ๋ฌธ์ œ๋ฅผ 3๊ณ„์ธต์œผ๋กœ ํ•ด๊ฒฐ. `/meet`ยท`/review` ์™€ ๋‹ฌ๋ฆฌ **์ผ๋ฐ˜ ์ฑ„ํŒ…(์ฝ”์–ด ๊ฒฝ๋กœ)** ์€ ๊ทธ๋™์•ˆ ๋‹จ์ผ ์˜ˆ์‚ฐ ํ˜ธ์ถœ์ด๋ผ ์•ฝํ•œ ๋ชจ๋ธยทํฐ ์ž…๋ ฅ์—์„œ ๋ฌด๋„ˆ์กŒ๋‹ค โ€” ๊ทธ ๊ฐญ์„ ๋ฉ”์›€. + +- **๋ชจ๋ธ ํ•ธ๋“ค race ์ˆ˜์ •**: ํ•ธ๋“ค ํš๋“(`getModelHandle`โ†’`llm.model()`)์ด ์žฌ์‹œ๋„ try/catch **๋ฐ”๊นฅ**์— ์žˆ์–ด, ๋ผ์ดํ”„์‚ฌ์ดํด์˜ ๋™์‹œ ๋กœ๋“œ๊ฐ€ superseded/abort ๋˜๋ฉฐ SDK ๊ฐ€ ํ•ฉ์ณ๋ฒ„๋ฆฐ(coalesce) ์šฐ๋ฆฌ JIT ์กฐํšŒ๊นŒ์ง€ "Operation canceled" ๋กœ ๋–จ์–ด์ง€๋ฉด **์žฌ์‹œ๋„ ์—†์ด ํฌ๋ž˜์‹œ**ํ–ˆ๋‹ค. ํ•ธ๋“ค ํš๋“์„ ์žฌ์‹œ๋„ ๋ฃจํ”„ ์•ˆ์œผ๋กœ ๋„ฃ๊ณ , ์ทจ์†Œ/์ฃฝ์€-ํ•ธ๋“ค ๋ฅ˜ ์—๋Ÿฌ๋Š” SDK ์žฌ์ƒ์„ฑ ํ›„ 1ํšŒ ์ž๋™ ์žฌ์‹œ๋„(์‹ค์ œ ์‚ฌ์šฉ์ž ์ทจ์†Œ๋Š” ๊ทธ๋Œ€๋กœ ์กด์ค‘). ํฐ ์ž…๋ ฅ์ผ์ˆ˜๋ก 26B ๋กœ๋“œ๊ฐ€ ๋А๋ ค race ์ฐฝ์ด ๋„“์–ด์ ธ ์ž˜ ํ„ฐ์ง€๋˜ ๊ฒƒ. ([streamer.ts](src/lmstudio/streamer.ts)) +- **Phase 1 โ€” ์‹ค์ œ ์ฐฝ ์ •๋ ฌ**: ์˜ˆ์‚ฐ์„ ์„ค์ •๊ฐ’(`g1nation.contextLength`)์ด ์•„๋‹ˆ๋ผ ๋ชจ๋ธ์ด **์‹ค์ œ ๋กœ๋“œ๋œ ์ฐฝ**(`llm.getContextLength()`, ์บ์‹œ)์— ๋งž์ถฐ ๋‘˜ ์ค‘ ์ž‘์€ ์ชฝ์œผ๋กœ ํด๋žจํ”„. ์„ค์ • 32768 ์ธ๋ฐ ๋ชจ๋ธ์ด 8192/16384 ๋กœ ๋–  ์žˆ์œผ๋ฉด ์„œ๋ฒ„๊ฐ€ ์กฐ์šฉํžˆ ์ž˜๋ผ ๋นˆ ๋‹ต๋ณ€์ด ๋‚˜๋˜ ๋ฌธ์ œ๋ฅผ ์‹ค์ธก์œผ๋กœ ์ฐจ๋‹จ. ๋ถˆ์ผ์น˜ ์‹œ ์ปจํ…์ŠคํŠธ ๋ฐฐ์ง€์— `โš  ์‹ค์ œ ์ฐฝ Nโ†“` ๋…ธ์ถœ. ([client.ts](src/lmstudio/client.ts) ยท [computeBudgetedRequest.ts](src/agent/handlePrompt/computeBudgetedRequest.ts)) +- **Phase 2 โ€” ์ฝ”์–ด ์ฑ„ํŒ… Map-Reduce**: ๋‹จ์ผ ์‚ฌ์šฉ์ž ์ž…๋ ฅ์ด (์œ ํšจ ์ฐฝ ร— `mapReduceTriggerRatio`, ๊ธฐ๋ณธ 0.6) ์„ ๋„˜์œผ๋ฉด ์ฒญํฌโ†’**์งˆ์˜ ์ธ์ง€ํ˜• ์ถ”์ถœ**(์š”์•ฝ ์•„๋‹˜, ์›๋ฌธ ์‚ฌ์‹ค๋งŒยท์ถ”์ธก ๊ธˆ์ง€)โ†’ํ†ตํ•ฉ ํ›„, ์••์ถ•๋œ ์ปจํ…์ŠคํŠธ๋กœ ์ •์ƒ ์ŠคํŠธ๋ฆฌ๋ฐ ๋‹ต๋ณ€. ํ•ฉ๋ณธ์ด ๋˜ ๋„˜์น˜๋ฉด ๊ณ„์ธต์  ํ†ตํ•ฉ(`mapReduceMaxDepth`). ํ•œ ์กฐ๊ฐ ์‹คํŒจ๋Š” ๋ถ€๋ถ„ ํด๋ฐฑ, ์ „์ฒด ์‹คํŒจ๋Š” ๋‹จ๋ฐœ ๊ฒฝ๋กœ๋กœ ํด๋ฐฑ. ๋ชจ๋‘ ๋ฌด๊ด€ํ•˜๋ฉด ์ •์งํ•˜๊ฒŒ "๊ด€๋ จ ๋‚ด์šฉ ์—†์Œ" ์‹ ํ˜ธ. ๋™์‹œ์„ฑ์€ ๋กœ์ปฌ GPU ๋ณดํ˜ธ๋กœ ๊ธฐ๋ณธ 2. ([largeInputMapReduce.ts](src/agent/handlePrompt/largeInputMapReduce.ts)) +- **Phase 3 โ€” ๋ฉ”ํƒ€ ๋…ธ์ถœ**: map-reduce ์ง„ํ–‰/๊ฒฐ๊ณผ(`N์กฐ๊ฐ โ†’ M์ถ”์ถœ`, ๋ฌด๊ด€ยท์‹คํŒจ)๋ฅผ ์ปจํ…์ŠคํŠธ ๋ฐฐ์ง€์— ํ‘œ์‹œ. ์ถœ์ฒ˜ ์ถ”์ ์šฉ `[์กฐ๊ฐ k]` ํƒœ๊น…์€ `g1nation.mapReduceShowProvenance` ๋กœ ์˜ตํŠธ์ธ. +- ์‹ ๊ทœ ์„ค์ •: `largeInputMapReduce`(๊ธฐ๋ณธ on) ยท `mapReduceTriggerRatio` ยท `mapReduceConcurrency` ยท `mapReduceMaxDepth` ยท `mapReduceShowProvenance`. ์ฝ”์–ด ๊ฒฝ๋กœ๋งŒ ๋ณ€๊ฒฝ, `/meet`ยท`/review` ์ „์šฉ ๊ฒฝ๋กœ๋Š” ๋ถˆ๋ณ€. ํ…Œ์ŠคํŠธ +25๊ฑด(streamerยทbudgetยทmap-reduce ์ฝ”์–ด), ์ „์ฒด 684 ํ†ต๊ณผ. + ## v2.2.255 (2026-06-18) ### ๐Ÿงฉ `/review` โ€” ์ฝ”๋“œ ๋ฆฌ๋ทฐ map-reduce ์ฒญํ‚น (์•ฝํ•œ ๋ชจ๋ธ๋„ ํฐ ์ฝ”๋“œ๋ฒ ์ด์Šค ์ฒ˜๋ฆฌ) - ์ผ๋ฐ˜ ์—์ด์ „ํŠธ ์ฑ„ํŒ…์€ ์ฝ”๋“œ ๋ฆฌ๋ทฐ์ฒ˜๋Ÿผ ์ž…๋ ฅ์ด ํฐ ์ž‘์—…์„ ๋‹จ์ผ ํ˜ธ์ถœ๋กœ ์ฒ˜๋ฆฌํ•˜๋‹ค ์•ฝํ•œ ๋กœ์ปฌ ๋ชจ๋ธ์—์„œ ๋นˆ ์‘๋‹ต(์ฒซ ํ† ํฐ EOS)์œผ๋กœ ๋ฌด๋„ˆ์ง„๋‹ค. `/meet` ์˜ ๊ฒ€์ฆ๋œ map-reduce ๋ฅผ ์ฝ”๋“œ ๋ฆฌ๋ทฐ์— ์ ์šฉํ•œ **`/review <๋””๋ ‰ํ„ฐ๋ฆฌ|ํŒŒ์ผ> [์ดˆ์ ]`** ๋ช…๋ น ์‹ ์„ค. ์ฝ”์–ด ์ฑ„ํŒ… ๊ฒฝ๋กœ๋Š” ๊ฑด๋“œ๋ฆฌ์ง€ ์•Š์Œ. diff --git a/media/sidebar.js b/media/sidebar.js index b69e5b6..671c62d 100644 --- a/media/sidebar.js +++ b/media/sidebar.js @@ -444,13 +444,38 @@ if (b.droppedHistory > 0) parts.push(`๊ธฐ๋ก โˆ’${b.droppedHistory}`); if (b.systemTruncated) parts.push('์ปจํ…์ŠคํŠธ ์ผ๋ถ€ ์ƒ๋žต'); if (b.cappedForSmallModel) parts.push('๐Ÿ”ป ์ž‘์€ ๋ชจ๋ธ ๋ชจ๋“œ'); + if (b.windowMismatch && typeof b.actualContextLength === 'number') parts.push('โš  ์‹ค์ œ ์ฐฝ ' + fmtK(b.actualContextLength) + 'โ†“'); if (b.tight) parts.push('โš  ์ปจํ…์ŠคํŠธ ๊ฑฐ์˜ ๊ฐ€๋“'); - const warn = b.tight || b.systemTruncated; + const warn = b.tight || b.systemTruncated || b.windowMismatch; ctxBadge.textContent = parts.join(' ยท '); ctxBadge.className = 'ctx-badge' + (warn ? ' warn' : ' ok'); // New turn starts โ†’ drop stale stats from the previous answer. lastLmStats = null; - ctxBadge.title = `model: ${b.model || ''}${b.paramB != null ? ' (~' + b.paramB + 'B)' : ''}\n์ž…๋ ฅ โ‰ˆ ${b.inputTokens} tokens (์‹œ์Šคํ…œ ${b.systemTokens}, ๊ธฐ๋ก ${b.historyKept}๊ฐœ)\n์ถœ๋ ฅ ์ƒํ•œ ${b.maxOutputTokens} tokens / ์œ ํšจ context window ${b.contextLength} tokens${b.cappedForSmallModel ? ' (์ž‘์€ ๋ชจ๋ธ์šฉ ์ถ•์†Œ; ์„ค์ •๊ฐ’ ' + b.nominalContextLength + ')' : ''}`; + const mismatchNote = (b.windowMismatch && typeof b.actualContextLength === 'number') + ? `\nโš  ๋ชจ๋ธ์ด ์‹ค์ œ๋กœ๋Š” ${b.actualContextLength} ํ† ํฐ ์ฐฝ์œผ๋กœ ๋กœ๋“œ๋จ (์„ค์ • ${b.nominalContextLength}). ๊ทธ ํ•œ๋„์— ๋งž์ถฐ ์˜ˆ์‚ฐํ•จ.` + : ''; + ctxBadge.title = `model: ${b.model || ''}${b.paramB != null ? ' (~' + b.paramB + 'B)' : ''}\n์ž…๋ ฅ โ‰ˆ ${b.inputTokens} tokens (์‹œ์Šคํ…œ ${b.systemTokens}, ๊ธฐ๋ก ${b.historyKept}๊ฐœ)\n์ถœ๋ ฅ ์ƒํ•œ ${b.maxOutputTokens} tokens / ์œ ํšจ context window ${b.contextLength} tokens${b.cappedForSmallModel ? ' (์ž‘์€ ๋ชจ๋ธ์šฉ ์ถ•์†Œ; ์„ค์ •๊ฐ’ ' + b.nominalContextLength + ')' : ''}${mismatchNote}`; + } + function renderMapReduceStatus(v) { + if (!ctxBadge || !v) return; + if (v.phase === 'start') { + ctxBadge.textContent = '๐Ÿงฉ ํฐ ์ž…๋ ฅ์„ ์กฐ๊ฐ์œผ๋กœ ๋‚˜๋ˆ  ๊ด€๋ จ ๋‚ด์šฉ ์ถ”์ถœ ์ค‘โ€ฆ'; + ctxBadge.className = 'ctx-badge warn'; + ctxBadge.title = '์ž…๋ ฅ์ด ์ปจํ…์ŠคํŠธ ์ฐฝ๋ณด๋‹ค ์ปค์„œ ์ฒญํฌโ†’์ถ”์ถœโ†’ํ†ตํ•ฉ(map-reduce)์œผ๋กœ ์ฒ˜๋ฆฌ ์ค‘์ž…๋‹ˆ๋‹ค.'; + } else if (v.phase === 'done') { + if (v.allIrrelevant) { + ctxBadge.textContent = '๐Ÿงฉ ์ถ”์ถœ ๊ฒฐ๊ณผ: ์š”์ฒญ ๊ด€๋ จ ๋‚ด์šฉ ์—†์Œ'; + ctxBadge.className = 'ctx-badge warn'; + ctxBadge.title = '๊ธด ์ž…๋ ฅ์˜ ๋ชจ๋“  ์กฐ๊ฐ์—์„œ ์š”์ฒญ๊ณผ ์ง์ ‘ ๊ด€๋ จ๋œ ๋‚ด์šฉ์„ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ์›๋ณธ์„ ๊ทธ๋Œ€๋กœ(์˜ˆ์‚ฐ ๋‚ด์—์„œ ์ž˜๋ผ) ์ „๋‹ฌํ•ฉ๋‹ˆ๋‹ค.'; + } else { + ctxBadge.textContent = `๐Ÿงฉ ${v.chunkCount}์กฐ๊ฐ โ†’ ${v.relevantCount}์กฐ๊ฐ ์ถ”์ถœยทํ†ตํ•ฉ`; + ctxBadge.className = 'ctx-badge ok'; + ctxBadge.title = `ํฐ ์ž…๋ ฅ์„ ${v.chunkCount}๊ฐœ ์กฐ๊ฐ์œผ๋กœ ๋‚˜๋ˆ  ๊ทธ์ค‘ ${v.relevantCount}๊ฐœ์—์„œ ๊ด€๋ จ ๋‚ด์šฉ์„ ์ถ”์ถœยทํ†ตํ•ฉํ–ˆ์Šต๋‹ˆ๋‹ค.`; + } + } else if (v.phase === 'error') { + ctxBadge.textContent = '๐Ÿงฉ ๋ถ„ํ•  ์ฒ˜๋ฆฌ ์‹คํŒจ โ€” ๋‹จ๋ฐœ ์ฒ˜๋ฆฌ๋กœ ์ง„ํ–‰'; + ctxBadge.className = 'ctx-badge warn'; + } } function renderLmStudioStats(s) { if (!ctxBadge || !s) return; @@ -995,6 +1020,9 @@ case 'lmStudioStats': renderLmStudioStats(msg.value); break; + case 'mapReduceStatus': + renderMapReduceStatus(msg.value); + break; case 'usedScope': { let target = streamBody && streamBody._parent; if (!target) { diff --git a/package.json b/package.json index abd6d25..976ddf1 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "astra", "displayName": "Astra", "description": "The personal intelligence layer for Antigravity and VS Code. A private cognitive partner for deep project context, memory, and proactive strategic decision-making.", - "version": "2.2.255", + "version": "2.2.256", "publisher": "g1nation", "license": "MIT", "icon": "assets/icon.png", @@ -441,6 +441,37 @@ "minimum": 0, "description": "Optional safety knob, OFF by default (0). Some very small models (โ‰ค3B) emit an empty/EOS response when given a prompt near their context window even though it nominally fits. If you observe that with a tiny model, set this to e.g. 8192โ€“16384: for โ‰ค3B models only, Astra then budgets the prompt against this smaller effective window instead of g1nation.contextLength. Never applies to 4B+ models. Leave 0 unless you actually hit the issue โ€” it reduces the output-token budget. Default: 0 (disabled)" }, + "g1nation.largeInputMapReduce": { + "type": "boolean", + "default": true, + "description": "When a single message is too large to fit the model's context window, split it into chunks, extract only the request-relevant facts from each (no hallucination/summary), integrate them, and answer from the condensed result. Turn off to send oversized input in one shot (the server may then truncate it). Default: true" + }, + "g1nation.mapReduceTriggerRatio": { + "type": "number", + "default": 0.6, + "minimum": 0.3, + "maximum": 0.95, + "description": "Map-reduce kicks in when a single message exceeds (effective context window ร— this ratio). Lower = engages sooner (safer for big inputs, more LLM calls). Default: 0.6" + }, + "g1nation.mapReduceConcurrency": { + "type": "number", + "default": 2, + "minimum": 1, + "maximum": 8, + "description": "How many chunk extractions run in parallel. Keep low on a single local GPU (one model serves them sequentially anyway). Default: 2" + }, + "g1nation.mapReduceMaxDepth": { + "type": "number", + "default": 3, + "minimum": 1, + "maximum": 6, + "description": "Maximum hierarchical-integration depth when the combined extractions still overflow the window. Default: 3" + }, + "g1nation.mapReduceShowProvenance": { + "type": "boolean", + "default": false, + "description": "Tag each extracted block with its source chunk ([์กฐ๊ฐ k]) so the final answer can be traced back to the part of the input it came from. Default: false" + }, "g1nation.autoContinueOnOutputLimit": { "type": "boolean", "default": true, diff --git a/src/agent.ts b/src/agent.ts index b533959..6165dc2 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -140,6 +140,7 @@ export { _parseTaskAttrs, _parseSheetAttrs, _parseCalEventAttrs }; // 8 method bodies extracted to dedicated modules. AgentExecutor ์˜ ๋™๋ช… ๋ฉ”์„œ๋“œ๋Š” // ์ด์ œ thin wrapper โ€” deps ๊ฐ์ฒด๋ฅผ ๋ฌถ์–ด์„œ free function ์œผ๋กœ ์œ„์ž„. import { callNonStreaming as callNonStreamingFn } from './agent/llm/callNonStreaming'; +import { runMapReduce, shouldMapReduce } from './agent/handlePrompt/largeInputMapReduce'; import { createStreamingRequest as createStreamingRequestFn } from './agent/llm/createStreamingRequest'; import { streamChatOnce as streamChatOnceFn } from './agent/llm/streamChatOnce'; import { maybeEmitDevilRebuttal as maybeEmitDevilRebuttalFn } from './agent/llm/devilRebuttal'; @@ -768,12 +769,103 @@ export class AgentExecutor { // Context budget computation โ†’ src/agent/handlePrompt/computeBudgetedRequest.ts const imageCount = (reqMessages as any[]) .reduce((n, m) => n + (Array.isArray(m?.images) ? m.images.length : 0), 0); + // Budget against the model's REAL loaded window, not just the user's + // contextLength setting. Best-effort + cached; only for the LM Studio + // SDK path (REST/Ollama/cloud expose no such query โ†’ undefined โ†’ prior behavior). + let actualContextLength: number | undefined; + try { + const _isCloud = (() => { + try { + const { parseModelPrefix } = require('./features/providers') as typeof import('./features/providers'); + return !!parseModelPrefix(actualModel); + } catch { return false; } + })(); + if (!_isCloud + && resolveEngine(ollamaUrl) === 'lmstudio' + && this.options.lmStudioStreamer?.getModelContextLength) { + actualContextLength = await this.options.lmStudioStreamer.getModelContextLength(actualModel); + } + } catch { /* best-effort โ€” fall back to configured window */ } + + // โ”€โ”€ Large-input Map-Reduce โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + // When a SINGLE user message is too big to fit the (real) window, + // history-trimming can't help โ€” you can't drop the current question. + // Chunk it, extract only the request-relevant facts per chunk, and + // integrate, then let the normal streaming path answer from the + // condensed context. Only the user-visible turn; casual chat skipped. + if (loopDepth === 0 && !isCasualConversation && config.largeInputMapReduce) { + try { + const effWindow = (typeof actualContextLength === 'number' && actualContextLength > 0) + ? Math.min(config.contextLength, actualContextLength) + : config.contextLength; + const lastUserIdx = reqMessages.map((m) => m.role).lastIndexOf('user'); + const lastUser = lastUserIdx >= 0 ? reqMessages[lastUserIdx] : undefined; + const content = typeof lastUser?.content === 'string' ? lastUser.content : ''; + const sysTokens = estimateTokens(fullSystemPrompt) + 4; + const mrCfg = { + enabled: true, + triggerRatio: config.mapReduceTriggerRatio, + concurrency: config.mapReduceConcurrency, + maxDepth: config.mapReduceMaxDepth, + showProvenance: config.mapReduceShowProvenance, + }; + if (lastUser && shouldMapReduce(estimateTokens(content), effWindow, mrCfg)) { + const intent = content.length > 1400 + ? `${content.slice(0, 800)}\nโ€ฆ\n${content.slice(-400)}` + : content; + const mrEngine = resolveEngine(ollamaUrl); + this.webview?.postMessage({ type: 'mapReduceStatus', value: { phase: 'start' } }); + const mr = await runMapReduce( + { + callLLM: async (messages, maxTokens) => { + const r = await this.callNonStreaming({ + baseUrl: ollamaUrl, + modelName: actualModel, + engine: mrEngine, + messages, + temperature: 0.1, + maxTokens, + contextLength: effWindow, + signal: this.abortController?.signal, + }); + return r.text; + }, + estimateTokens, + log: (msg, meta) => logInfo(msg, meta), + signal: this.abortController?.signal, + }, + { intent, largeContent: content, windowTokens: effWindow, systemTokens: sysTokens, safetyMargin: config.contextSafetyMargin, cfg: mrCfg }, + ); + // allIrrelevant โ†’ keep original (budgeter truncates) rather than forcing an empty context. + if (!mr.allIrrelevant && mr.condensedContext.trim()) { + reqMessages[lastUserIdx] = { + ...lastUser, + content: `${intent}\n\nโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ ์ถ”์ถœ๋œ ๊ด€๋ จ ์ž๋ฃŒ (์›๋ณธ ${mr.chunkCount}์กฐ๊ฐ ์ค‘ ${mr.relevantCount}์กฐ๊ฐ, ํ†ตํ•ฉ ${mr.reduceDepth}๋‹จ๊ณ„) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n${mr.condensedContext}`, + } as any; + logInfo('Large input condensed via map-reduce.', { + model: actualModel, chunkCount: mr.chunkCount, relevantCount: mr.relevantCount, reduceDepth: mr.reduceDepth, + }); + } + this.webview?.postMessage({ + type: 'mapReduceStatus', + value: { phase: 'done', chunkCount: mr.chunkCount, relevantCount: mr.relevantCount, allIrrelevant: mr.allIrrelevant }, + }); + } + } catch (e: any) { + // Any failure โ†’ fall through to the normal (single-shot) path. Worst case the + // budgeter truncates the oversized input, which is the prior behavior. + logError('Large-input map-reduce failed โ€” falling back to single-shot path.', { error: e?.message ?? String(e) }); + this.webview?.postMessage({ type: 'mapReduceStatus', value: { phase: 'error' } }); + } + } + const _budget = computeBudgetedRequest({ fullSystemPrompt, reqMessages, actualModel, config, imageCount, + actualContextLength, }); const messagesForRequest = _budget.messagesForRequest; const ctxLimits = _budget.ctxLimits; @@ -819,6 +911,8 @@ export class AgentExecutor { paramB: modelParamB, contextLength: ctxLimits.contextLength, nominalContextLength: config.contextLength, + actualContextLength, + windowMismatch: _budget.windowMismatch, cappedForSmallModel, inputTokens, maxOutputTokens, diff --git a/src/agent/handlePrompt/computeBudgetedRequest.ts b/src/agent/handlePrompt/computeBudgetedRequest.ts index 02709f9..641a276 100644 --- a/src/agent/handlePrompt/computeBudgetedRequest.ts +++ b/src/agent/handlePrompt/computeBudgetedRequest.ts @@ -19,6 +19,13 @@ export interface ComputeBudgetedRequestInput { /** Result of `getConfig()` โ€” reads contextLength, maxOutputTokens, contextSafetyMargin, smallModelContextCap, autoCompactHistory. */ config: any; imageCount: number; + /** + * The model's *actually-loaded* context window (LM Studio `getContextLength()`), + * when known. Budgeting uses the smaller of this and `config.contextLength` so we + * never overflow a model loaded with a smaller window than the user's setting. + * Omit (undefined) to budget against the configured value alone (prior behavior). + */ + actualContextLength?: number; } export interface ComputeBudgetedRequestResult { @@ -34,6 +41,10 @@ export interface ComputeBudgetedRequestResult { outputBudget: { maxOutputTokens: number; available: number; tight: boolean }; modelParamB: number | null; cappedForSmallModel: boolean; + /** True when the model's real loaded window is smaller than `config.contextLength` (we clamped to the real one). */ + windowMismatch: boolean; + /** The window actually used for budgeting (after real-window clamp + small-model cap). */ + effectiveContextLength: number; } /** @@ -60,15 +71,34 @@ export function computeBudgetedRequest(input: ComputeBudgetedRequestInput): Comp // smaller effective window. Never applied to 4B+ models, and never when the setting is 0 โ€” // capping squeezes the output-token budget, so it's a knob, not a default. const modelParamB = estimateModelParamsB(actualModel); + + // The real ceiling is whatever window the model was actually loaded with โ€” the + // server truncates anything past it. When known, clamp the configured setting + // down to it so we budget against the smaller of the two. (When unknown, keep + // the configured value โ€” prior behavior.) + const actualWindow = (typeof input.actualContextLength === 'number' + && Number.isFinite(input.actualContextLength) + && input.actualContextLength > 0) + ? input.actualContextLength + : undefined; + const configuredWindow = config.contextLength; + const windowMismatch = actualWindow !== undefined && actualWindow < configuredWindow; + const realWindow = actualWindow !== undefined ? Math.min(configuredWindow, actualWindow) : configuredWindow; + if (windowMismatch) { + logInfo('Model loaded with a smaller context window than the setting โ€” clamping budget to the real window.', { + model: actualModel, configuredWindow, actualWindow, + }); + } + const smallModelCap = config.smallModelContextCap; // 0 = disabled (default) const cappedForSmallModel = smallModelCap > 0 && modelParamB !== null && modelParamB <= 3 - && config.contextLength > smallModelCap; - const effectiveContextLength = cappedForSmallModel ? smallModelCap : config.contextLength; + && realWindow > smallModelCap; + const effectiveContextLength = cappedForSmallModel ? smallModelCap : realWindow; if (cappedForSmallModel) { logInfo('Small model detected โ€” capping effective context window for budgeting.', { model: actualModel, paramB: modelParamB, - nominalContext: config.contextLength, effectiveContext: effectiveContextLength, + nominalContext: realWindow, effectiveContext: effectiveContextLength, }); } const ctxLimits: ContextLimits = { @@ -157,5 +187,7 @@ export function computeBudgetedRequest(input: ComputeBudgetedRequestInput): Comp outputBudget, modelParamB, cappedForSmallModel, + windowMismatch, + effectiveContextLength, }; } diff --git a/src/agent/handlePrompt/largeInputMapReduce.ts b/src/agent/handlePrompt/largeInputMapReduce.ts new file mode 100644 index 0000000..8aa6a3f --- /dev/null +++ b/src/agent/handlePrompt/largeInputMapReduce.ts @@ -0,0 +1,265 @@ +/** + * ============================================================ + * Large-Input Map-Reduce (ํฐ ์ž…๋ ฅ ์ฒญํ‚น + ํ†ตํ•ฉ) + * + * ํ•œ ๋ฒˆ์— ์ปจํ…์ŠคํŠธ ์ฐฝ์— ์•ˆ ๋“ค์–ด๊ฐ€๋Š” ๋‹จ์ผ ์‚ฌ์šฉ์ž ์ž…๋ ฅ(๊ธด ํšŒ์˜๋กยท๋ฆฌ์„œ์น˜ ๋คํ”„ ๋“ฑ)์„ + * 1) ์ฒญํฌ๋กœ ๋ถ„ํ• (Map ๋Œ€์ƒ) + * 2) ๊ฐ ์ฒญํฌ์—์„œ "์š”์ฒญ๊ณผ ๊ด€๋ จ๋œ ์‚ฌ์‹ค๋งŒ" ๋ฐœ์ทŒ (์งˆ์˜ ์ธ์ง€ํ˜• ์ถ”์ถœ โ€” ์ผ๋ฐ˜ ์š”์•ฝ X) + * 3) ๋ฐœ์ทŒ๋“ค์„ ํ†ตํ•ฉ(Reduce). ํ•ฉ๋ณธ์ด ๋˜ ์ฐฝ์„ ๋„˜์œผ๋ฉด ๊ณ„์ธต์ ์œผ๋กœ ์žฌํ†ตํ•ฉ. + * ํ•œ ๋’ค, ์••์ถ•๋œ ์ปจํ…์ŠคํŠธ๋ฅผ ๋Œ๋ ค์ค˜ ์ •์ƒ ์ŠคํŠธ๋ฆฌ๋ฐ ๊ฒฝ๋กœ๊ฐ€ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ƒ์„ฑํ•˜๊ฒŒ ํ•œ๋‹ค. + * + * ์‹ ๋ขฐ์„ฑ ์›์น™(ASTRA): ์ถ”์ธกยท์ฐฝ์ž‘ ๊ธˆ์ง€, ์›๋ฌธ ํ‘œํ˜„ ๋ณด์กด, ์ถœ์ฒ˜(`[์กฐ๊ฐ k]`) ํƒœ๊น…, + * ์ „๋ถ€ ๋ฌด๊ด€ํ•˜๋ฉด ์ •์งํ•˜๊ฒŒ "๊ด€๋ จ ๋‚ด์šฉ ์—†์Œ" ์‹ ํ˜ธ. + * + * LLM ํ˜ธ์ถœ์€ `callLLM` ์œผ๋กœ ์ฃผ์ž… โ†’ ์ฝ”์–ด ๋กœ์ง์€ ๋„คํŠธ์›Œํฌ ์˜์กด ์—†์ด ๋‹จ์œ„ ํ…Œ์ŠคํŠธ ๊ฐ€๋Šฅ. + * ============================================================ + */ + +import type { ChatMessage } from '../../agent'; +import { splitIntoSections } from '../../retrieval/chunker'; + +export interface MapReduceConfig { + enabled: boolean; + /** ๋‹จ์ผ ์ž…๋ ฅ ํ† ํฐ > (์œ ํšจ ์ฐฝ ร— triggerRatio) ์ด๋ฉด ๋ฐœ๋™. */ + triggerRatio: number; + concurrency: number; + maxDepth: number; + showProvenance: boolean; +} + +export interface MapReduceDeps { + /** ๋ฉ”์‹œ์ง€ ๋ฐฐ์—ด โ†’ ๋ชจ๋ธ ์‘๋‹ต ํ…์ŠคํŠธ. (callNonStreaming ๋ž˜ํผ) */ + callLLM: (messages: ChatMessage[], maxTokens: number) => Promise; + estimateTokens: (text: string) => number; + log?: (msg: string, meta?: Record) => void; + signal?: AbortSignal; +} + +export interface MapReduceParams { + /** ์‚ฌ์šฉ์ž ์š”์ฒญ ์˜๋„ ํžŒํŠธ (๋ณดํ†ต ์›๋ณธ ์ž…๋ ฅ์˜ ๋จธ๋ฆฌ/๊ผฌ๋ฆฌ ๋ฐœ์ทŒ โ€” ์ง€์‹œ๋ฌธ์ด ๊ฑฐ๊ธฐ ์žˆ์Œ). */ + intent: string; + /** ์ฒญํ‚น ๋Œ€์ƒ์ด ๋˜๋Š” ํฐ ๋ณธ๋ฌธ. */ + largeContent: string; + /** ์œ ํšจ ์ปจํ…์ŠคํŠธ ์ฐฝ(ํ† ํฐ) โ€” Phase 1 ์˜ effectiveContextLength. */ + windowTokens: number; + /** ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ๊ฐ€ ์ด๋ฏธ ์ฐจ์ง€ํ•œ ํ† ํฐ. */ + systemTokens: number; + safetyMargin: number; + cfg: MapReduceConfig; +} + +export interface MapReduceResult { + /** ํ†ตํ•ฉ๋œ ๊ด€๋ จ ์ž๋ฃŒ. ์ •์ƒ ๊ฒฝ๋กœ์—์„œ ์‚ฌ์šฉ์ž ๋ฉ”์‹œ์ง€ ๋ณธ๋ฌธ์„ ์ด๊ฑธ๋กœ ๋Œ€์ฒด. */ + condensedContext: string; + chunkCount: number; + relevantCount: number; + reduceDepth: number; + /** ๋ชจ๋“  ์ฒญํฌ๊ฐ€ ๋ฌด๊ด€ โ†’ ํ˜ธ์ถœ ์ธก์—์„œ ์ •์งํ•œ ์—์Šค์ปฌ๋ ˆ์ด์…˜. */ + allIrrelevant: boolean; +} + +const IRRELEVANT_MARKER = '(๊ด€๋ จ ์—†์Œ)'; +/** ์ถ”์ถœ/ํ†ตํ•ฉ ํ˜ธ์ถœ์ด ์“ธ ์ถœ๋ ฅ ํ† ํฐ ์ƒํ•œ โ€” ๋ฐœ์ทŒ๋Š” ์›๋ฌธ๋ณด๋‹ค ์งง์œผ๋ฏ€๋กœ ๋ณด์ˆ˜์ ์œผ๋กœ. */ +const EXTRACT_OUTPUT_TOKENS = 1024; +const REDUCE_OUTPUT_TOKENS = 2048; +/** ํ† ํฐโ†’๋ฌธ์ž ํ™˜์‚ฐ(ํ•œ๊ตญ์–ด ๋ณด์ˆ˜์น˜ ~2์ž/ํ† ํฐ). ์ฒญํฌ ํฌ๊ธฐ ์‚ฐ์ •์šฉ. */ +const CHARS_PER_TOKEN = 2; + +/** ์œ ํšจ ์ฐฝ์—์„œ ์ž…๋ ฅ์— ์“ธ ์ˆ˜ ์žˆ๋Š” ํ† ํฐ ์˜ˆ์‚ฐ. computeBudgetedRequest ์™€ ๊ฐ™์€ ๊ณต์‹. */ +export function inputBudgetTokens(windowTokens: number, systemTokens: number, safetyMargin: number): number { + const outputReserve = Math.max(2048, Math.floor(windowTokens * 0.1)); + return Math.max(256, windowTokens - systemTokens - outputReserve - safetyMargin); +} + +/** ๋‹จ์ผ ์ž…๋ ฅ์ด map-reduce ๋Œ€์ƒ์ธ์ง€. (cfg.enabled + ์ž…๋ ฅ์ด ์ฐฝ์˜ triggerRatio ์ดˆ๊ณผ) */ +export function shouldMapReduce(latestUserTokens: number, windowTokens: number, cfg: MapReduceConfig): boolean { + if (!cfg.enabled) return false; + if (windowTokens <= 0) return false; + return latestUserTokens > windowTokens * cfg.triggerRatio; +} + +/** ํ•œ ์ฒญํฌ๊ฐ€ (์ž๊ธฐ + ์ถ”์ถœ ํ”„๋กฌํ”„ํŠธ ์˜ค๋ฒ„ํ—ค๋“œ + ์ถœ๋ ฅ ์˜ˆ์•ฝ)์œผ๋กœ ์ฐฝ์— ๋“ค์–ด๊ฐ€๋„๋ก ๋ฌธ์ž ์ƒํ•œ ์‚ฐ์ •. */ +export function chunkCharBudget(windowTokens: number, systemTokens: number, safetyMargin: number): number { + // ์ถ”์ถœ ํ”„๋กฌํ”„ํŠธ ์ž์ฒด ์˜ค๋ฒ„ํ—ค๋“œ(์ง€์‹œ๋ฌธ + intent) ~800 ํ† ํฐ ๊ฐ€์ •. + const promptOverhead = 800; + const perChunkTokenBudget = Math.max( + 512, + windowTokens - systemTokens - safetyMargin - EXTRACT_OUTPUT_TOKENS - promptOverhead + ); + // ๋ณด์ˆ˜์ ์œผ๋กœ 70% ๋งŒ ์‚ฌ์šฉ (์ถ”์ • ์˜ค์ฐจ ํก์ˆ˜). + return Math.floor(perChunkTokenBudget * CHARS_PER_TOKEN * 0.7); +} + +function buildExtractPrompt(intent: string, chunkText: string, idx: number, total: number): ChatMessage[] { + const system = [ + '๋„ˆ๋Š” ๊ธด ์ž๋ฃŒ์—์„œ ์‚ฌ์šฉ์ž ์š”์ฒญ์— ํ•„์š”ํ•œ ์‚ฌ์‹ค๋งŒ ์ •ํ™•ํžˆ ๋ฐœ์ทŒํ•˜๋Š” ์ถ”์ถœ๊ธฐ๋‹ค.', + '๊ทœ์น™:', + '1) ์‚ฌ์šฉ์ž ์š”์ฒญ๊ณผ ์ง์ ‘ ๊ด€๋ จ๋œ ์‚ฌ์‹คยท์ˆ˜์น˜ยท๋ฐœ์–ธยท๊ฒฐ์ •์‚ฌํ•ญ๋งŒ ์›๋ฌธ ํ‘œํ˜„ ๊ทธ๋Œ€๋กœ ๋ฐœ์ทŒํ•œ๋‹ค.', + '2) ์š”์•ฝยท์ถ”์ธกยท์ฐฝ์ž‘ยท์ผ๋ฐ˜ํ™” ๊ธˆ์ง€. ์ž๋ฃŒ์— ์—†๋Š” ๋‚ด์šฉ์€ ์ ˆ๋Œ€ ๋งŒ๋“ค์ง€ ์•Š๋Š”๋‹ค.', + `3) ์ด ์กฐ๊ฐ์— ๊ด€๋ จ ๋‚ด์šฉ์ด ์ „ํ˜€ ์—†์œผ๋ฉด ์ •ํ™•ํžˆ "${IRRELEVANT_MARKER}" ํ•œ ์ค„๋งŒ ์ถœ๋ ฅํ•œ๋‹ค.`, + '4) ๋ถˆ๋ฆฟ(-)์œผ๋กœ ๊ฐ„๊ฒฐํ•˜๊ฒŒ. ๊ฐ ํ•ญ๋ชฉ์€ ์ž๋ฃŒ์— ๊ทผ๊ฑฐํ•ด์•ผ ํ•œ๋‹ค.', + ].join('\n'); + const user = [ + `[์‚ฌ์šฉ์ž ์š”์ฒญ ์˜๋„]\n${intent}`, + `\n[์ž๋ฃŒ ์กฐ๊ฐ ${idx}/${total}]\n${chunkText}`, + `\n์œ„ ์กฐ๊ฐ์—์„œ ์š”์ฒญ ์ˆ˜ํ–‰์— ํ•„์š”ํ•œ ์‚ฌ์‹ค๋งŒ ๋ฐœ์ทŒํ•˜๋ผ. ์—†์œผ๋ฉด "${IRRELEVANT_MARKER}".`, + ].join('\n'); + return [ + { role: 'system', content: system }, + { role: 'user', content: user }, + ]; +} + +function buildReducePrompt(intent: string, extractions: string): ChatMessage[] { + const system = [ + '๋„ˆ๋Š” ์—ฌ๋Ÿฌ ๋ฐœ์ทŒ๋ฅผ ์ค‘๋ณต ์—†์ด ํ•˜๋‚˜๋กœ ํ†ตํ•ฉํ•˜๋Š” ํ†ตํ•ฉ๊ธฐ๋‹ค.', + '๊ทœ์น™: ๋ฐœ์ทŒ์— ์žˆ๋Š” ์‚ฌ์‹ค๋งŒ ์œ ์ง€ํ•˜๊ณ , ์ค‘๋ณต์€ ๋ณ‘ํ•ฉํ•œ๋‹ค. ์ถ”์ธกยท์ฐฝ์ž‘ ๊ธˆ์ง€.', + '์›๋ฌธ ์‚ฌ์‹ค๊ณผ (์žˆ๋‹ค๋ฉด) [์กฐ๊ฐ k] ์ถœ์ฒ˜ ํ‘œ๊ธฐ๋ฅผ ๋ณด์กดํ•œ๋‹ค.', + ].join('\n'); + const user = `[์‚ฌ์šฉ์ž ์š”์ฒญ ์˜๋„]\n${intent}\n\n[๋ฐœ์ทŒ ๋ชจ์Œ]\n${extractions}\n\n์œ„ ๋ฐœ์ทŒ๋“ค์„ ์š”์ฒญ ๊ด€์ ์—์„œ ์ค‘๋ณต ์—†์ด ํ†ตํ•ฉํ•˜๋ผ.`; + return [ + { role: 'system', content: system }, + { role: 'user', content: user }, + ]; +} + +/** ๋™์‹œ์„ฑ ์ œํ•œ map. ์ˆœ์„œ ๋ณด์กด. */ +async function mapWithConcurrency( + items: T[], + limit: number, + fn: (item: T, index: number) => Promise, + signal?: AbortSignal, +): Promise { + const results: R[] = new Array(items.length); + let next = 0; + const n = Math.max(1, Math.min(limit, items.length)); + const workers = Array.from({ length: n }, async () => { + while (true) { + if (signal?.aborted) return; + const i = next++; + if (i >= items.length) return; + results[i] = await fn(items[i], i); + } + }); + await Promise.all(workers); + return results; +} + +function isIrrelevant(text: string): boolean { + const t = (text || '').trim(); + return t.length === 0 || t === IRRELEVANT_MARKER || /^\(?\s*๊ด€๋ จ\s*์—†์Œ\s*\)?$/.test(t); +} + +/** + * ํฐ ์ž…๋ ฅ์„ ์ฒญํฌโ†’์ถ”์ถœโ†’ํ†ตํ•ฉํ•œ๋‹ค. ํ˜ธ์ถœ ์ธก์€ trigger ๋ฅผ ์ด๋ฏธ ํ†ต๊ณผ์‹œํ‚จ ๋’ค ํ˜ธ์ถœํ•œ๋‹ค๊ณ  ๊ฐ€์ •ํ•˜์ง€๋งŒ, + * ๋ฐฉ์–ด์ ์œผ๋กœ ๋‹จ์ผ ์ฒญํฌ๋ฉด ์ถ”์ถœ๋งŒ ํ•˜๊ณ  ํ†ตํ•ฉ์€ ๊ฑด๋„ˆ๋›ด๋‹ค. + */ +export async function runMapReduce(deps: MapReduceDeps, params: MapReduceParams): Promise { + const { intent, largeContent, windowTokens, systemTokens, safetyMargin, cfg } = params; + const log = deps.log ?? (() => {}); + + const targetChars = chunkCharBudget(windowTokens, systemTokens, safetyMargin); + const sections = splitIntoSections(largeContent, { + targetChars, + maxChars: targetChars * 2, + }); + const chunks = sections.map((s) => s.text); + log('Map-reduce: split large input into chunks.', { chunkCount: chunks.length, targetChars }); + + // โ”€โ”€ Map: ๊ฐ ์ฒญํฌ โ†’ ์งˆ์˜ ์ธ์ง€ํ˜• ์ถ”์ถœ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + const extracted = await mapWithConcurrency( + chunks, + cfg.concurrency, + async (chunk, i) => { + if (deps.signal?.aborted) return ''; + try { + const text = await deps.callLLM( + buildExtractPrompt(intent, chunk, i + 1, chunks.length), + EXTRACT_OUTPUT_TOKENS, + ); + return text ?? ''; + } catch (e: any) { + // ํ•œ ์ฒญํฌ ์‹คํŒจ๊ฐ€ ์ „์ฒด๋ฅผ ๋ง‰์ง€ ์•Š๊ฒŒ โ€” ์›๋ฌธ ์ผ๋ถ€๋กœ ํด๋ฐฑ(๋นˆ์†๋ณด๋‹ค ๋‚ซ๋‹ค). + log('Map-reduce: chunk extraction failed โ€” falling back to truncated raw.', { chunk: i + 1, error: e?.message ?? String(e) }); + return chunk.slice(0, targetChars); + } + }, + deps.signal, + ); + + const relevant: string[] = []; + extracted.forEach((text, i) => { + if (isIrrelevant(text)) return; + relevant.push(cfg.showProvenance ? `[์กฐ๊ฐ ${i + 1}]\n${text.trim()}` : text.trim()); + }); + + if (relevant.length === 0) { + log('Map-reduce: every chunk was irrelevant.', { chunkCount: chunks.length }); + return { condensedContext: '', chunkCount: chunks.length, relevantCount: 0, reduceDepth: 0, allIrrelevant: true }; + } + + // โ”€โ”€ Reduce: ํ•ฉ๋ณธ์ด ์ž…๋ ฅ ์˜ˆ์‚ฐ์— ๋“ค์–ด๊ฐˆ ๋•Œ๊นŒ์ง€ ๊ณ„์ธต์ ์œผ๋กœ ํ†ตํ•ฉ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + const budget = inputBudgetTokens(windowTokens, systemTokens, safetyMargin); + // intent ๋ถ„๋Ÿ‰ + ํ—ค๋” ์—ฌ์œ ๋ฅผ ์œ„ํ•ด ์˜ˆ์‚ฐ์˜ 80% ๋ฅผ ์ปจํ…์ŠคํŠธ ์ƒํ•œ์œผ๋กœ. + const contextCeiling = Math.floor(budget * 0.8); + + let current = relevant; + let depth = 0; + while (depth < cfg.maxDepth) { + const joined = current.join('\n\n'); + if (deps.estimateTokens(joined) <= contextCeiling) break; + // ๊ทธ๋ฃน์œผ๋กœ ๋ฌถ์–ด ๊ฐ ๊ทธ๋ฃน์„ ํ†ตํ•ฉ โ†’ ๊ฐœ์ˆ˜ ๊ฐ์†Œ. + const groups = groupToFit(current, deps.estimateTokens, contextCeiling); + if (groups.length >= current.length) break; // ๋” ๋ชป ์ค„์ž„ โ€” ๋งˆ์ง€๋ง‰์— ์ž˜๋ฆผ ์ฒ˜๋ฆฌ + log('Map-reduce: hierarchical reduce round.', { depth: depth + 1, from: current.length, to: groups.length }); + current = await mapWithConcurrency( + groups, + cfg.concurrency, + async (group) => { + if (deps.signal?.aborted) return group.join('\n\n'); + try { + return await deps.callLLM(buildReducePrompt(intent, group.join('\n\n')), REDUCE_OUTPUT_TOKENS); + } catch { + return group.join('\n\n'); // ํ†ตํ•ฉ ์‹คํŒจ โ†’ ์›๋ณธ ๊ทธ๋ฃน ์œ ์ง€ + } + }, + deps.signal, + ); + depth++; + } + + let condensed = current.join('\n\n'); + // maxDepth ๋„๋‹ฌํ–ˆ๋Š”๋ฐ๋„ ๋„˜์น˜๋ฉด ํ•˜๋“œ ํŠธ๋ ์ผ€์ดํŠธ(์„œ๋ฒ„ overflow ๋ฐฉ์ง€) + ๊ฒฝ๊ณ ๋Š” ํ˜ธ์ถœ ์ธก์—์„œ. + if (deps.estimateTokens(condensed) > contextCeiling) { + const charCeiling = contextCeiling * CHARS_PER_TOKEN; + condensed = condensed.slice(0, charCeiling) + '\n\n[โ€ฆ์ž๋ฃŒ๊ฐ€ ๋งŽ์•„ ์ผ๋ถ€ ์ƒ๋žต๋จ]'; + log('Map-reduce: reduce hit max depth and was hard-truncated.', { maxDepth: cfg.maxDepth }); + } + + return { + condensedContext: condensed, + chunkCount: chunks.length, + relevantCount: relevant.length, + reduceDepth: depth, + allIrrelevant: false, + }; +} + +/** ํ•ญ๋ชฉ๋“ค์„ ์ˆœ์„œ๋Œ€๋กœ ๋ˆ„์ ํ•ด ceiling ์„ ๋„˜๊ธฐ ์ง์ „๊นŒ์ง€ ํ•œ ๊ทธ๋ฃน์œผ๋กœ ๋ฌถ๋Š”๋‹ค. */ +function groupToFit(items: string[], estimate: (s: string) => number, ceiling: number): string[][] { + const groups: string[][] = []; + let cur: string[] = []; + let curTokens = 0; + for (const item of items) { + const t = estimate(item); + if (cur.length > 0 && curTokens + t > ceiling) { + groups.push(cur); + cur = []; + curTokens = 0; + } + cur.push(item); + curTokens += t; + } + if (cur.length > 0) groups.push(cur); + return groups; +} diff --git a/src/config.ts b/src/config.ts index 102e179..6ad7cd3 100644 --- a/src/config.ts +++ b/src/config.ts @@ -40,6 +40,17 @@ export interface IAgentConfig { autoCompactHistory: boolean; /** ์ž‘์€ ๋ชจ๋ธ(โ‰ค4B) ๊ฐ์ง€ ์‹œ ์˜ˆ์‚ฐ ๊ณ„์‚ฐ์— ์“ธ ์œ ํšจ context window ์ƒํ•œ. 0 = ๋น„ํ™œ์„ฑํ™”. */ smallModelContextCap: number; + // โ”€โ”€โ”€ ํฐ ์ž…๋ ฅ Map-Reduce (๊ธด ํšŒ์˜๋ก/๋ฆฌ์„œ์น˜ ๋คํ”„ ์ฒญํ‚นยทํ†ตํ•ฉ) โ”€โ”€โ”€ + /** ๋‹จ์ผ ์‚ฌ์šฉ์ž ์ž…๋ ฅ์ด ์ฐฝ์„ ๋„˜์œผ๋ฉด ์ฒญํฌโ†’์ถ”์ถœโ†’ํ†ตํ•ฉ์œผ๋กœ ์ฒ˜๋ฆฌ. ๋„๋ฉด ๊ธฐ์กด ๋‹จ๋ฐœ ๊ฒฝ๋กœ(์ž˜๋ฆด ์ˆ˜ ์žˆ์Œ). */ + largeInputMapReduce: boolean; + /** ๋‹จ์ผ ์ž…๋ ฅ ํ† ํฐ์ด (์œ ํšจ ์ฐฝ ร— ์ด ๋น„์œจ)์„ ๋„˜์œผ๋ฉด map-reduce ๋ฐœ๋™. ๊ธฐ๋ณธ 0.6. */ + mapReduceTriggerRatio: number; + /** ์ฒญํฌ ์ถ”์ถœ ๋™์‹œ์„ฑ. ๋กœ์ปฌ ๋‹จ์ผ GPU ๋ณดํ˜ธ์šฉ์œผ๋กœ ๋‚ฎ๊ฒŒ. ๊ธฐ๋ณธ 2. */ + mapReduceConcurrency: number; + /** ์ถ”์ถœ ํ•ฉ๋ณธ์ด ์ฐฝ์„ ๋„˜์„ ๋•Œ ๊ณ„์ธต์  ํ†ตํ•ฉ ์ตœ๋Œ€ ๊นŠ์ด. ๊ธฐ๋ณธ 3. */ + mapReduceMaxDepth: number; + /** ์ตœ์ข… ๋‹ต๋ณ€์— `[์กฐ๊ฐ k]` ์ถœ์ฒ˜ ํƒœ๊ทธ๋ฅผ ๋…ธ์ถœ. ๊ธฐ๋ณธ false. */ + mapReduceShowProvenance: boolean; // โ”€โ”€โ”€ ์‘๋‹ต ๋ณต๊ตฌ (Thought Quarantine / Auto-Continuation) โ”€โ”€โ”€ /** ๋‹ต๋ณ€์ด ์ถœ๋ ฅ ํ† ํฐ ํ•œ๊ณ„์— ๊ฑธ๋ฆฌ๋ฉด ์‚ฌ์šฉ์ž ๊ฐœ์ž… ์—†์ด ๋‚ด๋ถ€์ ์œผ๋กœ ์ด์–ด์„œ ์ƒ์„ฑ. */ autoContinueOnOutputLimit: boolean; @@ -500,6 +511,11 @@ export function getConfig(): IAgentConfig { })(), autoCompactHistory: cfg.get('autoCompactHistory', true), smallModelContextCap: Math.max(0, cfg.get('smallModelContextCap', 0)), + largeInputMapReduce: cfg.get('largeInputMapReduce', true), + mapReduceTriggerRatio: Math.min(0.95, Math.max(0.3, cfg.get('mapReduceTriggerRatio', 0.6))), + mapReduceConcurrency: Math.min(8, Math.max(1, cfg.get('mapReduceConcurrency', 2))), + mapReduceMaxDepth: Math.min(6, Math.max(1, cfg.get('mapReduceMaxDepth', 3))), + mapReduceShowProvenance: cfg.get('mapReduceShowProvenance', false), autoContinueOnOutputLimit: cfg.get('autoContinueOnOutputLimit', true), maxAutoContinuations: Math.max(0, Math.min(10, cfg.get('maxAutoContinuations', 4))), finalOnlyRetryOnThoughtLeak: cfg.get('finalOnlyRetryOnThoughtLeak', true), diff --git a/src/lmstudio/client.ts b/src/lmstudio/client.ts index 2628537..3191c66 100644 --- a/src/lmstudio/client.ts +++ b/src/lmstudio/client.ts @@ -39,6 +39,17 @@ export interface ILMStudioClient { * "Model is disposed!" or "lock() request could not be registered" error. */ getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise; + /** + * The model's *actually-loaded* context window in tokens (LM Studio's + * `llm.getContextLength()`), or `undefined` if it can't be determined. + * + * The user-facing `g1nation.contextLength` setting is only a budgeting + * intent โ€” the real ceiling is whatever window the model was loaded with. + * Budgeting against the larger of the two silently overflows the server, + * which then truncates the prompt or emits EOS as the first token (empty + * answer). Cached per-key because it only changes on reload. + */ + getModelContextLength(modelKey: string): Promise; isReachable(): Promise; setBaseUrl(httpBaseUrl: string): void; } @@ -84,8 +95,10 @@ export class LMStudioClient implements ILMStudioClient { private _wsUrl: string | undefined; private _loadedCache: { value: string[]; expiresAt: number } | undefined; private _downloadedCache: { value: string[]; expiresAt: number } | undefined; + private _contextLengthCache = new Map(); private static readonly DEFAULT_LOADED_CACHE_TTL_MS = 5000; private static readonly DEFAULT_DOWNLOADED_CACHE_TTL_MS = 60_000; + private static readonly DEFAULT_CONTEXT_LENGTH_CACHE_TTL_MS = 60_000; constructor(httpBaseUrl: string) { this.setBaseUrl(httpBaseUrl); @@ -98,6 +111,7 @@ export class LMStudioClient implements ILMStudioClient { this._sdk = undefined; this._loadedCache = undefined; this._downloadedCache = undefined; + this._contextLengthCache.clear(); } } @@ -170,6 +184,7 @@ export class LMStudioClient implements ILMStudioClient { invalidateCaches(): void { this._loadedCache = undefined; this._downloadedCache = undefined; + this._contextLengthCache.clear(); } async listLoaded(): Promise { @@ -243,6 +258,36 @@ export class LMStudioClient implements ILMStudioClient { } } + async getModelContextLength(modelKey: string): Promise { + const key = (modelKey || '').trim(); + if (!key) return undefined; + const now = Date.now(); + const cached = this._contextLengthCache.get(key); + if (cached && cached.expiresAt > now) return cached.value; + try { + // Reuses the same handle the stream will use. If the model isn't + // loaded yet this forces a JIT load โ€” acceptable since the very next + // step streams from it anyway. Best-effort: any failure (incl. the + // load-coalescing "Operation canceled" race) falls back to undefined + // so the caller keeps the configured window. + const handle: any = await this.getSdk().llm.model(key); + const len = typeof handle?.getContextLength === 'function' + ? await handle.getContextLength() + : undefined; + if (typeof len === 'number' && Number.isFinite(len) && len > 0) { + this._contextLengthCache.set(key, { + value: len, + expiresAt: now + LMStudioClient.DEFAULT_CONTEXT_LENGTH_CACHE_TTL_MS, + }); + return len; + } + return undefined; + } catch (e: any) { + logError('Failed to query LM Studio model context length.', { modelKey: key, error: e?.message ?? String(e) }); + return undefined; + } + } + async isReachable(): Promise { try { await this.getSdk().llm.listLoaded(); diff --git a/src/lmstudio/streamer.ts b/src/lmstudio/streamer.ts index 212db2a..1b751a7 100644 --- a/src/lmstudio/streamer.ts +++ b/src/lmstudio/streamer.ts @@ -83,6 +83,12 @@ export interface IChatStreamer { * silently-disposed handle that needs a fresh WebSocket round-trip. */ resetHandle?(modelName: string): Promise; + /** + * The model's actually-loaded context window in tokens, or `undefined` if + * unavailable. Callers use this to budget against the real ceiling instead + * of the user's `contextLength` setting. Best-effort โ€” never throws. + */ + getModelContextLength?(modelName: string): Promise; } /** @@ -115,7 +121,28 @@ export class LMStudioStreamer implements IChatStreamer { // would duplicate tokens. for (let attempt = 1; attempt <= 2; attempt++) { const refresh = attempt > 1; - const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined); + // Handle acquisition is guarded on its own: it happens BEFORE the + // stream try/catch below, so without this an "Operation canceled" + // (the lifecycle manager's concurrent load for this same model was + // superseded/aborted and the SDK coalesced our JIT lookup into that + // dead load), a disposed handle, or a dropped WebSocket would crash + // the whole turn with no retry. Large inputs make this far more + // likely: loading a big model to hold a large prompt is slow, which + // widens the window for a concurrent switch/abort to land mid-load. + let model: Awaited>; + try { + model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined); + } catch (acqErr: any) { + // Genuine user cancel โ€” don't retry, just stop quietly. + if (req.signal?.aborted || acqErr?.name === 'AbortError') return; + const acqMsg = String(acqErr?.message ?? acqErr); + if (this.isTransientHandleError(acqMsg) && attempt === 1) { + logInfo('LM Studio model handle acquisition hit a transient error โ€” retrying with a fresh SDK.', { model: trimmedModel, error: acqMsg }); + continue; // attempt 2 passes { refresh: true } โ†’ recreates the SDK client + } + logError('LM Studio model handle acquisition failed.', { model: trimmedModel, error: acqMsg, attempt }); + throw acqErr; + } logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt }); // Sampling defaults match the historical glitch-suppression preset for small / @@ -216,17 +243,7 @@ export class LMStudioStreamer implements IChatStreamer { } const errMsg = String(caught?.message ?? caught); - // Broaden the "handle is bound to a dead WebSocket binding" detection. All of - // these resolve with the same fix (recreate the SDK client so the next - // llm.model() lookup mints a fresh handle). - const handleDead = - /\bdisposed\b/i.test(errMsg) - || /lock\(\) request could not be registered/i.test(errMsg) - || /channel\s+closed/i.test(errMsg) - || /WebSocket\s+(?:is\s+not\s+open|closed|disconnected)/i.test(errMsg) - || /Connection\s+(?:lost|reset|closed)/i.test(errMsg) - || /\bECONNRESET\b/i.test(errMsg) - || /socket\s+hang\s*up/i.test(errMsg); + const handleDead = this.isTransientHandleError(errMsg); if (handleDead && yielded === 0 && attempt === 1) { logInfo('Dead LM Studio handle detected โ€” retrying with a fresh SDK.', { model: trimmedModel, error: errMsg }); @@ -238,6 +255,38 @@ export class LMStudioStreamer implements IChatStreamer { } } + /** + * True when an error message indicates the SDK handle / WebSocket binding is + * dead, or its in-flight (coalesced) load was canceled out from under us โ€” + * all fixable by recreating the SDK client so the next `llm.model()` lookup + * mints a fresh handle. Deliberately excludes genuine user aborts, which are + * caught earlier via `req.signal.aborted` / `AbortError` before reaching here. + */ + private isTransientHandleError(errMsg: string): boolean { + return ( + /\bdisposed\b/i.test(errMsg) + || /lock\(\) request could not be registered/i.test(errMsg) + || /channel\s+closed/i.test(errMsg) + || /WebSocket\s+(?:is\s+not\s+open|closed|disconnected)/i.test(errMsg) + || /Connection\s+(?:lost|reset|closed)/i.test(errMsg) + || /\bECONNRESET\b/i.test(errMsg) + || /socket\s+hang\s*up/i.test(errMsg) + // The lifecycle manager's load got superseded/aborted and the SDK + // coalesced our JIT model() lookup into that canceled load. + || /\boperation\s+cancell?ed\b/i.test(errMsg) + ); + } + + async getModelContextLength(modelName: string): Promise { + const trimmed = (modelName || '').trim(); + if (!trimmed) return undefined; + try { + return await this.client.getModelContextLength(trimmed); + } catch { + return undefined; // best-effort โ€” caller falls back to the configured window + } + } + async resetHandle(modelName: string): Promise { const trimmed = (modelName || '').trim(); if (!trimmed) return; diff --git a/tests/computeBudgetedRequest.test.ts b/tests/computeBudgetedRequest.test.ts new file mode 100644 index 0000000..75bb816 --- /dev/null +++ b/tests/computeBudgetedRequest.test.ts @@ -0,0 +1,58 @@ +/** + * Phase 1 โ€” context-window alignment. + * + * The budgeter must clamp to the model's ACTUALLY-loaded window when it's + * smaller than the user's `contextLength` setting, so a model loaded with a + * smaller window than the setting never silently overflows the server. + */ + +import { computeBudgetedRequest } from '../src/agent/handlePrompt/computeBudgetedRequest'; +import type { ChatMessage } from '../src/agent'; + +const baseConfig = { + contextLength: 32768, + maxOutputTokens: 4096, + contextSafetyMargin: 512, + smallModelContextCap: 0, // disabled + autoCompactHistory: false, +}; + +function run(overrides: { actualContextLength?: number; config?: Partial } = {}) { + const reqMessages: ChatMessage[] = [{ role: 'user', content: 'hello' }]; + return computeBudgetedRequest({ + fullSystemPrompt: 'You are a helpful assistant.', + reqMessages, + actualModel: 'some-13b-model', + config: { ...baseConfig, ...overrides.config }, + imageCount: 0, + actualContextLength: overrides.actualContextLength, + }); +} + +describe('computeBudgetedRequest โ€” real-window alignment', () => { + test('clamps to the actual loaded window when it is smaller than the setting', () => { + const r = run({ actualContextLength: 8192 }); + expect(r.windowMismatch).toBe(true); + expect(r.effectiveContextLength).toBe(8192); + expect(r.ctxLimits.contextLength).toBe(8192); + }); + + test('keeps the configured window when the actual window is unknown', () => { + const r = run({ actualContextLength: undefined }); + expect(r.windowMismatch).toBe(false); + expect(r.effectiveContextLength).toBe(32768); + expect(r.ctxLimits.contextLength).toBe(32768); + }); + + test('does not raise the window when the actual window is larger than the setting', () => { + const r = run({ actualContextLength: 131072 }); + expect(r.windowMismatch).toBe(false); + expect(r.effectiveContextLength).toBe(32768); // setting is the lower bound here + }); + + test('ignores a non-positive / non-finite actual window (falls back to setting)', () => { + expect(run({ actualContextLength: 0 }).effectiveContextLength).toBe(32768); + expect(run({ actualContextLength: -5 }).effectiveContextLength).toBe(32768); + expect(run({ actualContextLength: NaN }).effectiveContextLength).toBe(32768); + }); +}); diff --git a/tests/largeInputMapReduce.test.ts b/tests/largeInputMapReduce.test.ts new file mode 100644 index 0000000..e0884f1 --- /dev/null +++ b/tests/largeInputMapReduce.test.ts @@ -0,0 +1,159 @@ +/** + * Phase 2 โ€” large-input map-reduce core. + * + * Pure orchestration with an injected `callLLM`, so no network / SDK is touched. + */ + +import { + runMapReduce, + shouldMapReduce, + chunkCharBudget, + inputBudgetTokens, + type MapReduceConfig, + type MapReduceDeps, +} from '../src/agent/handlePrompt/largeInputMapReduce'; +import type { ChatMessage } from '../src/agent'; + +const estimateTokens = (s: string) => Math.ceil((s || '').length / 4); + +const cfg: MapReduceConfig = { + enabled: true, + triggerRatio: 0.6, + concurrency: 2, + maxDepth: 3, + showProvenance: false, +}; + +function isExtract(messages: ChatMessage[]): boolean { + return /์ถ”์ถœ๊ธฐ/.test(messages[0]?.content ?? ''); +} +function chunkLabel(messages: ChatMessage[]): string { + const m = (messages[1]?.content ?? '').match(/์ž๋ฃŒ ์กฐ๊ฐ (\d+)\/(\d+)/); + return m ? m[1] : '?'; +} + +// ~12 short markdown sections โ†’ forces multiple chunks under a small window. +const bigContent = Array.from({ length: 12 }, (_, i) => + `## ์„น์…˜ ${i + 1}\n์•ˆ๊ฑด ${i + 1}: ๊ฒฐ์ •์‚ฌํ•ญ๊ณผ ์ˆ˜์น˜ ${i * 10}. ` + '๋‚ด์šฉ '.repeat(40) +).join('\n\n'); + +describe('shouldMapReduce', () => { + test('triggers only above window * triggerRatio and when enabled', () => { + expect(shouldMapReduce(6200, 10000, cfg)).toBe(true); // > 6000 + expect(shouldMapReduce(5000, 10000, cfg)).toBe(false); // < 6000 + expect(shouldMapReduce(99999, 10000, { ...cfg, enabled: false })).toBe(false); + expect(shouldMapReduce(100, 0, cfg)).toBe(false); // unknown window + }); +}); + +describe('budget helpers', () => { + test('inputBudgetTokens reserves output + safety', () => { + // 10000 - sys(500) - max(2048, 1000)=2048 - safety(512) = 6940 + expect(inputBudgetTokens(10000, 500, 512)).toBe(6940); + }); + test('chunkCharBudget is positive and scales with the window', () => { + const small = chunkCharBudget(4000, 200, 512); + const big = chunkCharBudget(16000, 200, 512); + expect(small).toBeGreaterThan(0); + expect(big).toBeGreaterThan(small); + }); +}); + +describe('runMapReduce', () => { + function deps(callLLM: MapReduceDeps['callLLM']): MapReduceDeps { + return { callLLM, estimateTokens }; + } + const params = { + intent: 'ํšŒ์˜๋ก์„ ์•ˆ๊ฑด๋ณ„๋กœ ์ •๋ฆฌํ•ด์ค˜', + largeContent: bigContent, + windowTokens: 4000, + systemTokens: 200, + safetyMargin: 512, + cfg, + }; + + test('extracts relevant facts per chunk and condenses them', async () => { + const seen: string[] = []; + const r = await runMapReduce( + deps(async (messages) => { + expect(isExtract(messages)).toBe(true); + const k = chunkLabel(messages); + seen.push(k); + return `์ถ”์ถœ-${k}`; + }), + params, + ); + expect(r.allIrrelevant).toBe(false); + expect(r.chunkCount).toBeGreaterThan(1); + expect(r.relevantCount).toBe(r.chunkCount); + expect(r.condensedContext).toContain('์ถ”์ถœ-1'); + // every chunk was visited + expect(seen.length).toBe(r.chunkCount); + }); + + test('all-irrelevant chunks โ†’ allIrrelevant with empty context', async () => { + const r = await runMapReduce( + deps(async () => '(๊ด€๋ จ ์—†์Œ)'), + params, + ); + expect(r.allIrrelevant).toBe(true); + expect(r.relevantCount).toBe(0); + expect(r.condensedContext).toBe(''); + }); + + test('respects concurrency limit', async () => { + let active = 0; + let peak = 0; + await runMapReduce( + deps(async (messages) => { + active++; + peak = Math.max(peak, active); + await new Promise((res) => setTimeout(res, 5)); + active--; + return `x-${chunkLabel(messages)}`; + }), + params, + ); + expect(peak).toBeLessThanOrEqual(cfg.concurrency); + }); + + test('a failing chunk extraction falls back to truncated raw (not a crash)', async () => { + let call = 0; + const r = await runMapReduce( + deps(async (messages) => { + if (isExtract(messages) && ++call === 1) throw new Error('boom'); + return `ok-${chunkLabel(messages)}`; + }), + params, + ); + expect(r.allIrrelevant).toBe(false); + // The failed chunk still contributed (raw fallback), so relevantCount === chunkCount. + expect(r.relevantCount).toBe(r.chunkCount); + }); + + test('tags provenance when showProvenance is on', async () => { + const r = await runMapReduce( + deps(async (messages) => `๋ฐœ์ทŒ-${chunkLabel(messages)}`), + { ...params, cfg: { ...cfg, showProvenance: true } }, + ); + expect(r.condensedContext).toMatch(/\[์กฐ๊ฐ \d+\]/); + }); + + test('hierarchical reduce kicks in when extractions overflow the context ceiling', async () => { + // Tiny window so even a few extractions exceed the ceiling โ†’ reduce rounds run. + let reduceCalls = 0; + const r = await runMapReduce( + deps(async (messages) => { + if (isExtract(messages)) { + return '๊ด€๋ จ ์‚ฌ์‹ค '.repeat(60); // big extraction per chunk + } + reduceCalls++; + return 'ํ†ตํ•ฉ๋ณธ'; // reduce collapses to something small + }), + { ...params, windowTokens: 2200 }, + ); + expect(reduceCalls).toBeGreaterThan(0); + expect(r.reduceDepth).toBeGreaterThan(0); + expect(r.allIrrelevant).toBe(false); + }); +}); diff --git a/tests/lmStudioLifecycle.test.ts b/tests/lmStudioLifecycle.test.ts index af70702..fa68d4d 100644 --- a/tests/lmStudioLifecycle.test.ts +++ b/tests/lmStudioLifecycle.test.ts @@ -78,6 +78,10 @@ class FakeLMStudioClient implements ILMStudioClient { return true; } + async getModelContextLength(_modelKey: string): Promise { + return undefined; + } + async listLoadedCached(): Promise { return [...this.loaded]; } diff --git a/tests/lmStudioStreamer.test.ts b/tests/lmStudioStreamer.test.ts index e02a102..2aedc20 100644 --- a/tests/lmStudioStreamer.test.ts +++ b/tests/lmStudioStreamer.test.ts @@ -69,6 +69,9 @@ class FakeModel { class FakeClient implements ILMStudioClient { public model: FakeModel; public getModelHandleCalls: string[] = []; + public getModelHandleOpts: Array<{ refresh?: boolean } | undefined> = []; + /** Errors to throw on successive getModelHandle calls before returning the model. */ + public handleAcqFailures: Error[] = []; constructor(model: FakeModel = new FakeModel()) { this.model = model; @@ -83,10 +86,18 @@ class FakeClient implements ILMStudioClient { async listDownloadedCached(): Promise { return []; } async isReachable(): Promise { return true; } - async getModelHandle(modelKey: string): Promise { + async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise { this.getModelHandleCalls.push(modelKey); + this.getModelHandleOpts.push(options); + const failure = this.handleAcqFailures.shift(); + if (failure) throw failure; return this.model; } + + public contextLength: number | undefined = undefined; + async getModelContextLength(_modelKey: string): Promise { + return this.contextLength; + } } // The streamer emits a trailing { token: '', stopReason } event on normal completion; @@ -209,6 +220,68 @@ describe('LMStudioStreamer', () => { expect(out).toEqual(['a']); }); + test('transient "Operation canceled" on handle acquisition is retried with a fresh SDK', async () => { + // The lifecycle manager's concurrent load for this model got superseded; + // the SDK coalesced our JIT model() lookup into that aborted load. The + // first getModelHandle throws โ€” the streamer must recreate the SDK + // (refresh) and retry rather than crashing the whole turn. + const client = new FakeClient(new FakeModel({ chunks: ['ok'] })); + client.handleAcqFailures = [new Error('Failed to acquire LM Studio model handle "m1": Operation canceled.')]; + const streamer = new LMStudioStreamer(client); + const tokens = await collect(streamer.stream({ + modelName: 'm1', + messages: [{ role: 'user', content: 'hi' }], + temperature: 0.2, + })); + expect(tokens).toEqual(['ok']); + expect(client.getModelHandleCalls).toEqual(['m1', 'm1']); + // First attempt: no refresh. Retry: refresh=true so the SDK is recreated. + expect(client.getModelHandleOpts[0]).toBeUndefined(); + expect(client.getModelHandleOpts[1]).toEqual({ refresh: true }); + }); + + test('non-transient handle acquisition error is thrown without retry', async () => { + const client = new FakeClient(); + client.handleAcqFailures = [new Error('Failed to acquire LM Studio model handle "m1": model not found')]; + const streamer = new LMStudioStreamer(client); + await expect(collect(streamer.stream({ + modelName: 'm1', + messages: [{ role: 'user', content: 'hi' }], + temperature: 0.2, + }))).rejects.toThrow(/model not found/); + expect(client.getModelHandleCalls).toEqual(['m1']); // no retry + }); + + test('handle acquisition failure is swallowed when the user already aborted', async () => { + const client = new FakeClient(); + client.handleAcqFailures = [new Error('Operation canceled')]; + const streamer = new LMStudioStreamer(client); + const ac = new AbortController(); + ac.abort(); + const out = await collect(streamer.stream({ + modelName: 'm1', + messages: [{ role: 'user', content: 'hi' }], + temperature: 0.2, + signal: ac.signal, + })); + expect(out).toEqual([]); + expect(client.getModelHandleCalls).toEqual(['m1']); // no retry โ€” genuine cancel + }); + + test('getModelContextLength delegates to the client (and survives a throwing client)', async () => { + const client = new FakeClient(); + client.contextLength = 8192; + const streamer = new LMStudioStreamer(client); + expect(await streamer.getModelContextLength('m1')).toBe(8192); + expect(await streamer.getModelContextLength('')).toBeUndefined(); + + // A throwing client must degrade to undefined, never reject. + const throwing = new FakeClient(); + throwing.getModelContextLength = async () => { throw new Error('ws down'); }; + const s2 = new LMStudioStreamer(throwing); + expect(await s2.getModelContextLength('m1')).toBeUndefined(); + }); + test('passes messages through to model.respond', async () => { const client = new FakeClient(); const streamer = new LMStudioStreamer(client);