76d5fedfb5
큰 입력 시 "Failed to acquire LM Studio model handle … Operation canceled" 로 턴 전체가 죽던 문제를 3계층으로 해결. 일반 채팅(코어 경로)은 그동안 단일 예산 호출이라 약한 모델·큰 입력에서 무너졌다 — 그 갭을 메움. - 핸들 race 수정: getModelHandle 을 재시도 루프 안으로 이동. 취소/죽은-핸들 류 에러는 SDK 재생성 후 1회 자동 재시도(실제 사용자 취소는 존중). 라이프 사이클의 동시 로드가 abort 되며 SDK 가 coalesce 한 JIT 조회까지 죽던 것. - Phase 1 실제 창 정렬: llm.getContextLength()(캐시)로 실측 창에 예산 클램프. 설정값보다 작은 창으로 로드된 경우 서버 truncation/빈 답변 차단. 배지에 표시. - Phase 2 코어 Map-Reduce: 단일 입력이 (유효 창 × ratio) 초과 시 청크→질의 인지형 추출→통합. 부분/전체 폴백, 무관 시 정직 신호. 동시성 기본 2. - Phase 3 메타 노출: 진행/결과 배지 표시, [조각 k] 출처 옵트인. 신규 설정 5종. /meet·/review 전용 경로는 불변. 테스트 +25건, 전체 684 통과. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
296 lines
12 KiB
TypeScript
296 lines
12 KiB
TypeScript
/**
|
|
* Unit tests for LMStudioStreamer.
|
|
*
|
|
* Strategy: inject a fake ILMStudioClient that returns a fake model handle whose
|
|
* `respond()` yields a controllable async iterable. No real SDK or WebSocket touched.
|
|
*/
|
|
|
|
import { LMStudioStreamer } from '../src/lmstudio/streamer';
|
|
import type { ChatStreamEvent } from '../src/lmstudio/streamer';
|
|
import type { ILMStudioClient } from '../src/lmstudio/client';
|
|
|
|
/**
 * Test double for a model handle.
 *
 * `respond()` records its arguments and hands back an object that is, at the
 * same time, cancellable, async-iterable over `{ content }` fragments, and a
 * thenable that resolves to `{ stats: { stopReason } }`.
 */
class FakeModel {
  /** First argument of the most recent successful `respond()` call. */
  public lastChat: any = null;
  /** Second argument of the most recent successful `respond()` call. */
  public lastOpts: any = null;
  /** Number of times `cancel()` was invoked on any returned prediction. */
  public cancelCount = 0;
  /** Not read anywhere inside this class. */
  public failNext: Error | null = null;
  /** Fragments handed out, in order, by the prediction iterator. */
  public chunks: string[] = [];
  /** Value reported as `stats.stopReason` when the prediction is awaited. */
  public stopReason: string | undefined;

  /** Fragment index at (or beyond) which iteration throws. */
  private _failAfter?: number;
  /** When set, `respond()` throws this synchronously. */
  private _throwOnRespond?: Error;

  constructor(opts: { chunks?: string[]; failAfter?: number; throwOnRespond?: Error; stopReason?: string } = {}) {
    this.chunks = opts.chunks ?? ['Hel', 'lo, ', 'world'];
    this._failAfter = opts.failAfter;
    this._throwOnRespond = opts.throwOnRespond;
    this.stopReason = opts.stopReason;
  }

  /**
   * Builds a fake prediction for the given chat/options.
   *
   * Throws the configured `throwOnRespond` error (before recording anything)
   * when one was supplied to the constructor.
   */
  respond(chat: any, opts: any) {
    if (this._throwOnRespond) {
      throw this._throwOnRespond;
    }

    this.lastChat = chat;
    this.lastOpts = opts;

    // Snapshot the configuration at call time; the cursor is shared by every
    // iterator obtained from this one prediction.
    const pending = this.chunks;
    const failAt = this._failAfter;
    const finalStopReason = this.stopReason;
    let cursor = 0;

    // One iteration step. Check order matters: abort, then scripted failure,
    // then exhaustion, then the next fragment.
    const step = async (): Promise<{ value: any; done: boolean }> => {
      if (opts?.signal?.aborted) {
        return { value: undefined, done: true };
      }
      if (failAt !== undefined && cursor >= failAt) {
        throw new Error('mid-stream failure');
      }
      if (cursor >= pending.length) {
        return { value: undefined, done: true };
      }
      const content = pending[cursor];
      cursor += 1;
      return { value: { content }, done: false };
    };

    // Async-iterable AND thenable, so a consumer can drain the stream and then
    // await the same object to read `.stats.stopReason`.
    const prediction: any = {
      cancel: async () => {
        this.cancelCount += 1;
      },
      then(resolve: (v: any) => void) {
        resolve({ stats: { stopReason: finalStopReason } });
      },
      [Symbol.asyncIterator]: () => ({ next: step }),
    };
    return prediction;
  }
}
|
|
|
|
/**
 * In-memory `ILMStudioClient` used by these tests.
 *
 * Every lifecycle/listing method is a stub; only `getModelHandle` and
 * `getModelContextLength` carry scriptable behaviour.
 */
class FakeClient implements ILMStudioClient {
  /** Model keys passed to `getModelHandle`, in call order. */
  public getModelHandleCalls: string[] = [];
  /** Options passed to `getModelHandle`, index-aligned with `getModelHandleCalls`. */
  public getModelHandleOpts: Array<{ refresh?: boolean } | undefined> = [];
  /** Errors to throw on successive getModelHandle calls before returning the model. */
  public handleAcqFailures: Error[] = [];
  /** Value returned by `getModelContextLength`. */
  public contextLength: number | undefined = undefined;

  constructor(public model: FakeModel = new FakeModel()) {}

  setBaseUrl(_: string): void {
    /* noop */
  }

  async load(_: string): Promise<void> {
    /* noop */
  }

  async unload(_: string): Promise<void> {
    /* noop */
  }

  async listLoaded(): Promise<string[]> {
    return [];
  }

  async listLoadedCached(): Promise<string[]> {
    return [];
  }

  async listDownloaded(): Promise<string[]> {
    return [];
  }

  async listDownloadedCached(): Promise<string[]> {
    return [];
  }

  async isReachable(): Promise<boolean> {
    return true;
  }

  /**
   * Records the call, then either throws the next queued failure (consuming
   * it) or returns the fake model.
   */
  async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<any> {
    this.getModelHandleCalls.push(modelKey);
    this.getModelHandleOpts.push(options);

    const scripted = this.handleAcqFailures.shift();
    if (!scripted) {
      return this.model;
    }
    throw scripted;
  }

  /** Returns the configured `contextLength`, whatever the model key. */
  async getModelContextLength(_modelKey: string): Promise<number | undefined> {
    return this.contextLength;
  }
}
|
|
|
|
/**
 * Drains a stream and returns only its non-empty content tokens.
 *
 * Events whose `token` is falsy (such as a trailing `{ token: '', stopReason }`
 * event) are skipped.
 */
async function collect(stream: AsyncIterable<ChatStreamEvent>): Promise<string[]> {
  const tokens: string[] = [];
  for await (const event of stream) {
    const { token } = event;
    if (!token) {
      continue;
    }
    tokens.push(token);
  }
  return tokens;
}
|
|
|
|
/** Drains a stream and returns every event it produced, in arrival order. */
async function collectEvents(stream: AsyncIterable<ChatStreamEvent>): Promise<ChatStreamEvent[]> {
  const events: ChatStreamEvent[] = [];
  for await (const event of stream) {
    events.push(event);
  }
  return events;
}
|
|
|
|
/**
 * Behavioural tests for `LMStudioStreamer`, driven entirely through the
 * `FakeClient` / `FakeModel` doubles declared in this file.
 */
describe('LMStudioStreamer', () => {
  test('streams tokens from the SDK respond iterator', async () => {
    const client = new FakeClient(new FakeModel({ chunks: ['Hel', 'lo'] }));
    const streamer = new LMStudioStreamer(client);
    const tokens = await collect(streamer.stream({
      modelName: 'm1',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.4,
    }));
    expect(tokens).toEqual(['Hel', 'lo']);
    expect(client.getModelHandleCalls).toEqual(['m1']);
    expect(client.model.lastOpts.temperature).toBe(0.4);
  });

  test('emits a trailing stopReason event from prediction stats', async () => {
    const client = new FakeClient(new FakeModel({ chunks: ['hi'], stopReason: 'maxPredictedTokensReached' }));
    const streamer = new LMStudioStreamer(client);
    const events = await collectEvents(streamer.stream({
      modelName: 'm1',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.1,
      maxTokens: 64,
    }));
    expect(events.map(e => e.token)).toEqual(['hi', '']);
    expect(events[events.length - 1].stopReason).toBe('maxPredictedTokensReached');
    // maxTokens / contextOverflowPolicy are forwarded to the SDK
    expect(client.model.lastOpts.maxTokens).toBe(64);
    expect(client.model.lastOpts.contextOverflowPolicy).toBe('stopAtLimit');
  });

  test('passes signal through to the SDK', async () => {
    const client = new FakeClient();
    const streamer = new LMStudioStreamer(client);
    const ac = new AbortController();
    await collect(streamer.stream({
      modelName: 'm1',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.2,
      signal: ac.signal,
    }));
    // Same AbortSignal instance, not a copy or wrapper.
    expect(client.model.lastOpts.signal).toBe(ac.signal);
  });

  test('aborting mid-stream stops cleanly without throwing', async () => {
    const client = new FakeClient(new FakeModel({ chunks: ['a', 'b', 'c', 'd'] }));
    const streamer = new LMStudioStreamer(client);
    const ac = new AbortController();
    const out: string[] = [];
    const iter = streamer.stream({
      modelName: 'm1',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.3,
      signal: ac.signal,
    });
    for await (const { token } of iter) {
      out.push(token);
      if (out.length === 2) ac.abort();
    }
    // At most one further event may arrive after the abort is requested.
    expect(out.length).toBeGreaterThanOrEqual(2);
    expect(out.length).toBeLessThanOrEqual(3);
  });

  test('rejects when modelName is empty', async () => {
    const client = new FakeClient();
    const streamer = new LMStudioStreamer(client);
    await expect(collect(streamer.stream({
      modelName: '',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.2,
    }))).rejects.toThrow(/without a model name/i);
  });

  test('mid-stream SDK failure is re-thrown when signal not aborted', async () => {
    const client = new FakeClient(new FakeModel({ chunks: ['a', 'b'], failAfter: 1 }));
    const streamer = new LMStudioStreamer(client);
    await expect(collect(streamer.stream({
      modelName: 'm1',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.2,
    }))).rejects.toThrow(/mid-stream failure/);
  });

  test('mid-stream SDK failure swallowed if signal already aborted', async () => {
    const client = new FakeClient(new FakeModel({ chunks: ['a', 'b'], failAfter: 1 }));
    const streamer = new LMStudioStreamer(client);
    const ac = new AbortController();
    const iter = streamer.stream({
      modelName: 'm1',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.2,
      signal: ac.signal,
    });
    const out: string[] = [];
    // Deliberately no try/catch: if the streamer lets an error escape after the
    // abort, this test must fail rather than silently discard it.
    // NOTE(review): FakeModel's iterator reports `done` once the signal is
    // aborted, before its failAfter check, so the fake itself does not throw
    // on this path.
    for await (const { token } of iter) {
      out.push(token);
      ac.abort(); // abort right after first token, before failure point
    }
    expect(out).toEqual(['a']);
  });

  test('transient "Operation canceled" on handle acquisition is retried with a fresh SDK', async () => {
    // The lifecycle manager's concurrent load for this model got superseded;
    // the SDK coalesced our JIT model() lookup into that aborted load. The
    // first getModelHandle throws — the streamer must recreate the SDK
    // (refresh) and retry rather than crashing the whole turn.
    const client = new FakeClient(new FakeModel({ chunks: ['ok'] }));
    client.handleAcqFailures = [new Error('Failed to acquire LM Studio model handle "m1": Operation canceled.')];
    const streamer = new LMStudioStreamer(client);
    const tokens = await collect(streamer.stream({
      modelName: 'm1',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.2,
    }));
    expect(tokens).toEqual(['ok']);
    expect(client.getModelHandleCalls).toEqual(['m1', 'm1']);
    // First attempt: no refresh. Retry: refresh=true so the SDK is recreated.
    expect(client.getModelHandleOpts[0]).toBeUndefined();
    expect(client.getModelHandleOpts[1]).toEqual({ refresh: true });
  });

  test('non-transient handle acquisition error is thrown without retry', async () => {
    const client = new FakeClient();
    client.handleAcqFailures = [new Error('Failed to acquire LM Studio model handle "m1": model not found')];
    const streamer = new LMStudioStreamer(client);
    await expect(collect(streamer.stream({
      modelName: 'm1',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.2,
    }))).rejects.toThrow(/model not found/);
    expect(client.getModelHandleCalls).toEqual(['m1']); // no retry
  });

  test('handle acquisition failure is swallowed when the user already aborted', async () => {
    const client = new FakeClient();
    client.handleAcqFailures = [new Error('Operation canceled')];
    const streamer = new LMStudioStreamer(client);
    const ac = new AbortController();
    ac.abort(); // aborted before the stream is even started
    const out = await collect(streamer.stream({
      modelName: 'm1',
      messages: [{ role: 'user', content: 'hi' }],
      temperature: 0.2,
      signal: ac.signal,
    }));
    expect(out).toEqual([]);
    expect(client.getModelHandleCalls).toEqual(['m1']); // no retry — genuine cancel
  });

  test('getModelContextLength delegates to the client (and survives a throwing client)', async () => {
    const client = new FakeClient();
    client.contextLength = 8192;
    const streamer = new LMStudioStreamer(client);
    expect(await streamer.getModelContextLength('m1')).toBe(8192);
    expect(await streamer.getModelContextLength('')).toBeUndefined();

    // A throwing client must degrade to undefined, never reject.
    const throwing = new FakeClient();
    throwing.getModelContextLength = async () => { throw new Error('ws down'); };
    const s2 = new LMStudioStreamer(throwing);
    expect(await s2.getModelContextLength('m1')).toBeUndefined();
  });

  test('passes messages through to model.respond', async () => {
    const client = new FakeClient();
    const streamer = new LMStudioStreamer(client);
    const messages = [
      { role: 'system' as const, content: 'sys' },
      { role: 'user' as const, content: 'hi' },
    ];
    await collect(streamer.stream({ modelName: 'm1', messages, temperature: 0.5 }));
    expect(client.model.lastChat).toEqual(messages);
  });
});
|