chore: v2.2.73 — ASTRA-DEBUG 로그 레벨 + webview CSP font-src 보강

- ASTRA-DEBUG 정상 흐름 로그를 console.error → logInfo/console.log 로 강등
  (chatHandlers, extension, slashRouter): DevTools에 ERR로 찍히던 오탐 제거
- sidebar webview에 명시적 CSP meta 추가 + font-src에 data: 허용
  (sidebar.html, sidebarProvider._getHtml): VS Code outer iframe이 codicon.ttf를
  data:font/ttf 로 inject하면서 기본 CSP에 막혀 매 prompt 마다 violation
  경고가 찍히던 문제 해소
- 누적된 LM Studio / agent / 컨텍스트 매니저 / 테스트 갱신 동반

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
g1nation
2026-05-23 15:52:19 +09:00
parent 36db170844
commit 0712014fcb
43 changed files with 2417 additions and 977 deletions
+328 -80
View File
@@ -53,6 +53,7 @@ import {
} from './retrieval/knowledgeMix';
import {
extractVisibleFinal,
stripMarkdownFormatting,
shouldFinalOnlyRetry,
shouldAutoContinue,
looksCutOff,
@@ -73,6 +74,7 @@ import {
estimateModelParamsB,
type ContextLimits,
} from './lib/contextManager';
import { samplingToRestBody, type LmStudioSampling, type ChatStreamStats } from './lmstudio/streamer';
export interface ChatMessage {
role: 'user' | 'assistant' | 'system';
@@ -208,6 +210,10 @@ export class AgentExecutor {
private historyChangeListener: HistoryChangeListener | undefined;
private runSerial = 0;
private activeRunId = 0;
// v2.2.69 — 모드 전환 감지용. handlePrompt 진입 시 현재 mode signature 를 계산해
// 직전 값과 다르면 system prompt 에 "이전 대화에서 ... 모드 전환됨" 한 줄을 끼운다.
// mode signature 는 (agent skill, multiAgent, company mode, 활성 brain) 의 해시.
private _lastModeSignature: string | null = null;
private transactionManager: TransactionManager;
private sessionManager: SessionManager;
private statusBarManager: StatusBarManager;
@@ -369,6 +375,9 @@ export class AgentExecutor {
this.onSessionEnd();
}
this.chatHistory = [];
// v2.2.69 — 새 세션엔 "이전 모드" 가 없음. mode signature 초기화하지 않으면 첫 메시지에서
// 직전 세션의 mode 와 비교돼 잘못된 bridge 가 끼는 회귀가 생긴다.
this._lastModeSignature = null;
this.emitHistoryChanged();
}
@@ -387,6 +396,7 @@ export class AgentExecutor {
this.onSessionEnd();
}
this.chatHistory = [];
this._lastModeSignature = null;
this.emitHistoryChanged();
}
@@ -633,6 +643,39 @@ export class AgentExecutor {
// 제거하고 에이전트 프롬프트를 최후단에 배치하여 절대 우선 적용.
// ──────────────────────────────────────────────────────────────────
const isAgentMode = !!options.agentSkillContext;
// v2.2.69 — 모드 전환 bridge. 현재 mode signature 를 직전 값과 비교해 바뀌었으면
// "이전 대화는 X 모드에서 Y 주제로 진행됨 / 지금부터 Z 모드" 한 줄을 system prompt 에 끼운다.
// chatHistory 자체는 손대지 않으므로 사용자 입장에선 대화가 연속되어 보이면서도
// 모델은 "모드가 바뀐 직후" 임을 인지한다.
let modeBridgeCtx = '';
try {
const agentSkillName = options.agentSkillContext
? (options.agentSkillContext.split('\n')[0] || '').slice(0, 60).replace(/^#\s*/, '').trim()
: '';
const currentSig = this.computeModeSignature({
agentSkillName: agentSkillName || undefined,
companyMode: !!(options as any).companyMode,
multiAgent: !!(options as any).multiAgent,
brainName: getActiveBrainProfile()?.name,
});
if (this._lastModeSignature !== null && this._lastModeSignature !== currentSig) {
const topic = this.buildLastTopicLine();
const bridgeLines = [
'',
'[MODE TRANSITION BRIDGE]',
`이전 모드: ${this._lastModeSignature}`,
`현재 모드: ${currentSig}`,
];
if (topic) bridgeLines.push(`직전 대화 주제(한 줄): ${topic}`);
bridgeLines.push('대화 history 는 그대로 이어진다. 새 모드의 페르소나/포맷을 따르되, 직전까지 사용자가 다루던 맥락을 잊지 말 것.');
modeBridgeCtx = bridgeLines.join('\n');
}
this._lastModeSignature = currentSig;
} catch (e: any) {
logError('Mode-bridge computation failed (non-fatal).', { error: e?.message || String(e) });
}
let fullSystemPrompt: string;
if (isAgentMode) {
@@ -665,7 +708,7 @@ export class AgentExecutor {
// [CONTEXT] … [/CONTEXT] 사이만 컨텍스트 초과 시 trim 대상 — agentBlock(앞)·reminder(뒤)·negative 는 보호.
// memoryCtx(RAG/메모리/lessons)도 [CONTEXT] 안에 넣어 토큰이 빡빡할 때 대화 기록보다 먼저 잘리게 한다.
fullSystemPrompt = `${agentBlock}\n\n${strippedSystemPrompt}${designerCtx}${secondBrainTraceCtx}\n\n[CONTEXT]\n${memoryCtx}\n${knowledgeContextForPrompt}\n${contextBlock}\n[/CONTEXT]\n${negativeCtx}${agentTailReminder}`;
fullSystemPrompt = `${agentBlock}${modeBridgeCtx ? '\n\n' + modeBridgeCtx : ''}\n\n${strippedSystemPrompt}${designerCtx}${secondBrainTraceCtx}\n\n[CONTEXT]\n${memoryCtx}\n${knowledgeContextForPrompt}\n${contextBlock}\n[/CONTEXT]\n${negativeCtx}${agentTailReminder}`;
} else {
// 기존 Astra 모드 (에이전트 미선택)
const localProjectKnowledgeCtx = prompt && localPathContext && this.isProjectKnowledgeCreationRequest(prompt)
@@ -700,7 +743,7 @@ export class AgentExecutor {
})()
: '';
// memoryCtx(RAG/메모리/lessons)는 [CONTEXT] 안에 — 토큰이 빡빡하면 대화 기록보다 먼저 잘림.
fullSystemPrompt = `${systemPrompt}${designerCtx}${projectArchitectureCtx}${localProjectKnowledgeCtx}${thinkingPartnerCtx}${astraStanceCtx}${secondBrainTraceCtx}${v4PolicyCtx}${knowledgeMixCtx}${casualCtx}\n\n[CONTEXT]\n${memoryCtx}\n${knowledgeContextForPrompt}\n${contextBlock}\n[/CONTEXT]\n${negativeCtx}`;
fullSystemPrompt = `${systemPrompt}${modeBridgeCtx ? '\n\n' + modeBridgeCtx : ''}${designerCtx}${projectArchitectureCtx}${localProjectKnowledgeCtx}${thinkingPartnerCtx}${astraStanceCtx}${secondBrainTraceCtx}${v4PolicyCtx}${knowledgeMixCtx}${casualCtx}\n\n[CONTEXT]\n${memoryCtx}\n${knowledgeContextForPrompt}\n${contextBlock}\n[/CONTEXT]\n${negativeCtx}`;
}
// ──────────────────────────────────────────────────────────────────
// [Context Limit Manager] context length 는 "답변을 그만큼 길게 써도 된다"
@@ -768,14 +811,17 @@ export class AgentExecutor {
);
let budgetedHistory: ChatMessage[] = reqMessages;
if (config.autoCompactHistory) {
const trim = trimHistoryToBudget<ChatMessage>(reqMessages, historyBudget, (n) => ({
// v2.2.69 — dropped 메시지를 받아 heuristic 요약을 만든 뒤 한 system 메시지로 prepend.
// 단순 count 마커는 "이전에 무슨 얘기를 했는지" 를 전혀 알려주지 않아 후속 턴에서 모델이
// 맥락을 잃어버리는 회귀를 낳았다. 이제는 U1/A1/U2/A2 골자가 남아 sliding window 가 동작.
const trim = trimHistoryToBudget<ChatMessage>(reqMessages, historyBudget, (_n, dropped) => ({
role: 'system',
content: `[이전 대화 ${n}개 메시지는 컨텍스트 한계 때문에 이번 요청에서 생략되었습니다. 필요하면 사용자에게 다시 확인하세요.]`,
content: this.buildDroppedHistorySummary(dropped),
internal: true,
}));
budgetedHistory = trim.messages;
if (trim.droppedCount > 0) {
logInfo('Conversation history compacted to fit the context window.', {
logInfo('Conversation history compacted to fit the context window (with summary).', {
model: actualModel, droppedCount: trim.droppedCount, historyBudget,
});
}
@@ -864,8 +910,12 @@ export class AgentExecutor {
// policy enforcement) emits a final `streamReplace` so the bubble
// ends up matching the cleaned answer regardless of what slipped
// through live.
const postLiveDeltas = loopDepth === 0;
// [Clean Stream] g1nation.liveStreamTokens=false (기본) 이면 토큰을 내부에만
// 누적하고 sanitize 끝난 최종 답변만 한 번에 표시 → Harmony/think 마커가 잠깐
// 화면에 노출되는 누설을 원천 차단한다. true 로 두면 legacy 라이브 스트리밍.
const postLiveDeltas = loopDepth === 0 && getConfig().liveStreamTokens === true;
let lmStudioStats: ChatStreamStats | undefined;
if (useLmStudioSdk) {
apiUrl = `${ollamaUrl} (sdk)`;
logInfo('Streaming chat via LM Studio SDK.', { model: actualModel });
@@ -876,15 +926,35 @@ export class AgentExecutor {
temperature,
maxTokens: maxOutputTokens,
contextOverflowPolicy: config.contextOverflowPolicy,
...this.lmStudioSamplingFromConfig(),
...this.lmStudioRespondExtrasFromConfig(),
signal: this.abortController.signal,
});
for await (const { token, stopReason } of stream) {
for await (const { token, stopReason, stats } of stream) {
if (this.isStaleRun(runId)) return;
if (token) {
aiResponseText += token;
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
}
if (stopReason) finishStopReason = stopReason;
if (stats) lmStudioStats = stats;
}
if (lmStudioStats && getConfig().lmStudioShowStatsInBudget && loopDepth === 0) {
this.webview.postMessage({
type: 'lmStudioStats',
value: {
model: actualModel,
tokensPerSecond: lmStudioStats.tokensPerSecond,
timeToFirstTokenSec: lmStudioStats.timeToFirstTokenSec,
predictedTokensCount: lmStudioStats.predictedTokensCount,
promptTokensCount: lmStudioStats.promptTokensCount,
totalTimeSec: lmStudioStats.totalTimeSec,
draftModelKey: lmStudioStats.draftModelKey,
draftTokensCount: lmStudioStats.draftTokensCount,
acceptedDraftTokensCount: lmStudioStats.acceptedDraftTokensCount,
stopReason: finishStopReason,
},
});
}
} catch (err: any) {
if (err?.name === 'AbortError' || this.abortController.signal.aborted) {
@@ -1007,60 +1077,34 @@ export class AgentExecutor {
//
// Only attempts recovery on loopDepth === 0 — we don't want to
// ping-pong inside the autonomous action loop.
//
// Note: the previous SDK handle-reset retry that lived here is now done
// inside `LMStudioStreamer.stream()` itself (it auto-recreates the SDK
// on attempt 2 for both dead-handle errors *and* clean-but-empty streams),
// so by the time we get here with `useLmStudioSdk` and no text, the SDK
// path has already tried twice. Go straight to the REST fallback.
if (!aiResponseText.trim() && !this.abortController?.signal.aborted && loopDepth === 0) {
if (useLmStudioSdk && this.options.lmStudioStreamer?.resetHandle) {
try {
logInfo('Empty SDK stream — resetting LM Studio handle and retrying streaming once.', { model: actualModel });
await this.options.lmStudioStreamer.resetHandle(actualModel);
const retryStream = this.options.lmStudioStreamer.stream({
modelName: actualModel,
messages: messagesForRequest.map((m) => ({ role: m.role, content: m.content })),
temperature,
maxTokens: maxOutputTokens,
contextOverflowPolicy: config.contextOverflowPolicy,
signal: this.abortController.signal,
});
let retryText = '';
for await (const { token, stopReason } of retryStream) {
if (this.isStaleRun(runId)) return;
if (token) {
retryText += token;
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
}
if (stopReason) finishStopReason = stopReason;
}
if (retryText.trim()) {
aiResponseText = retryText;
logInfo('Handle-reset retry recovered the answer.', { model: actualModel, length: retryText.length });
}
} catch (retryErr: any) {
logError('Handle-reset retry failed.', { model: actualModel, error: retryErr?.message ?? String(retryErr) });
}
}
if (!aiResponseText.trim() && !this.abortController?.signal.aborted) {
try {
logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
const fallback = await this.callNonStreaming({
baseUrl: ollamaUrl,
modelName: actualModel,
engine,
messages: messagesForRequest,
temperature,
maxTokens: maxOutputTokens,
contextLength: ctxLimits.contextLength,
signal: this.abortController?.signal,
});
if (fallback.stopReason) finishStopReason = fallback.stopReason;
if (fallback.text && fallback.text.trim()) {
aiResponseText = fallback.text;
logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.text.length });
}
} catch (recoverErr: any) {
logError('Non-streaming fallback also failed.', {
engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
});
try {
logInfo('Empty stream — trying non-streaming fallback.', { engine, model: actualModel, apiUrl });
const fallback = await this.callNonStreaming({
baseUrl: ollamaUrl,
modelName: actualModel,
engine,
messages: messagesForRequest,
temperature,
maxTokens: maxOutputTokens,
contextLength: ctxLimits.contextLength,
signal: this.abortController?.signal,
});
if (fallback.stopReason) finishStopReason = fallback.stopReason;
if (fallback.text && fallback.text.trim()) {
aiResponseText = fallback.text;
logInfo('Non-streaming fallback recovered the answer.', { engine, model: actualModel, length: fallback.text.length });
}
} catch (recoverErr: any) {
logError('Non-streaming fallback also failed.', {
engine, model: actualModel, error: recoverErr?.message ?? String(recoverErr),
});
}
}
@@ -1183,7 +1227,12 @@ export class AgentExecutor {
}
if (this.isStaleRun(runId)) return;
}
const cleanedVisible = cleaned.visible;
// [Plain Text Output] outputFormat='plain' (기본)이면 모델이 무심코 내보낸
// 마크다운 마커(`##`, `**`, `> `, `* ` …) 를 후처리로 모두 제거. 라벨 텍스트는 유지.
// markdown 모드면 legacy 그대로 통과.
const cleanedVisible = getConfig().outputFormat === 'plain'
? stripMarkdownFormatting(cleaned.visible)
: cleaned.visible;
// 5. Execute Actions
const rationale = this.parseRationale(cleanedVisible);
@@ -1235,7 +1284,13 @@ export class AgentExecutor {
if (notice && assistantContent.trim()) {
assistantContent = assistantContent.trimEnd() + notice;
}
const finalAssistantContent = assistantContent;
// [Plain Text Output — FINAL pass] enforcer 들이 `## 경로 확인 결과` 같은 하드코딩 헤더를
// 다시 prepend 한 후에도 마커가 남지 않도록, webview / chatHistory 에 들어가는 최종 문자열을
// 한 번 더 sanitize. cleanedVisible 단계의 1차 sanitize 는 model 출력 자체를 정리하고,
// 이 2차 sanitize 는 enforcer 출력까지 모두 청소한다.
const finalAssistantContent = getConfig().outputFormat === 'plain'
? stripMarkdownFormatting(assistantContent)
: assistantContent;
const assistantMessage: ChatMessage = { role: 'assistant', content: finalAssistantContent, internal: false, rationale };
this.chatHistory.push(assistantMessage);
@@ -1470,21 +1525,33 @@ export class AgentExecutor {
: '';
// 워크플로우 매니저에게 설정 기반 실행 위임
const finalReport = await AgentWorkflowManager.runStrictWorkflow(
// [Clean Stream] 단계 진행 메시지는 채팅 본문(streamChunk) 이 아닌 사이드바
// 상단의 workflowStage 인디케이터로만 표시한다 → "생각 단계가 본문에 계속 보임"
// 답답함 제거. 채팅 버블에는 최종 답변만 한 번에 들어간다.
const rawFinalReport = await AgentWorkflowManager.runStrictWorkflow(
prompt,
modelName,
`${brainContext}${selectedAgentContext}${designerContext}`,
signal,
(step, msg) => {
this.webview?.postMessage({ type: 'autoContinue', value: `${step}: ${msg}` });
// 각 단계별 시작을 알림
this.webview?.postMessage({ type: 'streamChunk', value: `\n\n> **[${step}]** ${msg}\n\n` });
this.webview?.postMessage({
type: 'workflowStage',
value: { step, message: msg, done: step === '완료' || step === '오류' }
});
}
);
if (signal.aborted || !this.webview) return;
this.webview.postMessage({ type: 'streamChunk', value: `\n\n--- \n\n${finalReport}` });
// [Plain Text Output] Synthesizer가 잘 따라줬어도 작은 모델은 `##` `**` 를 흘리는 경우가 있어
// 최종 후처리로 한 번 더 마커를 벗긴다. 채팅 history 에도 정제된 결과만 남겨 다음 턴 컨텍스트에서
// 마커가 재학습되는 일을 막는다.
const finalReport = getConfig().outputFormat === 'plain'
? stripMarkdownFormatting(rawFinalReport)
: rawFinalReport;
this.webview.postMessage({ type: 'streamChunk', value: finalReport });
this.webview.postMessage({ type: 'workflowStage', value: { step: '완료', message: '', done: true } });
this.webview.postMessage({ type: 'streamEnd' });
this.chatHistory.push({ role: 'assistant', content: finalReport });
@@ -1494,6 +1561,8 @@ export class AgentExecutor {
this.webview.postMessage({ type: 'autoContinue', value: '✅ 모든 분석이 성공적으로 완료되었습니다.' });
} catch (error: any) {
// 어떤 종료 경로에서든 stage indicator 는 반드시 닫는다 — 안 닫으면 사이드바에 영원히 "③ 자기 검증..." 가 남는다.
this.webview?.postMessage({ type: 'workflowStage', value: { step: '완료', message: '', done: true } });
if (error.name === 'AbortError' || error.message?.includes('cancelled')) {
this.statusBarManager.updateStatus(AgentStatus.Idle, 'Workflow Cancelled');
return;
@@ -1537,10 +1606,23 @@ export class AgentExecutor {
temperature: 0.3,
maxTokens: subMaxTokens,
contextOverflowPolicy,
...this.lmStudioSamplingFromConfig(),
...this.lmStudioRespondExtrasFromConfig(),
signal: this.abortController?.signal,
});
for await (const { token } of stream) {
let subStopReason: string | undefined;
for await (const { token, stopReason } of stream) {
if (token) responseText += token;
if (stopReason) subStopReason = stopReason;
}
// Sub-agent answers that got cut mid-sentence corrupt the pipeline silently
// (Planner produces a half-step, Writer can't recover). Log the stop reason (via
// logError) so the operator can raise subMaxTokens or pick a less aggressive output budget.
if (subStopReason && /maxPredicted|context|truncat/i.test(subStopReason)) {
logError('Sub-agent answer hit a generation limit.', {
role, model: modelName, stopReason: subStopReason,
chars: responseText.length, maxTokens: subMaxTokens,
});
}
return responseText;
} catch (err: any) {
@@ -1726,12 +1808,13 @@ export class AgentExecutor {
return [
'Intent operating contract — Code Review:',
'The user wants a real review, not a meta-plan of how to review.',
'Required sections in this exact order, in Korean:',
' 1. ## 한 줄 판단 — one sentence: would you rely on this today, and under what constraint?',
' 2. ## 잘된 점 — 2~4 concrete strengths. Each MUST cite a specific file path (and a function or section if you can name one) and explain WHY it works, not just that it exists.',
' 3. ## 부족한 점 — 2~4 concrete weaknesses or risks. Same rule: cite a specific file/area, name the actual problem (race condition, missing retry, coupling, etc.), and say what breaks because of it.',
' 4. ## 사용자 관점 개선 — 2~4 changes phrased from the END USER\'s perspective ("when X happens, the user currently sees Y; they should see Z"). Tie each to a code location that needs to change.',
' 5. ## 다음 한 수 — exactly one next action, small enough to do this week.',
'OUTPUT FORMAT: PLAIN TEXT only. Section labels are bare words on their own line (no "#", "##", "**", "__", "> "). Bullets use "- ". Long answers MUST start with a "핵심 요약" block (2~4 bullets) before any detail.',
'Required sections in this exact order, in Korean (each label appears as a plain line, NOT a markdown heading):',
' 1) 한 줄 판단 — one sentence: would you rely on this today, and under what constraint?',
' 2) 잘된 점 — 2~4 concrete strengths. Each MUST cite a specific file path (and a function or section if you can name one) and explain WHY it works, not just that it exists.',
' 3) 부족한 점 — 2~4 concrete weaknesses or risks. Same rule: cite a specific file/area, name the actual problem (race condition, missing retry, coupling, etc.), and say what breaks because of it.',
' 4) 사용자 관점 개선 — 2~4 changes phrased from the END USER\'s perspective ("when X happens, the user currently sees Y; they should see Z"). Tie each to a code location that needs to change.',
' 5) 다음 한 수 — exactly one next action, small enough to do this week.',
'',
'Hard rules — these are the things that made past reviews feel like a template:',
'- Do NOT write meta-sentences like "확인해야 합니다", "다음 리뷰에서는 ~를 보면 됩니다", "~로 보입니다", "~인지 확인하는 것이 핵심입니다". Either you observed it or you read the file with <read_file> right now.',
@@ -1998,12 +2081,53 @@ export class AgentExecutor {
return false;
}
const complexByShape = prompt.length > 180 || /(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|roadmap|research|report|deep\s*analysis|strategy|proposal)/i.test(prompt);
if (!complexByShape) {
const cfg = getConfig();
const mode = cfg.workflowMultiAgentMode || 'auto';
// 'off' → 기존 키워드/길이 휴리스틱만 사용 (legacy multiAgentEnabled 토글 존중).
if (mode === 'off') {
const legacyComplex = prompt.length > 180 || /(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|roadmap|research|report|deep\s*analysis|strategy|proposal)/i.test(prompt);
if (!legacyComplex) return false;
return configEnabled || /(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|research|report|deep\s*analysis|strategy|proposal)/i.test(prompt);
}
// 인사·잡담은 5단계 파이프라인 낭비. 짧은 casual prompt 는 제외.
if (this.isCasualConversationPrompt(prompt)) {
return false;
}
if (prompt.trim().length < 12) {
return false;
}
return configEnabled || /(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|research|report|deep\s*analysis|strategy|proposal)/i.test(prompt);
// 'always' → 위 가드만 통과하면 무조건 발동.
if (mode === 'always') return true;
// 'auto' → 다음 중 하나라도 만족하면 발동:
// (1) 사용자가 multiAgentEnabled 를 명시적으로 켰다,
// (2) 작은 모델 (≤4B params) 이라 한 번에 처리하기 위험,
// (3) prompt 토큰이 효과적 context window 의 임계 이상을 차지한다,
// (4) "보고서/리뷰/심층 분석" 같은 명백한 복합 작업 키워드 매치,
// (5) prompt 길이 자체가 큼 (>240 chars).
if (configEnabled) return true;
const paramB = estimateModelParamsB(cfg.defaultModel);
if (paramB !== null && paramB <= 4) return true;
try {
const effectiveCtx = cfg.smallModelContextCap > 0 && paramB !== null && paramB <= 4
? cfg.smallModelContextCap
: cfg.contextLength;
const promptTokens = estimateTokens(prompt);
const threshold = Math.floor(effectiveCtx * cfg.workflowAutoCtxFractionThreshold);
if (promptTokens >= threshold) return true;
} catch { /* 안전한 폴백: 키워드/길이 체크로 진행 */ }
if (/(보고서|심층|종합\s*분석|리서치|조사|전략\s*수립|기획안|제안서|코드\s*리뷰|리뷰|아키텍처|architecture|research|report|deep\s*analysis|strategy|proposal|review)/i.test(prompt)) {
return true;
}
if (prompt.length > 240) return true;
return false;
}
private buildAstraModeArchitectureContext(prompt: string): string {
@@ -2129,6 +2253,78 @@ export class AgentExecutor {
}
}
/**
 * v2.2.69 — Condenses the messages removed by the sliding window into a short
 * plain-text digest. No extra LLM call is made; the digest is purely heuristic.
 *
 * Messages are visited in array order. Internal messages and messages whose
 * content is not a non-blank string are ignored. For the rest:
 *   - a user message produces `U<k>: <leading sentence>`, where k counts the
 *     user messages kept so far (starting at 1);
 *   - an assistant message produces `A<k>: <leading sentence>`, where k is the
 *     current user count (0 if no user message has been seen yet);
 *   - any other role produces nothing.
 *
 * If more than 8 lines result, only the newest 8 are kept and one extra line
 * reports how many older lines were left out.
 *
 * @param dropped Messages that were cut from the request history.
 * @returns An empty string when `dropped` is empty; otherwise a header line
 *          (stating `dropped.length`) followed by the digest lines.
 */
private buildDroppedHistorySummary(dropped: ChatMessage[]): string {
    if (dropped.length === 0) {
        return '';
    }

    const MAX_LINES = 8;
    const header = `[이전 대화 요약 — 총 ${dropped.length}개 메시지가 컨텍스트 한계로 생략됨]`;

    // Reduces a message to its leading sentence: heading markers, `**`, and fenced
    // code are removed or replaced, whitespace is collapsed, then at most 140
    // non-terminator characters plus one optional terminator are taken.
    // Because whitespace (newlines included) is collapsed before matching,
    // a line break never ends the sentence — only . ! ? 。 do.
    const leadingSentence = (raw: string): string => {
        let text = String(raw || '');
        text = text.replace(/^\s{0,3}#{1,6}\s+/gm, '');
        text = text.replace(/\*\*/g, '');
        text = text.replace(/`{3}[\s\S]*?`{3}/g, '[code]');
        text = text.replace(/\s+/g, ' ').trim();

        const hit = /^[^.!?。\n]{1,140}[.!?。]?/.exec(text);
        // No match happens when the text is empty or starts with a terminator.
        const picked = hit === null ? text.slice(0, 140) : hit[0];
        return picked.trim();
    };

    const digest: string[] = [];
    let userCount = 0;
    for (const entry of dropped) {
        if (entry.internal) {
            continue;
        }
        const body = typeof entry.content === 'string' ? entry.content : '';
        if (body.trim() === '') {
            continue;
        }
        switch (entry.role) {
            case 'user':
                userCount += 1;
                digest.push(`U${userCount}: ${leadingSentence(body)}`);
                break;
            case 'assistant':
                digest.push(`A${userCount}: ${leadingSentence(body)}`);
                break;
            default:
                break;
        }
    }

    // Keep only the newest MAX_LINES lines; report the number of older ones dropped.
    const overflow = digest.length - MAX_LINES;
    if (overflow <= 0) {
        return `${header}\n${digest.join('\n')}`;
    }
    const newest = digest.slice(-MAX_LINES);
    return `${header}\n(더 오래된 ${overflow}개 턴 생략됨)\n${newest.join('\n')}`;
}
/**
 * v2.2.69 — Builds the mode signature string for the current request.
 *
 * The signature has the fixed shape
 * `agent=<name|none>|company=<on|off>|multi=<on|off>|brain=<name|?>`.
 * Callers can compare it with the previous request's signature to decide
 * whether a "previous mode / current mode" bridge belongs in the system prompt.
 *
 * @param opts.agentSkillName Agent skill name; a missing or empty value is rendered as `none`.
 * @param opts.companyMode    Rendered as `on` when truthy, otherwise `off`.
 * @param opts.multiAgent     Rendered as `on` when truthy, otherwise `off`.
 * @param opts.brainName      Active brain name; a missing or empty value is rendered as `?`.
 * @returns The four `key=value` parts joined with `|`.
 */
private computeModeSignature(opts: { agentSkillName?: string; companyMode?: boolean; multiAgent?: boolean; brainName?: string }): string {
    const agent = opts.agentSkillName || 'none';
    const company = opts.companyMode ? 'on' : 'off';
    const multi = opts.multiAgent ? 'on' : 'off';
    const brain = opts.brainName || '?';
    return `agent=${agent}|company=${company}|multi=${multi}|brain=${brain}`;
}
/**
 * v2.2.69 — Produces a one-line "previous context" snippet for the mode-transition
 * bridge, taken from the most recent non-internal user message in `chatHistory`.
 *
 * @returns The message content with whitespace runs collapsed to single spaces,
 *          trimmed, and cut to 120 characters; an empty string when there is no
 *          such message or its content is not a string.
 */
private buildLastTopicLine(): string {
    // Single forward pass: remember the latest non-internal user message seen.
    let lastUser: ChatMessage | undefined;
    for (const entry of this.chatHistory) {
        if (!entry.internal && entry.role === 'user') {
            lastUser = entry;
        }
    }

    if (lastUser === undefined || typeof lastUser.content !== 'string') {
        return '';
    }

    const flattened = lastUser.content.replace(/\s+/g, ' ').trim();
    return flattened.slice(0, 120);
}
private buildRequestHistory(history: ChatMessage[]): ChatMessage[] {
return history.map((message) => {
if (message.role !== 'assistant' || typeof message.content !== 'string') {
@@ -2957,17 +3153,23 @@ export class AgentExecutor {
// 같은 엔진 내에서만 model candidate / message variant retry
for (const candidateModel of modelCandidates) {
for (const variant of messageVariants) {
const sampling = samplingToRestBody(this.lmStudioSamplingFromConfig());
const streamBody = {
model: candidateModel,
messages: variant.messages,
stream: true,
...(engine === 'lmstudio'
? { max_tokens: maxTokens, temperature }
: { options: { num_ctx: numCtx, num_predict: maxTokens, temperature } }),
// LM Studio's OpenAI-compatible REST extends the schema with top_k/min_p/
// repeat_penalty (same names as Ollama). Spread the shared sampling block so
// the REST fallback matches the SDK path — without it a fallback after a
// dead handle quietly loses the glitch-suppression preset.
? { max_tokens: maxTokens, temperature, ...sampling }
: { options: { num_ctx: numCtx, num_predict: maxTokens, temperature, ...sampling } }),
};
// 일시적 네트워크 오류용 retry (최대 2회, 지수 backoff)
const MAX_RETRIES = 2;
let serviceDown = false;
for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
try {
if (attempt > 0) {
@@ -3013,13 +3215,33 @@ export class AgentExecutor {
if (lastError.name === 'AbortError') {
throw lastError;
}
// ECONNREFUSED / DNS-level failures mean the engine process isn't even
// listening — no amount of retries or message-variant juggling will help.
// Abandon the candidate/variant loops now and surface the "is X running?"
// error fast instead of burning 12 fetch attempts before giving up.
const errCode = (error?.cause?.code ?? error?.code ?? '').toString();
const errMsg = lastError.message;
if (
errCode === 'ECONNREFUSED' || errCode === 'ENOTFOUND' || errCode === 'EAI_AGAIN'
|| /ECONNREFUSED|ENOTFOUND|getaddrinfo|fetch failed/i.test(errMsg)
) {
serviceDown = true;
logError('AI streaming request: engine appears to be down.', {
engine, apiUrl, code: errCode, error: errMsg,
});
break; // exit retry loop
}
logError('AI streaming request failed.', {
engine, variant: variant.name, apiUrl, model: candidateModel,
attempt, error: lastError.message
});
}
}
if (serviceDown) break; // skip remaining variants
}
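// A down engine should also end the model-candidate loop — there is no candidate /
// variant the engine can answer if it isn't listening at all.
// NOTE(review): `serviceDown` is declared inside the variant loop, so this check
// re-tests lastError.message instead of the flag. Its pattern is narrower than the
// detection above: failures recognised only by error code (e.g. EAI_AGAIN) or by
// "getaddrinfo" in the message will not break out here — confirm whether that is intended.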
if (lastError && /ECONNREFUSED|ENOTFOUND|fetch failed/i.test(lastError.message)) break;
}
// 명확한 에러 메시지: 어느 엔진이 실패했는지 사용자에게 알림
@@ -3151,13 +3373,14 @@ export class AgentExecutor {
const numCtx = Math.max(2048, params.contextLength ?? 32768);
const apiUrl = buildApiUrl(baseUrl, engine, 'chat');
const variants = this.buildEngineMessageVariants(messages, engine);
const sampling = samplingToRestBody(this.lmStudioSamplingFromConfig());
const body = {
model: modelName,
messages: variants[0].messages,
stream: false,
...(engine === 'lmstudio'
? { max_tokens: maxTokens, temperature }
: { options: { num_ctx: numCtx, num_predict: maxTokens, temperature } }),
? { max_tokens: maxTokens, temperature, ...sampling }
: { options: { num_ctx: numCtx, num_predict: maxTokens, temperature, ...sampling } }),
};
const response = await fetch(apiUrl, {
method: 'POST',
@@ -3231,6 +3454,8 @@ export class AgentExecutor {
temperature: params.temperature,
maxTokens: params.maxTokens,
contextOverflowPolicy: params.contextOverflowPolicy,
...this.lmStudioSamplingFromConfig(),
...this.lmStudioRespondExtrasFromConfig(),
signal: params.signal,
});
for await (const { token, stopReason } of stream) {
@@ -3356,6 +3581,29 @@ export class AgentExecutor {
];
}
/**
 * Reads the shared LM Studio sampling settings from the current config.
 *
 * The returned block is spread into the SDK streamer calls and, after conversion
 * with `samplingToRestBody`, into the REST request bodies (for Ollama inside
 * `options`), so both paths are driven by the same sampling values.
 * Per the original author's note, the point is that a REST fallback after a dead
 * SDK handle should not silently lose the glitch-suppression preset
 * (e.g. garbled Korean tokens).
 *
 * @returns `topP`, `topK`, `minP` and `repeatPenalty` copied from the
 *          `lmStudioTopP`, `lmStudioTopK`, `lmStudioMinP` and
 *          `lmStudioRepeatPenalty` config fields.
 */
private lmStudioSamplingFromConfig(): LmStudioSampling {
    const {
        lmStudioTopP: topP,
        lmStudioTopK: topK,
        lmStudioMinP: minP,
        lmStudioRepeatPenalty: repeatPenalty,
    } = getConfig();
    return { topP, topK, minP, repeatPenalty };
}
/**
 * Extra options that are passed only to the SDK streamer calls, not to REST bodies.
 * Currently this is just the draft model (used for speculative decoding, per the
 * original note).
 *
 * @returns `{ draftModel }` when the `lmStudioDraftModel` config value is truthy,
 *          otherwise an empty object so spreading it adds nothing.
 */
private lmStudioRespondExtrasFromConfig(): { draftModel?: string } {
    const draftModel = getConfig().lmStudioDraftModel;
    if (!draftModel) {
        return {};
    }
    return { draftModel };
}
private buildModelCandidates(modelName: string, engine: 'lmstudio' | 'ollama'): string[] {
const candidates = [modelName];
if (engine === 'lmstudio') {
+13 -7
View File
@@ -1,4 +1,4 @@
import { PlannerAgent, ResearcherAgent, ReflectorAgent, WriterAgent } from './factory';
import { PlannerAgent, ResearcherAgent, ReflectorAgent, WriterAgent, SynthesizerAgent } from './factory';
import { AgentEngine, PipelineStage, AgentExecuteOptions } from '../lib/engine';
import { getConfig } from '../config';
@@ -17,9 +17,13 @@ export class AgentWorkflowManager {
const researcher = new ResearcherAgent(modelName);
const writer = new WriterAgent(modelName);
// [Self-Reflection] 설정으로 비활성화하지 않은 경우에만 Reflector를 주입.
const enableReflection = getConfig().enableReflection !== false;
const cfg = getConfig();
const enableReflection = cfg.enableReflection !== false;
const reflector = enableReflection ? new ReflectorAgent(modelName) : undefined;
const engine = new AgentEngine(planner, researcher, writer, reflector);
// [5-stage pipeline] 최종 합성 단계. 설정으로 끄지 않은 한 항상 주입.
const enableSynth = cfg.workflowSynthesizerEnabled !== false;
const synthesizer = enableSynth ? new SynthesizerAgent(modelName) : undefined;
const engine = new AgentEngine(planner, researcher, writer, reflector, synthesizer);
const missionId = `mission_${Date.now()}`;
const runOptions: AgentExecuteOptions = {
@@ -46,12 +50,14 @@ export class AgentWorkflowManager {
}
private static mapStageToUI(stage: PipelineStage): string {
// 사용자가 보는 라벨은 한국어 + 단계 번호로 통일. 5단계 파이프라인이 명확하게 드러나도록.
const maps: Record<PipelineStage, string> = {
idle: '대기',
planner: 'Planner',
researcher: 'Researcher',
reflector: 'Reflector',
writer: 'Writer',
planner: '① 계획',
researcher: '② 자료 수집',
reflector: '③ 자기 검증',
writer: '④ 초안 작성',
synthesizer: '⑤ 최종 정리',
completed: '완료',
error: '오류'
};
+63 -9
View File
@@ -134,13 +134,17 @@ Your mission is to extract, filter, and synthesize critical data based on a stra
}
export class WriterAgent extends BaseAgent {
private readonly persona = `You are the [Lead Synthesis Writer & Editor].
Your goal is to produce a state-of-the-art final report that wows the user.
- TONE: Authoritative yet accessible. Professional developer/consultant style.
- STRUCTURE: Use an executive summary, detailed analysis sections, and a "Final Recommendation" block.
- LANGUAGE: Always respond in the user's language (KOREAN).
- POLISHING: Ensure logical flow between sections. Make it look like a premium report.
- SELF-CORRECTION: When a [REFLECTION CRITIQUE] block is provided, you MUST address each listed gap, contradiction, or missing-evidence item explicitly before producing the final report. Do not silently ignore the critique.`;
// [5-stage pipeline] Writer는 이제 "Drafter" 역할: 빠르게 1차 초안만 생성한다.
// 최종 다듬기/요약/critique 반영은 후속 SynthesizerAgent가 담당하므로,
// 작은 모델이 한 번에 모든 것을 끝내려 컨텍스트를 폭주시키는 일이 없도록 한다.
private readonly persona = `You are the [Section Drafter].
Your goal is to produce a STRUCTURED FIRST DRAFT that the downstream Synthesizer will polish.
- SCOPE: Cover each major topic from the research as its own section. Each section starts with a short plain-text label on its own line (e.g. "잘된 점", "부족한 점") — NO "#", "##", "**", "__", ">" markers. Use "- " for bullets, never "* ".
- DENSITY: Pack facts; skip flowery prose, executive summaries, and closing remarks (the Synthesizer adds those).
- TONE: Plain, factual, developer-readable Korean.
- BREVITY: Keep each section tight — better to leave the Synthesizer something to merge than to run out of tokens mid-section.
- SELF-CORRECTION: When a [REFLECTION CRITIQUE] block is provided, address each listed gap inline in the relevant section. Do not silently ignore the critique.
- LANGUAGE: KOREAN.`;
async execute(input: string, originalRequest?: string, signal?: AbortSignal, options?: AgentExecuteOptions): Promise<string> {
// [Astra v4.0] Advisor 모드 처리
@@ -163,11 +167,61 @@ Analyze the provided report and suggest 3 high-impact next actions for the user.
? `\n5. [REFLECTION CRITIQUE — must be addressed]:\n${reflection.length > 4000 ? reflection.substring(0, 4000) + '... [Critique Trimmed]' : reflection}`
: '';
const wrappedInput = `### SYSTEM INSTRUCTION: FINAL SYNTHESIS
const wrappedInput = `### SYSTEM INSTRUCTION: SECTIONED DRAFT
1. Gathered Research Data: ${trimmedData}
2. User's Original Objective: ${originalRequest}
3. Applied Knowledge & Filtering Policy: ${policy}
4. Mission: Write the definitive final report in KOREAN.${reflectionBlock}`;
4. Mission: Produce a STRUCTURED FIRST DRAFT in KOREAN — section per topic, factual bullets allowed.
Do NOT add a final executive summary or closing remarks; the Synthesizer will handle those.${reflectionBlock}`;
return this.callLLM(this.persona, wrappedInput, signal);
}
}
/**
 * [5-stage pipeline] SynthesizerAgent
 *
 * Takes the first draft written by the Drafter and turns it into the final user-facing answer.
 * - The input is an already-organised draft, so the context stays small enough for a small
 *   local model to handle in one pass.
 * - Its job is (a) a one-line opening, (b) tidying the section flow, (c) a closing
 *   conclusion/recommendation. It must not invent new facts.
 * - When a Reflector critique is supplied, the agent re-checks that each critique item is
 *   actually reflected in the answer.
 */
export class SynthesizerAgent extends BaseAgent {
    /** System persona sent with every synthesis call. */
    private readonly persona = `You are the [Final Editor & Synthesizer].
You receive a structured FIRST DRAFT (already broken into sections) plus the user's original request and (optionally) a reflection critique.
Your only job is to produce the FINAL user-facing answer.
[OUTPUT FORMAT — 7 hard rules — these override every other formatting habit]
R1. CONCLUSION FIRST. The very first sentence is the conclusion / verdict / recommendation. No greeting, no "분석해보겠습니다", no scene-setting paragraph, no "핵심 요약" label line on top. Just the conclusion as sentence 1. A reader who stops after sentence 1 must know what you decided.
R2. AT MOST 3 SECTIONS. Total. A section = a label line + body, or a clearly separated numbered group. If the answer fits without sections, use none. Three is the ceiling, not a target.
R3. NO REPETITION. Each sentence carries new information. If you said it in the conclusion, do NOT restate it in a later section.
R4. BOLD ≤ 3 INSTANCES. Across the entire answer, use bold at most 3 times — reserve it for truly load-bearing words (file name, verdict word, hard number). Most answers should have zero.
R5. JUDGE WITHOUT ASKING. If a defensible decision is reachable from the draft + original request, deliver it and act. Do NOT ask permission, do NOT bounce the question back.
R6. ASK ONE QUESTION ONLY WHEN: (a) the path forks into two materially different directions and user intent is unknown, OR (b) the next step is irreversible (delete, force-push, drop table, overwrite uncommitted work, send external message). One plain sentence on its own line at the end. No "핵심 확인 질문" label, no "질문 의도", no follow-ups.
R7. GUESS-AND-ACT WITH STATED ASSUMPTION. If a detail is missing but a reasonable guess exists, guess and act, declaring the assumption in one line prefixed "가정:".
[PLAIN TEXT]
- NEVER emit "#", "##", "###", "__", "> " markers. Section labels are plain text on their own line.
- Bullets: "- " only. No "* " / "• ".
- No tables. No HTML.
- Inline code with backticks is OK (e.g. \`src/agent.ts\`). Triple-backtick code blocks only for actual code.
[CONTENT]
- Preserve every factual claim from the draft. Do NOT invent new facts, do NOT add hidden reasoning, do NOT write meta-commentary.
- DO NOT EMIT: <think>, <analysis>, <|channel|> markers, "Thinking Process:", planning notes, or any hidden reasoning.
- If a [REFLECTION CRITIQUE] is provided, verify each item is addressed. If something is missing, say so explicitly rather than fabricating coverage.
- LANGUAGE: KOREAN. Tone: direct, technical, developer-friendly.`;

    /**
     * Polish a Drafter draft into the final answer.
     *
     * @param input Draft text; anything past 12000 characters is cut and marked as trimmed.
     * @param originalRequest The user's original request; "(unavailable)" is substituted when empty.
     * @param signal Abort signal forwarded to the LLM call.
     * @param options Reads `priorResults.reflection` (optional critique, capped at 3000 characters).
     * @returns Whatever `callLLM` resolves to for the assembled prompt.
     */
    async execute(input: string, originalRequest?: string, signal?: AbortSignal, options?: AgentExecuteOptions): Promise<string> {
        const DRAFT_LIMIT = 12000;
        const CRITIQUE_LIMIT = 3000;

        // Cap the draft so the synthesis prompt stays small.
        let draft = input;
        if (input.length > DRAFT_LIMIT) {
            draft = input.substring(0, DRAFT_LIMIT) + '... [Draft Trimmed]';
        }

        // The critique section is appended only when a non-blank reflection was produced upstream.
        const reflection = options?.priorResults?.reflection;
        let reflectionBlock = '';
        if (reflection && reflection.trim().length > 0) {
            const critique = reflection.length > CRITIQUE_LIMIT
                ? reflection.substring(0, CRITIQUE_LIMIT) + '... [Critique Trimmed]'
                : reflection;
            reflectionBlock = `\n4. [REFLECTION CRITIQUE — verify the draft addresses each item]:\n${critique}`;
        }

        const wrappedInput = `### SYSTEM INSTRUCTION: FINAL SYNTHESIS
1. User's Original Request: ${originalRequest || '(unavailable)'}
2. Structured Draft (from Drafter — your input to polish):
${draft}
3. Mission: Produce the FINAL user-facing answer in KOREAN. Do not restart from scratch — polish, smooth, and conclude.${reflectionBlock}`;

        return this.callLLM(this.persona, wrappedInput, signal);
    }
}
+93
View File
@@ -143,6 +143,65 @@ export interface IAgentConfig {
* 누적됨. false면 critique은 그 미션 한정으로만 사용되고 사라짐.
*/
autoLessonFromReflection: boolean;
// ─── 5-stage workflow (Drafter + Synthesizer) ───
/** Drafter(=Writer) 출력 뒤에 SynthesizerAgent로 최종 다듬기 패스를 한 번 더 돌릴지. 기본 true. */
workflowSynthesizerEnabled: boolean;
/**
* Multi-Agent 발동 모드:
* - 'auto' (기본): 작은 모델(≤4B) 감지 OR prompt가 컨텍스트의 큰 비중을 차지할 때만 자동 발동.
* - 'always': 인사·짧은 잡담을 제외한 모든 요청에 5단계 파이프라인 사용.
* - 'off': 기존 single-agent 동작 (수동 토글 / 키워드 매칭만 사용).
*/
workflowMultiAgentMode: 'auto' | 'always' | 'off';
/**
* 'auto' 모드에서 prompt + brain context 토큰이 contextLength 의 이 비율(0~1)을 넘으면 강제 5단계.
* 기본 0.30 — 작은 모델이 30% 이상을 input으로 먹기 시작하면 한 번에 끝내려는 시도가 위험.
*/
workflowAutoCtxFractionThreshold: number;
// ─── Stream 표시 ───
/**
* 모델 토큰을 받는 즉시 채팅 버블에 흘려보낼지 여부.
* - false(기본): 토큰은 내부에서만 누적, sanitize 끝난 최종 답변만 한 번에 표시 → Harmony/think 마커 누설 원천 차단.
* - true: legacy 라이브 스트리밍. 모델 출력에 control token 이 섞여 나오면 잠깐 화면에 보일 수 있음.
*/
liveStreamTokens: boolean;
/**
* 최종 답변 포맷.
* - 'plain' (기본): 모델이 무심코 내보낸 `##`, `**`, `__`, `> `, `* ` 등의 마크다운 마커를 후처리로 모두 제거.
* 섹션 라벨 텍스트(예: "핵심 요약")는 유지되지만 헤더 마커는 사라져 깔끔한 plain text 로 표시.
* - 'markdown': legacy 동작. 모델 출력을 그대로 렌더러에 넘김.
*/
outputFormat: 'plain' | 'markdown';
/**
* 자동 기록 (project chronicle auto-record). true 면 매 prompt 후 의미 있는 turn 을
* Wiki/Chronicle 폴더에 자동으로 저장. false 면 자동 저장 OFF (수동 기록은 계속 가능).
* 사이드바 도구 드롭다운의 토글 항목으로 즉시 변경 가능.
*/
chronicleAutoRecord: boolean;
// ─── LM Studio sampling (applied to both SDK and REST paths) ───
/** LM Studio nucleus sampling cutoff (0~1). Lower tightens; 1 disables. */
lmStudioTopP: number;
/** LM Studio top-K cutoff (0 disables). */
lmStudioTopK: number;
/** LM Studio min-P floor (0~1, 0 disables). */
lmStudioMinP: number;
/** LM Studio repeat penalty (1 disables, 1.05~1.2 typical). */
lmStudioRepeatPenalty: number;
/** Render tok/s + TTFT from prediction stats into context-budget badge. */
lmStudioShowStatsInBudget: boolean;
/** LM Studio model key of a small draft model for speculative decoding ('' = disabled). */
lmStudioDraftModel: string;
/** Load-time options. Read once per load(); changing these after load needs a reload. */
lmStudioLoad: {
flashAttention: boolean;
/** "max" | "off" | number 0-1 */
gpuOffloadRatio: 'max' | 'off' | number;
offloadKVCacheToGpu: boolean;
keepModelInMemory: boolean;
useFp16ForKVCache: boolean;
/** 0 = engine default */
evalBatchSize: number;
};
}
// ─── 경로 정규화 유틸리티 ───
@@ -245,6 +304,40 @@ export function getConfig(): IAgentConfig {
companyPixelOfficeBubbles: cfg.get<boolean>('company.pixelOffice.bubbles', true),
enableReflection: cfg.get<boolean>('enableReflection', true),
autoLessonFromReflection: cfg.get<boolean>('autoLessonFromReflection', true),
workflowSynthesizerEnabled: cfg.get<boolean>('workflow.synthesizerEnabled', true),
workflowMultiAgentMode: ((): 'auto' | 'always' | 'off' => {
const v = (cfg.get<string>('workflow.multiAgentMode', 'auto') || 'auto').trim().toLowerCase();
return v === 'always' || v === 'off' ? v : 'auto';
})(),
workflowAutoCtxFractionThreshold: Math.max(0.05, Math.min(0.95,
cfg.get<number>('workflow.autoCtxFractionThreshold', 0.30)
)),
liveStreamTokens: cfg.get<boolean>('liveStreamTokens', true),
outputFormat: ((): 'plain' | 'markdown' => {
const v = (cfg.get<string>('outputFormat', 'plain') || 'plain').trim().toLowerCase();
return v === 'markdown' ? 'markdown' : 'plain';
})(),
chronicleAutoRecord: cfg.get<boolean>('chronicleAutoRecord', true),
lmStudioTopP: Math.max(0, Math.min(1, cfg.get<number>('lmStudio.sampling.topP', 0.9))),
lmStudioTopK: Math.max(0, cfg.get<number>('lmStudio.sampling.topK', 20)),
lmStudioMinP: Math.max(0, Math.min(1, cfg.get<number>('lmStudio.sampling.minP', 0.05))),
lmStudioRepeatPenalty: Math.max(1, Math.min(2, cfg.get<number>('lmStudio.sampling.repeatPenalty', 1.1))),
lmStudioShowStatsInBudget: cfg.get<boolean>('lmStudio.statsInBudget', true),
lmStudioDraftModel: (cfg.get<string>('lmStudio.draftModel', '') || '').trim(),
lmStudioLoad: {
flashAttention: cfg.get<boolean>('lmStudio.load.flashAttention', true),
gpuOffloadRatio: ((): 'max' | 'off' | number => {
const raw = (cfg.get<string>('lmStudio.load.gpuOffloadRatio', 'max') || 'max').trim().toLowerCase();
if (raw === 'max' || raw === 'off') return raw;
const n = Number(raw);
if (Number.isFinite(n)) return Math.max(0, Math.min(1, n));
return 'max';
})(),
offloadKVCacheToGpu: cfg.get<boolean>('lmStudio.load.offloadKVCacheToGpu', true),
keepModelInMemory: cfg.get<boolean>('lmStudio.load.keepModelInMemory', true),
useFp16ForKVCache: cfg.get<boolean>('lmStudio.load.useFp16ForKVCache', false),
evalBatchSize: Math.max(0, cfg.get<number>('lmStudio.load.evalBatchSize', 0)),
},
};
}
+68
View File
@@ -223,3 +223,71 @@ export function mergeContinuationParts(prev: string, next: string): string {
/** Rough token count of a string — re-exported helper so callers don't need contextManager directly. */
export const countTokens = estimateTokens;
/**
 * Plain-text output hygiene.
 *
 * The user wants clean plain-text answers with no Markdown rendering. Models tend to emit
 * `##`, `**`, `> `, `* ` out of habit, and those markers would show up verbatim on screen,
 * so they are stripped once more right before the final answer is displayed.
 *
 * Preserved:
 *  - fenced code blocks (nothing between ``` fences is touched)
 *  - inline code spans (backticks kept)
 *  - numbered lists such as `1. ` / `1) `
 *  - leading-dash bullets `- `
 *
 * Removed / converted:
 *  - leading `#`..`######` followed by whitespace → marker removed, label text kept
 *  - `**bold**` / `__bold__` → bold
 *  - single `*text*` emphasis → text (only when delimited by whitespace/punctuation, so
 *    expressions like `a*b*c` are left alone)
 *  - leading `> ` blockquote marker → removed (one level)
 *  - leading `* ` / `+ ` bullets → `- `
 *  - runs of three or more newlines → a single blank line
 *
 * Protected spans are swapped for placeholders while the rewrite rules run. The placeholder
 * opener is a run of U+E000 that is guaranteed not to occur in the input, and every
 * placeholder ends with an explicit terminator, so ordinary text such as "INL5" or a digit
 * directly following a code span can never be mistaken for (or merged into) a placeholder.
 *
 * @param text Raw model output; a falsy value yields ''.
 * @returns The cleaned text, trimmed of leading/trailing whitespace.
 */
export function stripMarkdownFormatting(text: string): string {
    if (!text) return '';
    const input = String(text);

    // Build an opener that cannot appear in the input: lengthen the run until it is absent.
    let open = '\uE000';
    while (input.includes(open)) open += '\uE000';
    // The terminator only has to be a non-digit so the index cannot absorb following text.
    const close = '\uE001';

    // 1. Protect fenced code blocks wholesale.
    const fenceParts: string[] = [];
    let src = input.replace(/```[\s\S]*?```/g, (m) => {
        fenceParts.push(m);
        return `${open}F${fenceParts.length - 1}${close}`;
    });

    // 2. Protect inline code spans as well.
    const inlineParts: string[] = [];
    src = src.replace(/`[^`\n]+`/g, (m) => {
        inlineParts.push(m);
        return `${open}I${inlineParts.length - 1}${close}`;
    });

    // 3. Line-level cleanup.
    src = src.split('\n').map((rawLine) => {
        let line = rawLine;
        // Leading header marker ("## Summary" → "Summary").
        line = line.replace(/^\s{0,3}#{1,6}\s+/, '');
        // Leading blockquote marker.
        line = line.replace(/^\s{0,3}>\s?/, '');
        // Leading `* ` or `+ ` bullet → `- `.
        line = line.replace(/^(\s*)[*+]\s+/, '$1- ');
        return line;
    }).join('\n');

    // 4. Emphasis markers.
    src = src.replace(/\*\*(.+?)\*\*/g, '$1');        // **bold**
    src = src.replace(/__([^_\n]+?)__/g, '$1');       // __bold__
    // Single-star emphasis: only when bounded by whitespace / line edge / punctuation.
    src = src.replace(/(^|[\s(\[])\*([^\s*][^*\n]*?[^\s*])\*(?=[\s).,!?;:]|$)/g, '$1$2');
    src = src.replace(/(^|[\s(\[])\*([^\s*])\*(?=[\s).,!?;:]|$)/g, '$1$2');

    // 5. Label lines are intentionally left as-is (no colon is appended).

    // 6. Collapse three or more consecutive newlines to two.
    src = src.replace(/\n{3,}/g, '\n\n');

    // 7. Restore protected spans. Inline spans first: an inline span may itself contain a
    //    fence placeholder, which the second pass then resolves.
    src = src.replace(
        new RegExp(`${open}I(\\d+)${close}`, 'g'),
        (m: string, i: string) => inlineParts[Number(i)] ?? m,
    );
    src = src.replace(
        new RegExp(`${open}F(\\d+)${close}`, 'g'),
        (m: string, i: string) => fenceParts[Number(i)] ?? m,
    );

    return src.trim();
}
+14 -2
View File
@@ -55,7 +55,7 @@ export async function activate(context: vscode.ExtensionContext) {
// 과 별개 채널 — popup도 OutputChannel도 못 보는 경우의 마지막 안전망).
const ext = vscode.extensions.getExtension('g1nation.astra');
const version = ext?.packageJSON?.version || '(unknown)';
console.error(`[ASTRA-DEBUG] activate v${version} pid=${process.pid}`);
console.log(`[ASTRA-DEBUG] activate v${version} pid=${process.pid}`);
void vscode.window.showInformationMessage(`📡 Astra v${version} activated (PID=${process.pid})`);
logInfo(`Astra activating... version=${version} pid=${process.pid}`);
@@ -88,10 +88,22 @@ export async function activate(context: vscode.ExtensionContext) {
client: lmStudioClient,
activity: activityTracker,
getConfig: () => {
// Read from getConfig() so we share the same setting parsers (incl. gpuOffloadRatio coercion)
// with the rest of the codebase instead of duplicating the logic here.
const ag = getConfig();
const cfg = vscode.workspace.getConfiguration('g1nation');
return {
idleTimeoutMs: cfg.get<number>('lmStudio.idleTimeoutMs', 300000),
autoLoadOnSelect: cfg.get<boolean>('lmStudio.autoLoadOnSelect', true),
loadConfig: {
flashAttention: ag.lmStudioLoad.flashAttention,
gpuOffloadRatio: ag.lmStudioLoad.gpuOffloadRatio,
offloadKVCacheToGpu: ag.lmStudioLoad.offloadKVCacheToGpu,
keepModelInMemory: ag.lmStudioLoad.keepModelInMemory,
useFp16ForKVCache: ag.lmStudioLoad.useFp16ForKVCache,
evalBatchSize: ag.lmStudioLoad.evalBatchSize,
},
draftModel: ag.lmStudioDraftModel || undefined,
};
},
notifyError: (msg) => provider?.postLmStudioError(msg),
@@ -157,7 +169,7 @@ export async function activate(context: vscode.ExtensionContext) {
lifecycle,
activity: activityTracker,
loadedModels: () => lmStudioClient.listLoadedCached(),
downloadedModels: () => lmStudioClient.listDownloaded(),
downloadedModels: () => lmStudioClient.listDownloadedCached(),
});
// One-time repair: rewrite any chronicle projects that were saved with the
// workspace parent as their `projectRoot` (a side-effect of the old
+1 -1
View File
@@ -57,7 +57,7 @@ export async function handleSlashCommand(
const head = (spaceIdx === -1 ? trimmed : trimmed.slice(0, spaceIdx)).toLowerCase() as SlashCommand;
const arg = spaceIdx === -1 ? '' : trimmed.slice(spaceIdx + 1).trim();
console.error(`[ASTRA-DEBUG] slashRouter handleSlashCommand head=${head} arg=${arg.slice(0, 40)}`);
logInfo(`[ASTRA-DEBUG] slashRouter handleSlashCommand head=${head} arg=${arg.slice(0, 40)}`);
logInfo(`[SLASH] handleSlashCommand start head=${head} arg="${arg.slice(0, 60)}" bridge=${getBridgeBaseUrl()}`);
void vscode.window.showInformationMessage(`📻 Datacollect Radio: ${head} 진입`);
void vscode.window.setStatusBarMessage(`📻 Datacollect Radio: ${head} 처리 중…`, 5000);
+7 -4
View File
@@ -118,19 +118,21 @@ export interface TrimResult<M extends BudgetMessage> {
}
/**
* 대화 기록을 토큰 예산 안에 맞춥니다.
* 대화 기록을 토큰 예산 안에 맞춥니다 (sliding window).
*
* 전략:
* 1. 항상 마지막 메시지(보통 현재 사용자 질문)는 유지.
* 2. 최근 메시지부터 역순으로 예산이 허용하는 만큼 채움.
* 3. 하나라도 잘렸으면 맨 앞에 `[이전 대화 N개 생략]` 마커를 끼워 모델이 맥락 누락을 인지하게 함.
* 3. 하나라도 잘렸으면 맨 앞에 marker 를 끼워 모델이 맥락 누락을 인지하게 함.
* v2.2.69+ — marker 콜백은 droppedCount 뿐 아니라 *잘려나간 메시지 배열* 도 받아
* 단순 count 가 아닌 진짜 요약/맥락을 작성할 수 있다.
*
* 주의: 여기서 잘라내는 것은 *요청에 보낼* 메시지 배열일 뿐, UI에 표시되는 전체 기록은 그대로 둡니다.
*/
export function trimHistoryToBudget<M extends BudgetMessage>(
messages: M[],
budgetTokens: number,
makeMarker: (droppedCount: number) => M
makeMarker: (droppedCount: number, droppedMessages: M[]) => M
): TrimResult<M> {
if (messages.length === 0) {
return { messages, droppedCount: 0, tokensAfter: 0 };
@@ -154,7 +156,8 @@ export function trimHistoryToBudget<M extends BudgetMessage>(
const droppedCount = messages.length - kept.length;
if (droppedCount > 0) {
const marker = makeMarker(droppedCount);
const droppedMessages = messages.slice(0, droppedCount);
const marker = makeMarker(droppedCount, droppedMessages);
kept.unshift(marker);
used += estimateMessageTokens(marker);
}
+40 -6
View File
@@ -51,7 +51,7 @@ export interface IAgent {
/**
* 파이프라인 단계 상태 정의
*/
export type PipelineStage = 'idle' | 'planner' | 'researcher' | 'reflector' | 'writer' | 'completed' | 'error';
export type PipelineStage = 'idle' | 'planner' | 'researcher' | 'reflector' | 'writer' | 'synthesizer' | 'completed' | 'error';
/**
* 감사(Audit) 이력에 기록되는 단일 상태 전환 엔트리.
@@ -453,7 +453,10 @@ export class AgentEngine {
private readonly researcher: IAgent,
private readonly writer: IAgent,
// [Self-Reflection] Researcher와 Writer 사이에 주입되는 메타인지 노드. 미주입 시 기존 3단계 파이프라인을 그대로 유지.
private readonly reflector?: IAgent
private readonly reflector?: IAgent,
// [5-stage pipeline] Writer(=Drafter)가 만든 초안을 사용자용 최종 답변으로 다듬는 노드.
// 미주입 시 Writer 출력이 그대로 최종 답변이 된다(기존 동작 유지).
private readonly synthesizer?: IAgent
) {}
/**
@@ -600,14 +603,45 @@ export class AgentEngine {
);
state.setResult('finalReport', finalReport);
// --- Phase 4.5: Synthesizer (final polish) ---
// Drafter(=Writer) 출력은 "초안"이다. Synthesizer가 주어졌으면 한 번 더 압축/매끄럽게 정리한다.
// 입력이 작은 draft 뿐이라 컨텍스트가 가벼워, 작은 로컬 모델도 한 번에 처리할 수 있다.
// 실패해도 미션을 막지 않고 Drafter 출력을 그대로 사용한다(soft-fail).
let polishedReport = finalReport;
if (this.synthesizer) {
try {
polishedReport = await this.executeStep(
state, 'synthesizer', '최종 답변 다듬기 중...',
() => this.resilientExecute(state, this.synthesizer!, 'Synthesizer', finalReport, prompt, signal, onProgress, {
...options,
context: brainContext,
signal,
config: { ...options?.config, role: 'synthesizer', isSamePrompt: true },
priorResults: { plan, reflection, originalPrompt: prompt, ...options?.priorResults },
abstractionLevel: 'balanced'
}),
`synthesizer::${finalReport}`, prompt, signal, onProgress
);
if (!polishedReport || polishedReport.trim().length < 24) {
// 합성기가 빈/잘린 결과를 내면 안전하게 초안 사용.
logError('[AgentEngine] Synthesizer returned empty/tiny output — using Drafter output.');
polishedReport = finalReport;
}
} catch (synthErr: any) {
if (synthErr?.name === 'AbortError') throw synthErr;
logError(`[AgentEngine] Synthesizer soft-fail — using Drafter output: ${synthErr?.message || synthErr}`);
polishedReport = finalReport;
}
}
// --- Phase 5: Advice & Standardization ---
const proactiveAdvice = await this.generateProactiveAdvice(finalReport, prompt, brainContext, signal);
const proactiveAdvice = await this.generateProactiveAdvice(polishedReport, prompt, brainContext, signal);
// [Structural Fix] 생성된 제안의 무결성 검증 (최소 길이 50자 이상일 때만 append)
const enrichedReport = proactiveAdvice && proactiveAdvice.length > 50
? `${finalReport}\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\n${proactiveAdvice}`
: finalReport;
? `${polishedReport}\n\n---\n## 💡 Astra의 선제적 제안 (Proactive Next Actions)\n${proactiveAdvice}`
: polishedReport;
const standardizedReport = WikiFormatter.format(enrichedReport, state);
+94 -7
View File
@@ -1,8 +1,20 @@
import { LMStudioClient as SDKClient, LLM } from '@lmstudio/sdk';
import { LMStudioClient as SDKClient, LLM, type LLMLoadModelConfig } from '@lmstudio/sdk';
import { logError, logInfo } from '../utils';
/**
 * Load-time options forwarded to LM Studio's `llm.load()`. Subset of `LLMLoadModelConfig`.
 *
 * Every field is optional; a field left undefined is not sent, so the engine default applies.
 */
export interface LMStudioLoadConfig {
/** Flash-attention toggle. */
flashAttention?: boolean;
/** "max" | "off" | number 0-1 */
gpuOffloadRatio?: 'max' | 'off' | number;
/** Whether the KV cache is offloaded to the GPU. */
offloadKVCacheToGpu?: boolean;
/** Whether the model is kept resident in memory. */
keepModelInMemory?: boolean;
/** Whether the KV cache uses FP16. */
useFp16ForKVCache?: boolean;
/** 0 / undefined = engine default */
evalBatchSize?: number;
}
export interface ILMStudioClient {
load(modelKey: string, signal?: AbortSignal): Promise<void>;
load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void>;
unload(modelKey: string): Promise<void>;
listLoaded(): Promise<string[]>;
/** Like listLoaded() but caches the result for `ttlMs` to avoid hammering the SDK. */
@@ -15,6 +27,10 @@ export interface ILMStudioClient {
* only returns loaded models when JIT is off).
*/
listDownloaded(): Promise<string[]>;
/** Cached variant; the downloaded list only changes when the user installs/removes a model. */
listDownloadedCached(ttlMs?: number): Promise<string[]>;
/** Pre-warm a draft model for speculative decoding. Idempotent + best-effort. */
preloadDraftModel?(draftModelKey: string): Promise<void>;
/**
* Resolve a chat-ready handle for an already-loaded (or just-loaded) model.
*
@@ -42,8 +58,20 @@ export function httpToWebSocketUrl(httpBaseUrl: string): string | undefined {
if (url.protocol === 'http:') url.protocol = 'ws:';
else if (url.protocol === 'https:') url.protocol = 'wss:';
else if (url.protocol !== 'ws:' && url.protocol !== 'wss:') return undefined;
if (url.pathname.endsWith('/v1')) url.pathname = url.pathname.slice(0, -3);
if (url.pathname.endsWith('/api')) url.pathname = url.pathname.slice(0, -4);
// Strip every REST-only path suffix LM Studio ships with so the SDK lands on the
// WebSocket root. Loop because /api/v0 → /api → '' should fully unwind.
const REST_SUFFIXES = ['/api/v0', '/api/v1', '/v1', '/api'];
let changed = true;
while (changed) {
changed = false;
for (const suffix of REST_SUFFIXES) {
if (url.pathname.endsWith(suffix)) {
url.pathname = url.pathname.slice(0, -suffix.length);
changed = true;
break;
}
}
}
const out = url.toString().replace(/\/+$/, '');
return out;
} catch {
@@ -55,7 +83,9 @@ export class LMStudioClient implements ILMStudioClient {
private _sdk: SDKClient | undefined;
private _wsUrl: string | undefined;
private _loadedCache: { value: string[]; expiresAt: number } | undefined;
private _downloadedCache: { value: string[]; expiresAt: number } | undefined;
private static readonly DEFAULT_LOADED_CACHE_TTL_MS = 5000;
private static readonly DEFAULT_DOWNLOADED_CACHE_TTL_MS = 60_000;
constructor(httpBaseUrl: string) {
this.setBaseUrl(httpBaseUrl);
@@ -67,6 +97,7 @@ export class LMStudioClient implements ILMStudioClient {
this._wsUrl = ws;
this._sdk = undefined;
this._loadedCache = undefined;
this._downloadedCache = undefined;
}
}
@@ -77,17 +108,53 @@ export class LMStudioClient implements ILMStudioClient {
return this._sdk;
}
async load(modelKey: string, signal?: AbortSignal): Promise<void> {
async load(modelKey: string, signal?: AbortSignal, loadConfig?: LMStudioLoadConfig): Promise<void> {
try {
await this.getSdk().llm.load(modelKey, signal ? { signal } : undefined);
const opts: { signal?: AbortSignal; config?: LLMLoadModelConfig } = {};
if (signal) opts.signal = signal;
const config = this._buildLoadConfig(loadConfig);
if (Object.keys(config).length > 0) opts.config = config;
await this.getSdk().llm.load(modelKey, Object.keys(opts).length > 0 ? opts : undefined);
this._loadedCache = undefined;
logInfo('LM Studio model loaded.', { modelKey });
// Loading does not change the downloaded-models set; leave _downloadedCache alone.
logInfo('LM Studio model loaded.', { modelKey, configKeys: Object.keys(config) });
} catch (e: any) {
const msg = e?.message ?? String(e);
throw new LMStudioLifecycleError(`Failed to load LM Studio model "${modelKey}": ${msg}`, e);
}
}
/**
 * Convert the flat {@link LMStudioLoadConfig} into the nested `LLMLoadModelConfig` shape
 * that the SDK's `llm.load()` expects.
 *
 * Only fields that were explicitly provided are copied, so an empty result means
 * "use engine defaults". `evalBatchSize` is forwarded only when positive.
 *
 * @param lc Flat load options, or undefined when none were configured.
 * @returns A possibly empty SDK load config.
 */
private _buildLoadConfig(lc: LMStudioLoadConfig | undefined): LLMLoadModelConfig {
    const sdkConfig: LLMLoadModelConfig = {};
    if (!lc) {
        return sdkConfig;
    }

    const {
        flashAttention,
        offloadKVCacheToGpu,
        keepModelInMemory,
        useFp16ForKVCache,
        evalBatchSize,
        gpuOffloadRatio,
    } = lc;

    if (typeof flashAttention === 'boolean') {
        sdkConfig.flashAttention = flashAttention;
    }
    if (typeof offloadKVCacheToGpu === 'boolean') {
        sdkConfig.offloadKVCacheToGpu = offloadKVCacheToGpu;
    }
    if (typeof keepModelInMemory === 'boolean') {
        sdkConfig.keepModelInMemory = keepModelInMemory;
    }
    if (typeof useFp16ForKVCache === 'boolean') {
        sdkConfig.useFp16ForKVCache = useFp16ForKVCache;
    }
    if (typeof evalBatchSize === 'number' && evalBatchSize > 0) {
        sdkConfig.evalBatchSize = evalBatchSize;
    }
    if (gpuOffloadRatio !== undefined) {
        // GPUSetting is deprecated but still accepted — wraps a single `ratio`.
        sdkConfig.gpu = { ratio: gpuOffloadRatio as any };
    }
    return sdkConfig;
}
/**
 * Best-effort pre-warm of a draft model for speculative decoding.
 *
 * Does nothing for a blank key or when the SDK does not expose
 * `unstable_preloadDraftModel`. Failures are logged and swallowed, never thrown.
 *
 * @param draftModelKey LM Studio model key of the draft model; surrounding whitespace is ignored.
 */
async preloadDraftModel(draftModelKey: string): Promise<void> {
    const draftKey = (draftModelKey || '').trim();
    if (draftKey.length === 0) {
        return;
    }
    try {
        const sdkLlm: any = this.getSdk().llm;
        // The preload API is marked unstable, so probe for it instead of assuming it exists.
        if (typeof sdkLlm.unstable_preloadDraftModel !== 'function') {
            return;
        }
        await sdkLlm.unstable_preloadDraftModel(draftKey);
        logInfo('LM Studio draft model preloaded.', { draftModelKey: draftKey });
    } catch (e: any) {
        // Best-effort — the main model's respond({draftModel}) will still load it lazily.
        const reason = e?.message ?? String(e);
        logError('LM Studio draft model preload failed.', { draftModelKey: draftKey, error: reason });
    }
}
async unload(modelKey: string): Promise<void> {
try {
await this.getSdk().llm.unload(modelKey);
@@ -99,6 +166,12 @@ export class LMStudioClient implements ILMStudioClient {
}
}
/**
 * Clear both cached model lists (downloaded and loaded) so the next cached lookup
 * re-fetches. Intended for use after a model is installed or removed.
 */
invalidateCaches(): void {
    this._downloadedCache = undefined;
    this._loadedCache = undefined;
}
async listLoaded(): Promise<string[]> {
try {
const items: any[] = await this.getSdk().llm.listLoaded();
@@ -138,6 +211,20 @@ export class LMStudioClient implements ILMStudioClient {
}
}
/**
 * Cached wrapper around `listDownloaded()`.
 *
 * @param ttlMs How long a fetched list stays valid, in milliseconds.
 * @returns A fresh copy of the downloaded-model keys; callers may mutate it freely.
 */
async listDownloadedCached(ttlMs: number = LMStudioClient.DEFAULT_DOWNLOADED_CACHE_TTL_MS): Promise<string[]> {
    const requestedAt = Date.now();

    const cached = this._downloadedCache;
    if (cached && cached.expiresAt > requestedAt) {
        return cached.value.slice();
    }

    const fetched = await this.listDownloaded();
    // Only cache non-empty results — an empty array often signals a transient SDK error,
    // and caching that for 60s would hide a freshly-started LM Studio process.
    if (fetched.length > 0) {
        this._downloadedCache = { value: fetched, expiresAt: requestedAt + ttlMs };
    }
    return fetched.slice();
}
async getModelHandle(modelKey: string, options?: { refresh?: boolean }): Promise<LLM> {
try {
if (options?.refresh) {
+11 -2
View File
@@ -1,4 +1,4 @@
import type { ILMStudioClient } from './client';
import type { ILMStudioClient, LMStudioLoadConfig } from './client';
import type { IActivityTracker } from './activityTracker';
import type { EngineKind } from '../utils';
import type { ISystemSpecsProvider, IModelMemoryEstimator } from '../system/specs';
@@ -9,6 +9,10 @@ export type LifecycleState = 'idle' | 'loading' | 'loaded' | 'streaming' | 'unlo
/** Settings the lifecycle manager reads through `getConfig()`. */
export interface LifecycleConfig {
/** Idle timeout in milliseconds. NOTE(review): presumably the idle period before the model is unloaded — confirm against the idle-timer code. */
idleTimeoutMs: number;
/** NOTE(review): presumably loads a model automatically when it is selected — confirm against the caller. */
autoLoadOnSelect: boolean;
/** Forwarded to `llm.load()` config field. Omit to use engine defaults. */
loadConfig?: LMStudioLoadConfig;
/** When set, the lifecycle manager pre-warms this draft model after every successful load. */
draftModel?: string;
}
export interface LifecycleManagerDeps {
@@ -274,11 +278,16 @@ export class ModelLifecycleManager {
const ac = new AbortController();
this.loadAbort = ac;
try {
await this.deps.client.load(modelKey, ac.signal);
const cfg = this.deps.getConfig();
await this.deps.client.load(modelKey, ac.signal, cfg.loadConfig);
if (this.loadAbort !== ac) return; // superseded by a newer switch
this.loadAbort = undefined;
this.state = 'loaded';
this.resetIdleTimer();
// Pre-warm the draft model so the first speculative prediction doesn't pay a cold-load cost.
if (cfg.draftModel && this.deps.client.preloadDraftModel) {
void this.deps.client.preloadDraftModel(cfg.draftModel);
}
} catch (e: any) {
if (ac.signal.aborted) return; // superseded — newer switch owns state
logError('LM Studio model load failed.', { model: modelKey, error: e?.message ?? String(e) });
+98 -17
View File
@@ -7,6 +7,30 @@ export interface ChatStreamMessage {
content: string;
}
/** Shared sampling block. SDK and REST paths both read this — keep them in sync. */
export interface LmStudioSampling {
    /** Nucleus sampling cutoff. */
    topP?: number;
    /** Top-K cutoff. */
    topK?: number;
    /** Min-P floor. */
    minP?: number;
    /** Repeat penalty. */
    repeatPenalty?: number;
}

/**
 * Build the REST body extension (OpenAI-compatible, as understood by LM Studio) for a
 * sampling block. Ollama uses the same field names inside `options`, so the result can be
 * spread into either body.
 *
 * A value is emitted only when it is meaningful: `topP` / `minP` must lie in (0, 1],
 * `topK` must be positive, and `repeatPenalty` must exceed 1. Anything else is omitted so
 * the engine default applies instead of effectively disabling sampling.
 *
 * @param s Sampling settings, or undefined.
 * @returns An object with zero or more of `top_p`, `top_k`, `min_p`, `repeat_penalty`.
 */
export function samplingToRestBody(s: LmStudioSampling | undefined): Record<string, number> {
    const body: Record<string, number> = {};
    if (!s) {
        return body;
    }

    const { topP, topK, minP, repeatPenalty } = s;
    // Probabilities are accepted only inside the half-open interval (0, 1].
    const isProbability = (v: number | undefined): v is number =>
        typeof v === 'number' && v > 0 && v <= 1;

    if (isProbability(topP)) {
        body.top_p = topP;
    }
    if (typeof topK === 'number' && topK > 0) {
        body.top_k = topK;
    }
    if (isProbability(minP)) {
        body.min_p = minP;
    }
    if (typeof repeatPenalty === 'number' && repeatPenalty > 1) {
        body.repeat_penalty = repeatPenalty;
    }
    return body;
}
export interface ChatStreamRequest {
modelName: string;
messages: ChatStreamMessage[];
@@ -15,17 +39,39 @@ export interface ChatStreamRequest {
maxTokens?: number;
/** LM Studio context-overflow safety net used only if the prompt still exceeds the window. */
contextOverflowPolicy?: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
/** Sampling — defaults match small-model glitch-suppression presets. Each is omitted from the SDK call when undefined. */
topP?: number;
topK?: number;
minP?: number;
repeatPenalty?: number;
/** Draft model key for speculative decoding. Empty/undefined disables. */
draftModel?: string;
signal?: AbortSignal;
}
/**
 * Subset of LM Studio's `PredictionResult.stats` we expose to callers.
 * Each field is left undefined when the SDK did not report a value of the expected type.
 */
export interface ChatStreamStats {
/** Generation throughput (tok/s) as reported by the SDK. */
tokensPerSecond?: number;
/** Time to first token (TTFT), in seconds. */
timeToFirstTokenSec?: number;
/** Count of predicted tokens as reported by the SDK. */
predictedTokensCount?: number;
/** Count of prompt tokens as reported by the SDK. */
promptTokensCount?: number;
/** Total time in seconds as reported by the SDK. */
totalTimeSec?: number;
/** Speculative decoding (only set when `draftModel` was used). */
draftModelKey?: string;
/** Copied from the SDK's `totalDraftTokensCount`. */
draftTokensCount?: number;
/** Copied from the SDK's `acceptedDraftTokensCount`. */
acceptedDraftTokensCount?: number;
}
/**
 * One stream event. `token` carries generated text (possibly empty for the final event);
 * `stopReason` is set on the *last* event only and is the SDK's `stats.stopReason`
 * (e.g. `eosFound`, `maxPredictedTokensReached`, `contextLengthReached`, `userStopped`).
 * `stats` is also set on the *last* event when LM Studio reports prediction stats.
 */
export interface ChatStreamEvent {
/** Generated text for this event; empty on the final event. */
token: string;
/** Final event only; left undefined when the stop reason could not be read. */
stopReason?: string;
/** Final event only; prediction statistics when available. */
stats?: ChatStreamStats;
}
export interface IChatStreamer {
@@ -72,24 +118,25 @@ export class LMStudioStreamer implements IChatStreamer {
const model = await this.client.getModelHandle(trimmedModel, refresh ? { refresh: true } : undefined);
logInfo('LM Studio SDK chat stream started.', { model: trimmedModel, messageCount: req.messages.length, attempt });
const prediction = (model as any).respond(req.messages, {
// Sampling defaults match the historical glitch-suppression preset for small /
// quantized models (한글 토큰 깨짐 방지) but are now overridable per-call.
const respondOpts: any = {
temperature: req.temperature,
maxTokens: req.maxTokens ?? 4096,
// Glitch suppression: a small / quantized model samples wrong
// neighbour tokens (Korean syllable corruption like 붕괴→붕점,
// 핵심→핵점) when the distribution is left wide. A tight nucleus
// + top-k and a min-p floor cut the low-probability tail;
// repeatPenalty curbs stutter (것입니다서입니다).
topPSampling: 0.9,
topKSampling: 20,
minPSampling: 0.05,
repeatPenalty: 1.1,
// Safety net: if our own token budgeting still underestimated and the prompt
// exceeds the model's context window, decide whether the SDK should fail
// loudly (stopAtLimit — default) or silently drop content.
contextOverflowPolicy: req.contextOverflowPolicy ?? 'stopAtLimit',
signal: req.signal,
});
};
if (typeof req.topP === 'number') respondOpts.topPSampling = req.topP;
if (typeof req.topK === 'number' && req.topK > 0) respondOpts.topKSampling = req.topK;
if (typeof req.minP === 'number' && req.minP > 0) respondOpts.minPSampling = req.minP;
if (typeof req.repeatPenalty === 'number' && req.repeatPenalty > 1) respondOpts.repeatPenalty = req.repeatPenalty;
// Speculative decoding — LM Studio loads the draft model lazily on first use if needed
// (we also `preloadDraftModel` after main load to avoid that cold cost).
if (req.draftModel && req.draftModel.trim()) respondOpts.draftModel = req.draftModel.trim();
const prediction = (model as any).respond(req.messages, respondOpts);
// Bridge AbortSignal → prediction.cancel(): without this, an
// aborted request keeps generating on the LM Studio server. The
@@ -128,24 +175,58 @@ export class LMStudioStreamer implements IChatStreamer {
if (req.signal?.aborted) return;
// The prediction object is also a Promise<PredictionResult>; awaiting it after
// the stream drains gives us stats.stopReason so callers can tell a truncated
// answer (maxPredictedTokensReached / contextLengthReached) from a normal one.
// answer (maxPredictedTokensReached / contextLengthReached) from a normal one,
// plus throughput numbers (tok/s, TTFT) we surface to the UI.
let stopReason: string | undefined;
let stats: ChatStreamEvent['stats'];
try {
const result: any = await prediction;
stopReason = result?.stats?.stopReason;
if (stopReason) {
logInfo('LM Studio SDK chat stream finished.', { model: trimmedModel, stopReason, tokensYielded: yielded });
const s = result?.stats;
if (s) {
stats = {
tokensPerSecond: typeof s.tokensPerSecond === 'number' ? s.tokensPerSecond : undefined,
timeToFirstTokenSec: typeof s.timeToFirstTokenSec === 'number' ? s.timeToFirstTokenSec : undefined,
predictedTokensCount: typeof s.predictedTokensCount === 'number' ? s.predictedTokensCount : undefined,
promptTokensCount: typeof s.promptTokensCount === 'number' ? s.promptTokensCount : undefined,
totalTimeSec: typeof s.totalTimeSec === 'number' ? s.totalTimeSec : undefined,
draftModelKey: typeof s.usedDraftModelKey === 'string' ? s.usedDraftModelKey : undefined,
draftTokensCount: typeof s.totalDraftTokensCount === 'number' ? s.totalDraftTokensCount : undefined,
acceptedDraftTokensCount: typeof s.acceptedDraftTokensCount === 'number' ? s.acceptedDraftTokensCount : undefined,
};
}
if (stopReason || stats) {
logInfo('LM Studio SDK chat stream finished.', {
model: trimmedModel, stopReason, tokensYielded: yielded,
tokensPerSecond: stats?.tokensPerSecond, ttftSec: stats?.timeToFirstTokenSec,
});
}
} catch { /* result unavailable on some SDK versions — non-fatal */ }
// Empty-but-clean stream is treated like a dead handle on attempt 1:
// recreate the SDK and try once more. Same root cause (handle bound to
// a stale prediction) but no exception is thrown — just an empty stream.
if (yielded === 0 && attempt === 1) {
logInfo('Empty SDK stream with no error — retrying with a fresh SDK.', { model: trimmedModel });
continue;
}
// Don't claim `eosFound` if we couldn't actually read the stop reason — leave it
// undefined so the caller treats it as 'unknown' (and its mid-sentence heuristics kick in).
yield { token: '', stopReason };
yield { token: '', stopReason, stats };
return;
}
const errMsg = String(caught?.message ?? caught);
const handleDead = /\bdisposed\b/i.test(errMsg)
|| /lock\(\) request could not be registered/i.test(errMsg);
// Broaden the "handle is bound to a dead WebSocket binding" detection. All of
// these resolve with the same fix (recreate the SDK client so the next
// llm.model() lookup mints a fresh handle).
const handleDead =
/\bdisposed\b/i.test(errMsg)
|| /lock\(\) request could not be registered/i.test(errMsg)
|| /channel\s+closed/i.test(errMsg)
|| /WebSocket\s+(?:is\s+not\s+open|closed|disconnected)/i.test(errMsg)
|| /Connection\s+(?:lost|reset|closed)/i.test(errMsg)
|| /\bECONNRESET\b/i.test(errMsg)
|| /socket\s+hang\s*up/i.test(errMsg);
if (handleDead && yielded === 0 && attempt === 1) {
logInfo('Dead LM Studio handle detected — retrying with a fresh SDK.', { model: trimmedModel, error: errMsg });
+2 -2
View File
@@ -16,7 +16,7 @@ export async function handleChatMessage(provider: SidebarChatProvider, data: any
switch (data.type) {
case 'prompt':
case 'promptWithFile':
console.error(`[ASTRA-DEBUG] prompt case entered type=${data?.type} value=${JSON.stringify(String(data?.value ?? '').slice(0, 80))}`);
logInfo(`[ASTRA-DEBUG] prompt case entered type=${data?.type} value=${JSON.stringify(String(data?.value ?? '').slice(0, 80))}`);
provider._lmStudio?.activity.bump();
// ── 📻 Datacollect Radio (slash 명령) 우선 분기 ──
// 주의: globalState.update보다 *먼저* 잡는다 — 글로벌 state가 ~1MB까지
@@ -25,7 +25,7 @@ export async function handleChatMessage(provider: SidebarChatProvider, data: any
if (typeof data.value === 'string') {
const { isSlashCommand, handleSlashCommand } = await import('../features/datacollect/slashRouter');
const matched = isSlashCommand(data.value);
console.error(`[ASTRA-DEBUG] slash check matched=${matched} hasView=${!!provider._view}`);
logInfo(`[ASTRA-DEBUG] slash check matched=${matched} hasView=${!!provider._view}`);
logInfo(`[SLASH] prompt received: ${JSON.stringify(data.value).slice(0, 100)} matched=${matched} hasView=${!!provider._view}`);
if (matched) {
if (!provider._view?.webview) {
+7
View File
@@ -46,6 +46,13 @@ export async function handleChronicleMessage(provider: SidebarChatProvider, data
case 'writeChronicleRecord':
await provider._writeChronicleRecord(data.recordType);
return true;
case 'setChronicleAutoRecord':
// v2.2.70 — 자동 기록 On/Off 토글. 도구 드롭다운 메뉴에서 호출.
await provider._setChronicleAutoRecord(!!data.enabled);
return true;
case 'getChronicleAutoRecord':
await provider._sendChronicleAutoRecordStatus();
return true;
default:
return false;
}
+51
View File
@@ -886,6 +886,12 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
void this._restoreActiveSessionIntoView();
void this._sendReadyStatus();
// v2.2.66 — initial-load 단계에서도 brain/models/agents 를 한 번 더 푸시한다.
// 기존엔 webview 의 'ready' 핸드셰이크에만 의존했는데, 그 체인 도중 하나가 throw 하면
// 나머지 populate 가 통째로 안 돌아 dropdown 이 비는 회귀가 발생할 수 있다. 이중 보장.
void this._sendBrainProfiles();
void this._sendAgentsList();
void this._sendModels();
viewDisposables.push(webviewView.webview.onDidReceiveMessage(async (data) => {
// dispatch root 진입 trace — "/benchmark 입력했는데 아무 응답 없음" 같은
@@ -1263,6 +1269,8 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
description: profile.description || '',
repo: profile.secondBrainRepo || ''
}));
// v2.2.66 — dropdown 이 갑자기 비는 회귀가 보고됨. 무엇이 실제로 전송되는지 추적.
logInfo(`[_sendBrainProfiles] profiles=${profiles.length} activeBrainId=${activeBrain.id} active=${activeBrain.name}`);
this._view.webview.postMessage({
type: 'brainProfiles',
value: {
@@ -3368,7 +3376,37 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
}
}
/**
 * v2.2.70 — Handler for the "auto record" toggle in the Tools dropdown.
 *
 * Writes the flag to the `g1nation.chronicleAutoRecord` VS Code setting at the
 * Global configuration target (a settings update, not a globalState update, so
 * the choice is meant to persist across sessions), then pushes the current
 * state back to the webview.
 *
 * A failed settings update is logged and not rethrown; the status push runs
 * either way, so the webview is always sent the currently configured value.
 *
 * @param enabled Desired state of the auto-record toggle.
 */
async _setChronicleAutoRecord(enabled: boolean): Promise<void> {
    // Coerce defensively: callers forward values that originate from webview messages.
    const nextValue = !!enabled;
    try {
        const settings = vscode.workspace.getConfiguration('g1nation');
        await settings.update('chronicleAutoRecord', nextValue, vscode.ConfigurationTarget.Global);
        logInfo(`[chronicleAutoRecord] toggled → ${enabled ? 'ON' : 'OFF'}`);
    } catch (err: any) {
        logError('[chronicleAutoRecord] update failed', { error: err?.message || String(err) });
    }
    // Always report back, even after a failure, so the toggle shows the configured state.
    await this._sendChronicleAutoRecordStatus();
}
/**
 * Post the current auto-record flag to the webview as a
 * `chronicleAutoRecordStatus` message so the Tools menu can render the toggle.
 *
 * Does nothing when no webview view is attached. The flag is reported as
 * enabled unless the config value is explicitly `false`.
 */
async _sendChronicleAutoRecordStatus(): Promise<void> {
    const view = this._view;
    if (!view) {
        return;
    }
    // Anything other than an explicit `false` counts as enabled.
    const enabled = getConfig().chronicleAutoRecord !== false;
    view.webview.postMessage({
        type: 'chronicleAutoRecordStatus',
        value: { enabled }
    });
}
async _autoWriteChronicleAfterPrompt() {
// v2.2.70 — 자동 기록 OFF (g1nation.chronicleAutoRecord=false) 면 즉시 종료.
// 수동 기록 (도구 메뉴, /wiki 명령 등) 은 영향받지 않는다.
if (getConfig().chronicleAutoRecord === false) {
return;
}
const history = this._agent.getHistory();
const latestUser = [...history].reverse().find(message => message.role === 'user')?.content || '';
const latestAssistant = [...history].reverse().find(message => message.role === 'assistant')?.content || '';
@@ -4056,7 +4094,20 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
const mediaRoot = vscode.Uri.joinPath(this._extensionUri, 'media');
const stylesUri = webview.asWebviewUri(vscode.Uri.joinPath(mediaRoot, 'sidebar.css')).toString();
const scriptUri = webview.asWebviewUri(vscode.Uri.joinPath(mediaRoot, 'sidebar.js')).toString();
// VS Code의 outer webview iframe이 codicon.ttf를 data:font/ttf 로 inject한다.
// 기본 CSP는 font-src 'self' https://*.vscode-cdn.net 라 data: 가 빠져 있어
// DevTools에 violation 경고가 매번 찍힘. 우리가 명시적 CSP를 박아 data: 를
// 허용해 주면 호스트 iframe도 같은 CSP를 상속하면서 경고가 사라진다.
const csp = [
`default-src 'none'`,
`img-src ${webview.cspSource} https: data:`,
`style-src ${webview.cspSource} 'unsafe-inline'`,
`script-src ${webview.cspSource} https://cdn.jsdelivr.net 'unsafe-inline'`,
`font-src ${webview.cspSource} https: data:`,
`connect-src ${webview.cspSource} https:`,
].join('; ');
return SidebarChatProvider._htmlTemplateCache
.replace('__CSP__', csp)
.replace('__STYLES_URI__', stylesUri)
.replace('__SCRIPT_URI__', scriptUri);
}
+18 -22
View File
@@ -230,41 +230,37 @@ Step 2 (after the real scripts are known — pick the actual one, never a guesse
Then reply with one short line stating what was started and where.
[STRICT GLOBAL RULES]
1. [NO EMOJIS - ABSOLUTE RULE] NEVER use ANY emojis, emoticons, Unicode pictorial symbols (including but not limited to emoji, kaomoji, Unicode icons), or decorative symbols anywhere in your response. NO EXCEPTIONS. Use plain text dashes (-) or asterisks (*) for bullets. Use plain markdown ## for headers. This rule overrides ALL other formatting instructions.
2. [HEADINGS] Every markdown heading must be unique, appear exactly once, and start with exactly one "## " — never "## ##", never "### ###". One space after the hashes.
1. [NO EMOJIS - ABSOLUTE RULE] NEVER use ANY emojis, emoticons, Unicode pictorial symbols (including but not limited to emoji, kaomoji, Unicode icons), or decorative symbols anywhere in your response. NO EXCEPTIONS. Use plain text dashes (-) for bullets. This rule overrides ALL other formatting instructions.
2. [NO MARKDOWN MARKERS] PLAIN TEXT ONLY. Do NOT emit "#", "##", "###", "**", "__", "> ", "* " as formatting. Section labels are bare Korean words on their own line (e.g. a line that says just "핵심 요약" — no "#", no "**"). Bullets use "- " only. Inline code with backticks (e.g. \`src/agent.ts\`) and triple-backtick code blocks for actual code are fine.
3. [NO INTERNAL LOGS] Never output <details>, "2nd Brain Trace", or "Debug JSON" blocks.
4. [NO SECTION LEAKAGE] Never output sections named "요청 요약", "사용자 의도 추론", "프로젝트 기록 대상 확인", "핵심 확인 질문", or "근거 파일 경로".
[OUTPUT FORMAT]
LENGTH decides structure — not topic. Count how long your answer will be:
[OUTPUT FORMAT — 7 hard rules]
These rules override any other formatting habit. Apply them to EVERY answer.
- If the answer is longer than ~4 sentences (analysis, advice, planning, troubleshooting, or any multi-part answer), you MUST lead with a summary block, then the detail:
R1. CONCLUSION FIRST. The very first sentence of the response is the conclusion / verdict / recommendation. No greeting, no "분석해보겠습니다", no scene-setting paragraph, no "핵심 요약" label line. Just the conclusion as the opening sentence. The user must be able to stop after sentence 1 and still know what you decided.
## 핵심 요약
- 2 to 4 bullet points. Each bullet is one scannable, self-contained takeaway that captures the WHOLE answer — a reader who stops here still gets the gist.
- This block is ALWAYS the very first thing in the response. NEVER place a summary at the bottom. NEVER write an intro paragraph before it — the summary block IS the opening.
R2. AT MOST 3 SECTIONS. Total. Across the entire answer. A "section" = a labeled block (a label line followed by its body) OR a clearly separated numbered group. If you can answer without sections, do so. Three is the ceiling, not a target.
## 상세 설명
Free-form depth. You MAY use your own sub-headers here (e.g. "### 1. ...", "### 2. ..."). This is where the full reasoning and steps go.
R3. NO REPETITION. Never restate the same point twice in different words. Each sentence contributes new information. If you already said it in the conclusion, do NOT say it again in a later section.
## 제안 ← Optional. Only include if a meaningfully better alternative exists. Omit otherwise.
R4. BOLD ≤ 3 INSTANCES. Across the whole answer, use bold for emphasis at most 3 times. Reserve it for the truly load-bearing words (a file name, a verdict word, a hard number). Most answers should have zero.
- If the answer is ~4 sentences or fewer (quick fact, simple update, casual or emotional reply) — answer directly, no headers, no summary block.
R5. JUDGE WITHOUT ASKING. If you can reach a defensible decision from the current context, deliver the decision and act. Do NOT ask permission to proceed, do NOT ask the user to clarify what they already implied, do NOT bounce the question back ("어떻게 진행할까요?").
The summary block is named exactly "## 핵심 요약" and goes at the TOP. A section literally named "요약" placed at the end is a bug — never do that.
R6. ASK ONE QUESTION ONLY WHEN. Exactly one of these holds:
(a) The path forks into two materially different directions and you cannot tell which the user wants, OR
(b) The next concrete step is irreversible (delete, force-push, drop table, overwrite uncommitted work, send external message).
In those cases: ONE plain sentence on its own line at the end. No "핵심 확인 질문" label, no "질문 의도" explanation, no follow-ups.
[FOLLOW-UP QUESTION RULES]
A follow-up question is a precision tool, not a ritual.
Ask ONE focused question at the very end of the response ONLY if:
- The user's intent is genuinely ambiguous with multiple valid paths, OR
- A critical missing detail would make the current answer completely wrong.
If neither condition is met, give a definitive answer and stop.
When you do ask: it is ONE plain sentence on its own line. NEVER put it under a heading, NEVER label the section ("핵심 확인 질문", "확인 질문" etc.), NEVER attach a "질문 의도" explanation, NEVER ask two or more questions.
R7. GUESS-AND-ACT WITH STATED ASSUMPTION. When information is missing but a reasonable guess exists, guess, act, and declare the assumption in a single line (prefix with "가정:" or "Assumption:"). Do NOT stop to ask just because a detail is fuzzy.
[OUTPUT — plain text]
PLAIN TEXT only. Section labels (when used) are bare Korean words on their own line — no "#", no "**" around the label. Bullets use "- " only. Inline code with backticks (e.g. \`src/agent.ts\`) and triple-backtick code blocks for actual code are fine.
[ENGINEERING STANCE]
- Be a direct engineering partner. Technical precision over polite filler.
- Give the verdict first, then explain tradeoffs.
- Collapse checklists into: verdict → reason → risk → next move.
- Collapse checklists into: verdict → reason → risk → next move. (R1 already requires the verdict to be sentence 1.)
- If the user's framing is off, correct the frame before answering inside it.
- Simplify complex choices into 1-2 crisp options. Never write a balanced essay when a recommendation is possible.
- Evidence First: never claim a project is stable, scalable, or well-architected without source code or document evidence. If evidence is thin, say so and name the files to inspect next.