refactor: optimize core engine and retrieval logic for v2.80.43
This commit is contained in:
@@ -518,6 +518,18 @@
|
||||
chat.scrollTop = chat.scrollHeight;
|
||||
}
|
||||
break;
|
||||
case 'streamReplace':
|
||||
// Progressive answering: the backend streamed raw tokens
|
||||
// live (including hidden reasoning, pre-sanitize text);
|
||||
// once everything is finalized it sends the cleaned full
|
||||
// text via streamReplace so the bubble ends up correct
|
||||
// regardless of what slipped through during streaming.
|
||||
if (streamBody) {
|
||||
streamBody._parent._raw = String(msg.value ?? '');
|
||||
streamBody.innerHTML = fmt(streamBody._parent._raw);
|
||||
chat.scrollTop = chat.scrollHeight;
|
||||
}
|
||||
break;
|
||||
case 'streamEnd':
|
||||
if (streamBody) {
|
||||
streamBody.classList.remove('stream-active');
|
||||
|
||||
+391
-11
@@ -30,6 +30,7 @@ import { StatusBarManager, AgentStatus } from './core/statusBar';
|
||||
import { lockManager } from './core/lock';
|
||||
import { actionQueue } from './core/queue';
|
||||
import { ConflictResolver } from './core/conflict';
|
||||
import { recordTelemetry } from './core/telemetry';
|
||||
import {
|
||||
buildSecondBrainTrace,
|
||||
enforceProjectClaimPolicyInAnswer,
|
||||
@@ -40,6 +41,8 @@ import {
|
||||
import { MemoryManager } from './memory';
|
||||
import { RetrievalOrchestrator } from './retrieval';
|
||||
import { buildLessonChecklistBlock, isQaRegressionFeedback, findUnaddressedChecklistItems } from './retrieval/lessonHelpers';
|
||||
import { embedQuery, embedTexts } from './retrieval/embeddings';
|
||||
import { backfillBrainEmbeddings } from './retrieval/brainIndex';
|
||||
import { resolveScopeForAgent } from './skills/agentKnowledgeMap';
|
||||
import {
|
||||
extractVisibleFinal,
|
||||
@@ -117,6 +120,51 @@ const AGENT_PROMPTS: Record<AgentRole, string> = {
|
||||
3. Deliver a logical, consistent, and polished response.`
|
||||
};
|
||||
|
||||
/**
 * Compact recent chat sessions for medium-term memory retrieval.
 *
 * Walks `rawSessions` in input order and returns up to `limit + 5` usable
 * sessions (excluding the active one) as small summaries: title + first user
 * message + tail of the last assistant message (+ stored summary, if any).
 * The retrieval orchestrator is expected to score these against the current
 * query and keep the top `limit` matches inside the shared budget.
 *
 * We pull a few more than `limit` so scoring has room to rerank — the
 * persisted list is presumably timestamp-ordered (not verified here), which
 * isn't the same as topical fit.
 *
 * @param rawSessions     Persisted session records; entries are untrusted
 *                        (`any`) and malformed ones are skipped.
 * @param activeSessionId Id of the session currently in progress, or null.
 * @param limit           Number of sessions the caller ultimately wants.
 *                        Non-positive or NaN yields an empty result.
 * @returns Compact records, at most `min(rawSessions.length, limit + 5)`.
 */
function compactRecentSessions(
    rawSessions: any[],
    activeSessionId: string | null,
    limit: number,
): Array<{ id: string; title: string; firstUserMsg: string; lastAssistantExcerpt: string; summary?: string; timestamp: number }> {
    // `!(limit > 0)` also rejects NaN, which `limit <= 0` would let through.
    if (!Array.isArray(rawSessions) || rawSessions.length === 0 || !(limit > 0)) return [];

    const pool = Math.min(rawSessions.length, limit + 5);
    const out: ReturnType<typeof compactRecentSessions> = [];

    for (const s of rawSessions) {
        if (out.length >= pool) break;
        if (!s || typeof s !== 'object') continue;

        const id = String(s.id ?? '');
        if (!id || id === activeSessionId) continue;

        const history: any[] = Array.isArray(s.history) ? s.history : [];
        if (history.length === 0) continue;

        const firstUser = history.find((m) => m?.role === 'user');

        // Scan backwards instead of copying + reversing the whole history.
        let lastAssistant: any;
        for (let i = history.length - 1; i >= 0; i--) {
            if (history[i]?.role === 'assistant') {
                lastAssistant = history[i];
                break;
            }
        }

        const firstUserMsg = collapseSessionText(firstUser?.content).slice(0, 200);
        const lastTxt = collapseSessionText(lastAssistant?.content);
        // Keep the tail: the end of an answer usually carries the conclusion.
        const lastAssistantExcerpt = lastTxt.length <= 200 ? lastTxt : lastTxt.slice(-200);
        const summary = typeof s.summary === 'string' ? s.summary.trim().slice(0, 600) : undefined;

        // Nothing to score against — skip rather than emit an empty record.
        if (!firstUserMsg && !lastAssistantExcerpt && !summary) continue;

        out.push({
            id,
            title: String(s.title ?? '').trim() || firstUserMsg.slice(0, 50),
            firstUserMsg,
            lastAssistantExcerpt,
            summary,
            // NaN / non-numeric timestamps normalise to 0.
            timestamp: Number.isFinite(s.timestamp) ? s.timestamp : 0,
        });
    }
    return out;
}

/**
 * Flatten persisted message content to single-line text.
 *
 * Strings pass through; arrays are joined from their string items or
 * `{ text: string }` items (assumes an OpenAI-style parts array — TODO confirm
 * against the actual persisted shape); any other object yields '' instead of
 * the useless "[object Object]". Whitespace runs collapse to one space.
 */
function collapseSessionText(content: unknown): string {
    let text: string;
    if (typeof content === 'string') {
        text = content;
    } else if (content == null) {
        text = '';
    } else if (Array.isArray(content)) {
        text = content
            .map((part) => {
                if (typeof part === 'string') return part;
                if (part && typeof part === 'object' && typeof (part as { text?: unknown }).text === 'string') {
                    return (part as { text: string }).text;
                }
                return '';
            })
            .filter((piece) => piece.length > 0)
            .join(' ');
    } else if (typeof content === 'object') {
        text = '';
    } else {
        text = String(content);
    }
    return text.replace(/\s+/g, ' ').trim();
}
|
||||
|
||||
// Local-path detectors used to decide whether a user prompt refers to a file/dir on disk.
|
||||
// POSIX: /Volumes/, /Users/, /home/, /opt/, ... or ~/ — backtick excluded (markdown code spans).
|
||||
const POSIX_ABS_PATH_SRC = "(?:\\/(?:Volumes|Users|home|opt|srv|mnt|data|workspace)\\/|~\\/)[^\\s`\"'<>|*?]+";
|
||||
@@ -328,6 +376,10 @@ export class AgentExecutor {
|
||||
|
||||
if (!this.webview) return;
|
||||
|
||||
// Telemetry: wall-clock start of the user-visible turn. Only meaningful
|
||||
// at loopDepth===0 (action-loop recursions roll up into the same turn).
|
||||
const turnStartMs = loopDepth === 0 ? Date.now() : 0;
|
||||
|
||||
try {
|
||||
// 0. Safety Check: Rollback any dangling transaction from previous runs
|
||||
if (this.transactionManager.isActive()) {
|
||||
@@ -471,9 +523,19 @@ export class AgentExecutor {
|
||||
const secondBrainTraceCtx = secondBrainTrace
|
||||
? `\n\n${renderSecondBrainTraceContext(secondBrainTrace)}`
|
||||
: '';
|
||||
const retrievalStartMs = Date.now();
|
||||
const memoryCtx = isCasualConversation
|
||||
? ''
|
||||
: this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile);
|
||||
: await this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile);
|
||||
if (loopDepth === 0 && !isCasualConversation && this._lastRetrievalInfo) {
|
||||
recordTelemetry({
|
||||
kind: 'retrieval',
|
||||
durationMs: Date.now() - retrievalStartMs,
|
||||
brainFiles: this._lastRetrievalInfo.usedBrainFiles.length,
|
||||
memoryLayers: this._lastRetrievalInfo.usedMemoryLayers,
|
||||
note: `chunks=${this._lastRetrievalInfo.selectedChunks}/${this._lastRetrievalInfo.totalChunks} lessons=${this._lastRetrievalInfo.lessonFiles.length}`,
|
||||
});
|
||||
}
|
||||
const knowledgeContextForPrompt = isCasualConversation
|
||||
? ''
|
||||
: `${brainContext}${brainInventoryCtx}`;
|
||||
@@ -677,6 +739,16 @@ export class AgentExecutor {
|
||||
this.options.onStreamLifecycle?.start();
|
||||
}
|
||||
|
||||
// Progressive answering: live-stream tokens to the webview during
|
||||
// the user-visible first turn (loopDepth === 0). The bubble fills
|
||||
// as the model generates instead of dropping all at once at the end,
|
||||
// and any auto-continuation rounds keep posting deltas through the
|
||||
// same channel. Post-processing (reasoning strip / sanitize /
|
||||
// policy enforcement) emits a final `streamReplace` so the bubble
|
||||
// ends up matching the cleaned answer regardless of what slipped
|
||||
// through live.
|
||||
const postLiveDeltas = loopDepth === 0;
|
||||
|
||||
if (useLmStudioSdk) {
|
||||
apiUrl = `${ollamaUrl} (sdk)`;
|
||||
logInfo('Streaming chat via LM Studio SDK.', { model: actualModel });
|
||||
@@ -691,7 +763,10 @@ export class AgentExecutor {
|
||||
});
|
||||
for await (const { token, stopReason } of stream) {
|
||||
if (this.isStaleRun(runId)) return;
|
||||
if (token) aiResponseText += token;
|
||||
if (token) {
|
||||
aiResponseText += token;
|
||||
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
|
||||
}
|
||||
if (stopReason) finishStopReason = stopReason;
|
||||
}
|
||||
} catch (err: any) {
|
||||
@@ -747,6 +822,7 @@ export class AgentExecutor {
|
||||
const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
|
||||
if (token) {
|
||||
aiResponseText += token;
|
||||
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
|
||||
}
|
||||
const fr = engine === 'lmstudio'
|
||||
? json.choices?.[0]?.finish_reason
|
||||
@@ -778,6 +854,7 @@ export class AgentExecutor {
|
||||
const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || '';
|
||||
if (token) {
|
||||
aiResponseText += token;
|
||||
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
|
||||
}
|
||||
const fr = engine === 'lmstudio'
|
||||
? json.choices?.[0]?.finish_reason
|
||||
@@ -829,7 +906,10 @@ export class AgentExecutor {
|
||||
let retryText = '';
|
||||
for await (const { token, stopReason } of retryStream) {
|
||||
if (this.isStaleRun(runId)) return;
|
||||
if (token) retryText += token;
|
||||
if (token) {
|
||||
retryText += token;
|
||||
if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token });
|
||||
}
|
||||
if (stopReason) finishStopReason = stopReason;
|
||||
}
|
||||
if (retryText.trim()) {
|
||||
@@ -922,6 +1002,7 @@ export class AgentExecutor {
|
||||
&& !this.isStaleRun(runId)
|
||||
) {
|
||||
continuationCount++;
|
||||
const continuationStartMs = Date.now();
|
||||
this.webview.postMessage({ type: 'autoContinue', value: `답변이 길어 이어서 정리하는 중입니다... (${continuationCount}/${config.maxAutoContinuations})` });
|
||||
try {
|
||||
const contMsgs: ChatMessage[] = [
|
||||
@@ -929,11 +1010,24 @@ export class AgentExecutor {
|
||||
{ role: 'user', content: buildContinuationUserPrompt(originalUserPrompt, cleaned.visible) },
|
||||
];
|
||||
lastMaxOutputTokens = computeOutputBudget(estimateMessagesTokens(contMsgs), ctxLimits).maxOutputTokens;
|
||||
const cr = await this.callNonStreaming({
|
||||
baseUrl: ollamaUrl, modelName: actualModel, engine, messages: contMsgs,
|
||||
temperature, maxTokens: lastMaxOutputTokens, contextLength: ctxLimits.contextLength,
|
||||
signal: this.abortController?.signal,
|
||||
// Stream the continuation through the same channel as the main turn so
|
||||
// the user sees the answer keep growing instead of freezing for 10–30s
|
||||
// while we silently call non-streaming. The trailing streamReplace
|
||||
// (after sanitize / merge) corrects any overlap the model re-emits.
|
||||
const cr = await this.streamChatOnce({
|
||||
runId, useLmStudioSdk, engine, ollamaUrl, modelName: actualModel,
|
||||
messages: contMsgs,
|
||||
temperature,
|
||||
maxTokens: lastMaxOutputTokens,
|
||||
contextLength: ctxLimits.contextLength,
|
||||
contextOverflowPolicy: config.contextOverflowPolicy,
|
||||
signal: this.abortController!.signal,
|
||||
postLiveDeltas,
|
||||
});
|
||||
if (cr.aborted) {
|
||||
logInfo('Auto-continuation aborted mid-stream.', { model: actualModel, round: continuationCount });
|
||||
break;
|
||||
}
|
||||
finishStopReason = cr.stopReason;
|
||||
const ccl = extractVisibleFinal(cr.text);
|
||||
if (!ccl.visible.trim()) {
|
||||
@@ -944,6 +1038,15 @@ export class AgentExecutor {
|
||||
cleaned = { ...cleaned, visible: mergeContinuationParts(cleaned.visible, ccl.visible), wasThoughtOnly: false };
|
||||
lastOutputTokens = estimateTokens(ccl.visible);
|
||||
logInfo('Auto-continued the answer.', { model: actualModel, round: continuationCount, addedChars: ccl.visible.length, totalChars: cleaned.visible.length, contStopReason: cr.stopReason, contMaxTokens: lastMaxOutputTokens });
|
||||
recordTelemetry({
|
||||
kind: 'continuation',
|
||||
durationMs: Date.now() - continuationStartMs,
|
||||
model: actualModel, engine,
|
||||
outputTokens: lastOutputTokens,
|
||||
round: continuationCount,
|
||||
stopReason: cr.stopReason,
|
||||
note: `addedChars=${ccl.visible.length} mergedAdd=${cleaned.visible.length - before.length}`,
|
||||
});
|
||||
// Guard against a continuation that adds (almost) nothing new after dedup — stop instead of spinning.
|
||||
if (cleaned.visible.length - before.length < 20) {
|
||||
logInfo('Continuation added negligible new text — stopping.', { model: actualModel, round: continuationCount });
|
||||
@@ -1099,7 +1202,32 @@ export class AgentExecutor {
|
||||
value: { ...this._lastRetrievalInfo, hasAgentSelected: !!options.agentSkillFile, unaddressedChecklist },
|
||||
});
|
||||
}
|
||||
this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent });
|
||||
// Progressive answering: the bubble was filled live with raw tokens
|
||||
// during streaming (and during any auto-continuation rounds). Now
|
||||
// that we have the cleaned + merged + policy-enforced text, swap the
|
||||
// bubble's content for the final version so the user sees the
|
||||
// correct answer regardless of what slipped through live —
|
||||
// hidden reasoning, mid-stream artifacts, continuation-overlap re-
|
||||
// emits, truncation notice. Action-loop turns (loopDepth > 0) still
|
||||
// append via streamChunk because the bubble has multiple action
|
||||
// segments and we don't have a single "final" to replace with.
|
||||
if (loopDepth === 0) {
|
||||
this.webview.postMessage({ type: 'streamReplace', value: finalAssistantContent });
|
||||
recordTelemetry({
|
||||
kind: 'turn',
|
||||
durationMs: Date.now() - turnStartMs,
|
||||
model: actualModel, engine,
|
||||
inputTokens,
|
||||
outputTokens,
|
||||
contextLength: ctxLimits.contextLength,
|
||||
stopReason: finishStopReason,
|
||||
brainFiles: this._lastRetrievalInfo?.usedBrainFiles.length ?? 0,
|
||||
memoryLayers: this._lastRetrievalInfo?.usedMemoryLayers ?? [],
|
||||
note: `continuations=${continuationCount} historyDropped=${reqMessages.length - budgetedHistory.length}`,
|
||||
});
|
||||
} else {
|
||||
this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent });
|
||||
}
|
||||
|
||||
} catch (error: any) {
|
||||
this.statusBarManager.updateStatus(AgentStatus.Error, error.message);
|
||||
@@ -2309,7 +2437,7 @@ export class AgentExecutor {
|
||||
});
|
||||
}
|
||||
|
||||
private buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): string {
|
||||
private async buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): Promise<string> {
|
||||
const config = getConfig();
|
||||
this._lastRetrievalInfo = null;
|
||||
this._lastLessonContents = [];
|
||||
@@ -2331,6 +2459,44 @@ export class AgentExecutor {
|
||||
// keeping the legacy behavior intact.
|
||||
const scope = resolveScopeForAgent(agentSkillFile, activeBrain.localBrainPath);
|
||||
|
||||
// Scale retrieval/memory budget with the configured context window so
|
||||
// that raising g1nation.contextLength actually gives the RAG pipeline
|
||||
// more room. At 32K context we keep the legacy 8K total (≈3.2K
|
||||
// retrieval); at 230K we allocate ~57K total (≈23K retrieval). Capped
|
||||
// at 80K so scoring stays fast on huge contexts.
|
||||
const scaledTotalBudget = Math.min(
|
||||
80000,
|
||||
Math.max(8000, Math.floor(config.contextLength * 0.25))
|
||||
);
|
||||
|
||||
// Pull recent session summaries for the medium-term layer. We read
|
||||
// from the sidebar's persisted store directly (same key it writes to)
|
||||
// to avoid threading another callback through the agent constructor.
|
||||
const rawSessions = this.context.globalState.get<any[]>('chat_sessions', []) || [];
|
||||
const recentSessions = compactRecentSessions(
|
||||
rawSessions,
|
||||
this.currentTaskId,
|
||||
Math.max(0, config.memoryMediumTermSessions ?? 0)
|
||||
);
|
||||
|
||||
// Hybrid retrieval (optional): when the user has configured an
|
||||
// embedding model, fetch a query embedding so searchBrainFiles can
|
||||
// blend cosine similarity with TF-IDF. Time-bounded — if the
|
||||
// embedding endpoint is slow or down, we fall through with no
|
||||
// embedding and the retriever stays in pure-TF-IDF mode.
|
||||
let queryEmbedding: number[] | undefined;
|
||||
if (config.embeddingModel) {
|
||||
const EMBED_QUERY_TIMEOUT_MS = 4000;
|
||||
try {
|
||||
queryEmbedding = await Promise.race([
|
||||
embedQuery(currentPrompt, { baseUrl: config.ollamaUrl, model: config.embeddingModel }),
|
||||
new Promise<undefined>((resolve) => setTimeout(() => resolve(undefined), EMBED_QUERY_TIMEOUT_MS)),
|
||||
]);
|
||||
} catch {
|
||||
queryEmbedding = undefined;
|
||||
}
|
||||
}
|
||||
|
||||
// Use the Unified RAG Pipeline
|
||||
const result = this.retrievalOrchestrator.retrieve(currentPrompt, {
|
||||
brain: activeBrain,
|
||||
@@ -2338,13 +2504,36 @@ export class AgentExecutor {
|
||||
workspacePath,
|
||||
chatHistory: visibleHistory,
|
||||
contextBudget: {
|
||||
totalBudget: 8000,
|
||||
totalBudget: scaledTotalBudget,
|
||||
retrievalRatio: 0.4
|
||||
},
|
||||
brainFileLimit: config.memoryLongTermFiles,
|
||||
scopeFolders: scope.folders
|
||||
scopeFolders: scope.folders,
|
||||
recentSessions,
|
||||
mediumTermLimit: config.memoryMediumTermSessions ?? 0,
|
||||
queryEmbedding,
|
||||
embeddingModel: config.embeddingModel || undefined,
|
||||
embeddingBlendAlpha: config.embeddingBlendAlpha,
|
||||
});
|
||||
|
||||
// Fire-and-forget background embedding for the files we just scored.
|
||||
// Embeds only files that lack a vector for the current model — so
|
||||
// steady-state turns do no embedding work. The next turn benefits.
|
||||
if (config.embeddingModel) {
|
||||
const scoredFilePaths = result.selectedChunks
|
||||
.filter((c) => c.source === 'brain-memory' && c.metadata.filePath)
|
||||
.map((c) => c.metadata.filePath!)
|
||||
.filter((p, i, arr) => arr.indexOf(p) === i);
|
||||
if (scoredFilePaths.length > 0) {
|
||||
void backfillBrainEmbeddings(
|
||||
activeBrain.localBrainPath,
|
||||
scoredFilePaths,
|
||||
config.embeddingModel,
|
||||
(texts) => embedTexts(texts, { baseUrl: config.ollamaUrl, model: config.embeddingModel }),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Stash what actually fed this turn so handlePrompt can show it under the answer.
|
||||
const brainRoot = activeBrain.localBrainPath;
|
||||
const rel = (p?: string) => (p ? (path.relative(brainRoot, p) || p) : '');
|
||||
@@ -2406,11 +2595,74 @@ export class AgentExecutor {
|
||||
workspacePath
|
||||
);
|
||||
logInfo('Memory extraction completed for session end.', { taskId: this.currentTaskId });
|
||||
recordTelemetry({
|
||||
kind: 'session-end',
|
||||
note: `taskId=${this.currentTaskId} messages=${this.chatHistory.filter((m) => !m.internal).length}`,
|
||||
});
|
||||
// Fire-and-forget LLM compression: turns the raw transcript into a
|
||||
// 2–3 sentence summary that medium-term retrieval can use instead
|
||||
// of just "first user msg + last assistant 200 chars". Cheap call
|
||||
// (~256 output tokens), runs in the background so it never blocks
|
||||
// the next chat turn.
|
||||
void this.compressSessionSummary(this.currentTaskId, this.chatHistory.slice());
|
||||
} catch (error: any) {
|
||||
logError('Memory extraction failed on session end.', { error: error?.message || String(error) });
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Compress a finished session into a short summary and persist it to the
 * session record under the `chat_sessions` global-state key. The summary is
 * later read by `compactRecentSessions` so the medium-term memory layer
 * carries a real recap instead of a fragment.
 *
 * Skips sessions with fewer than 3 visible user/assistant messages — they're
 * typically single-question pings where the raw first message is already a
 * good summary. Failures are logged and swallowed: a missing summary just
 * falls back to the "first user msg" representation.
 *
 * @param taskId  Id of the session record to annotate.
 * @param history Snapshot of the session's chat history (internal messages
 *                are filtered out here).
 */
private async compressSessionSummary(taskId: string, history: ChatMessage[]): Promise<void> {
    const visible = history.filter((m) => !m.internal && (m.role === 'user' || m.role === 'assistant'));
    if (visible.length < 3) return;

    const cfg = getConfig();
    // Each message is flattened to one line and capped at 400 chars.
    // NOTE(review): the number of messages is not capped, so a very long
    // session could still exceed the model's context window — confirm.
    const transcript = visible
        .map((m) => `${m.role.toUpperCase()}: ${String(m.content).replace(/\s+/g, ' ').slice(0, 400)}`)
        .join('\n\n');
    const messages: ChatMessage[] = [
        {
            role: 'system',
            content: [
                'You compress chat transcripts into a 2-3 sentence summary.',
                'Capture: (1) the user\'s topic or task, (2) the main decision or answer reached, (3) any open issue.',
                'Reply in the user\'s primary language (mirror Korean ↔ English exactly as in the transcript).',
                'Reply with ONLY the summary text. No headers, no quotes, no preamble.',
            ].join(' '),
            internal: true,
        },
        { role: 'user', content: `[TRANSCRIPT]\n${transcript}\n[END]` },
    ];

    try {
        const result = await this.callNonStreaming({
            baseUrl: cfg.ollamaUrl,
            modelName: cfg.defaultModel,
            engine: resolveEngine(cfg.ollamaUrl),
            messages,
            temperature: 0.3,
            maxTokens: 256,
            contextLength: cfg.contextLength,
        });

        // Run the reply through the same visible-text extraction the chat
        // path applies to model output, so hidden reasoning emitted by
        // thinking models is not persisted as the session summary.
        const visibleReply = extractVisibleFinal(result.text || '').visible || '';
        // Models often wrap the answer in quotes/backticks despite the prompt.
        const summary = visibleReply.trim().replace(/^["'`]+|["'`]+$/g, '').trim();
        // Too short to be a useful recap — keep the legacy representation.
        if (summary.length < 12) return;

        // Re-read after the await: the store may have changed while the
        // model call was in flight.
        const sessions = this.context.globalState.get<any[]>('chat_sessions', []) || [];
        const idx = sessions.findIndex((s) => String(s?.id) === String(taskId));
        if (idx < 0) return;

        sessions[idx].summary = summary;
        await this.context.globalState.update('chat_sessions', sessions);
        logInfo('Session summary stored for medium-term recall.', { taskId, length: summary.length });
    } catch (e: any) {
        logError('Session summary compression failed.', { taskId, error: e?.message ?? String(e) });
    }
}
|
||||
|
||||
private async createStreamingRequest(params: {
|
||||
baseUrl: string;
|
||||
modelName: string;
|
||||
@@ -2568,6 +2820,134 @@ export class AgentExecutor {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Single streaming chat call used by progressive answering (the
 * auto-continuation rounds call it). It has no empty-stream recovery or
 * non-streaming fallback: errors other than aborts are logged and rethrown.
 *
 * When `postLiveDeltas` is true, every non-empty token is also forwarded to
 * the webview as a `streamChunk` message, so the user sees the answer grow
 * in real time instead of one big drop at the end.
 *
 * Two transports:
 *  - `useLmStudioSdk`: iterates `options.lmStudioStreamer.stream(...)`.
 *  - otherwise: reads the HTTP response body from `createStreamingRequest`
 *    as newline-delimited JSON (optionally `data: `-prefixed SSE lines).
 *
 * @returns The accumulated text and last seen stop reason. Aborts and stale
 *          runs surface as `aborted: true` with whatever partial text was
 *          collected — the caller decides what to do with it.
 * @throws  Any non-abort transport error, or `Error` when the HTTP response
 *          has no readable body.
 */
private async streamChatOnce(params: {
    runId: number;
    useLmStudioSdk: boolean;
    engine: 'lmstudio' | 'ollama';
    ollamaUrl: string;
    modelName: string;
    messages: ChatMessage[];
    temperature: number;
    maxTokens: number;
    contextLength: number;
    contextOverflowPolicy: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow';
    signal: AbortSignal;
    postLiveDeltas: boolean;
}): Promise<{ text: string; stopReason?: string; aborted: boolean }> {
    let accumulated = '';
    let finishStopReason: string | undefined;

    // Forward a live delta to the webview when progressive answering is on.
    const post = (token: string) => {
        if (params.postLiveDeltas && token) {
            this.webview?.postMessage({ type: 'streamChunk', value: token });
        }
    };
    // Snapshot of what has been collected so far.
    const outcome = (aborted: boolean) => ({ text: accumulated, stopReason: finishStopReason, aborted });

    if (params.useLmStudioSdk) {
        try {
            const stream = this.options.lmStudioStreamer!.stream({
                modelName: params.modelName,
                messages: params.messages.map((m) => ({ role: m.role, content: m.content })),
                temperature: params.temperature,
                maxTokens: params.maxTokens,
                contextOverflowPolicy: params.contextOverflowPolicy,
                signal: params.signal,
            });
            for await (const { token, stopReason } of stream) {
                // Returning from `for await` also closes the iterator.
                if (this.isStaleRun(params.runId)) return outcome(true);
                if (token) {
                    accumulated += token;
                    post(token);
                }
                if (stopReason) finishStopReason = stopReason;
            }
        } catch (err: any) {
            if (err?.name === 'AbortError' || params.signal.aborted) return outcome(true);
            const msg = err?.message ?? String(err);
            // The original assigned a stop reason here and then threw, so the
            // assignment was never observable. Surface the classification in
            // the log instead.
            // NOTE(review): if the intent was to return the partial text with
            // stopReason 'contextLengthReached', that needs a caller contract.
            const contextOverflow = /context\s*length|contextlengthreached|exceed|too\s*long/i.test(msg);
            logError('streamChatOnce SDK path failed.', { engine: params.engine, error: msg, contextOverflow });
            throw err;
        }
        return outcome(false);
    }

    // NOTE(review): `params.signal` is not forwarded here; presumably
    // createStreamingRequest wires up its own abort signal — confirm.
    const request = await this.createStreamingRequest({
        baseUrl: params.ollamaUrl,
        modelName: params.modelName,
        reqMessages: params.messages,
        temperature: params.temperature,
        maxTokens: params.maxTokens,
        contextLength: params.contextLength,
    });
    const reader = request.response.body?.getReader();
    if (!reader) throw new Error('Response body is not readable.');

    const decoder = new TextDecoder();
    let buffer = '';

    // Parse one line of the stream; malformed lines are logged and skipped.
    const consumeJsonLine = (line: string) => {
        const trimmed = line.trim();
        if (!trimmed || trimmed === 'data: [DONE]') return;
        try {
            const raw = trimmed.startsWith('data: ') ? trimmed.slice(6) : trimmed;
            const json = JSON.parse(raw);
            const token = params.engine === 'lmstudio'
                ? json.choices?.[0]?.delta?.content || ''
                : json.message?.content || json.response || '';
            if (token) {
                accumulated += token;
                post(token);
            }
            const fr = params.engine === 'lmstudio'
                ? json.choices?.[0]?.finish_reason
                : (json.done_reason ?? (json.done === true ? 'stop' : undefined));
            if (fr) finishStopReason = fr;
        } catch (e: any) {
            logError('streamChatOnce: failed to parse chunk.', { engine: params.engine, chunk: summarizeText(trimmed, 200), error: e?.message ?? String(e) });
        }
    };

    try {
        while (true) {
            const { done, value } = await reader.read();
            if (done) break;
            if (this.isStaleRun(params.runId)) {
                // Cancel the body stream so it is not left open after this
                // run stops reading; releasing the lock alone does not
                // cancel it.
                await reader.cancel().catch(() => undefined);
                return outcome(true);
            }
            buffer += decoder.decode(value, { stream: true });
            const lines = buffer.split('\n');
            // The last piece may be a partial line — keep it for the next read.
            buffer = lines.pop() || '';
            for (const line of lines) consumeJsonLine(line);
        }
        // Flush bytes the decoder held back for an incomplete multi-byte
        // sequence, then handle a final line with no trailing newline.
        buffer += decoder.decode();
        if (buffer.trim()) consumeJsonLine(buffer);
    } catch (err: any) {
        // Same abort detection as the SDK path: some runtimes reject the
        // read with a non-AbortError when the signal fires.
        if (err?.name === 'AbortError' || params.signal.aborted) return outcome(true);
        logError('streamChatOnce REST path failed.', { engine: params.engine, error: err?.message ?? String(err) });
        throw err;
    } finally {
        try { reader.releaseLock(); } catch { /* already released on abort */ }
    }
    return outcome(false);
}
|
||||
|
||||
private normalizeMessages(messages: ChatMessage[]) {
|
||||
return messages.map((message) => {
|
||||
const normalizedContent = typeof message.content === 'string'
|
||||
|
||||
+16
-1
@@ -45,6 +45,19 @@ export interface IAgentConfig {
|
||||
maxAutoContinuations: number;
|
||||
/** 모델이 내부 사고만 출력하고 답변이 없으면 "최종 답변만" 지시로 1회 재생성. */
|
||||
finalOnlyRetryOnThoughtLeak: boolean;
|
||||
// ─── Hybrid Semantic Search ───
|
||||
/**
|
||||
* Embedding model name as registered in LM Studio / Ollama. Empty disables
|
||||
* semantic search and the retriever falls back to TF-IDF only. The user
|
||||
* must load this model in the engine before enabling it here.
|
||||
*/
|
||||
embeddingModel: string;
|
||||
/**
|
||||
* Blend between TF-IDF (sparse) and embedding cosine (dense) scoring.
|
||||
* 0 = TF-IDF only (status quo), 1 = embedding only.
|
||||
* Default 0.5 = equal weight, a reasonable starting point.
|
||||
*/
|
||||
embeddingBlendAlpha: number;
|
||||
}
|
||||
|
||||
// ─── 경로 정규화 유틸리티 ───
|
||||
@@ -125,7 +138,9 @@ export function getConfig(): IAgentConfig {
|
||||
smallModelContextCap: Math.max(0, cfg.get<number>('smallModelContextCap', 0)),
|
||||
autoContinueOnOutputLimit: cfg.get<boolean>('autoContinueOnOutputLimit', true),
|
||||
maxAutoContinuations: Math.max(0, Math.min(10, cfg.get<number>('maxAutoContinuations', 4))),
|
||||
finalOnlyRetryOnThoughtLeak: cfg.get<boolean>('finalOnlyRetryOnThoughtLeak', true)
|
||||
finalOnlyRetryOnThoughtLeak: cfg.get<boolean>('finalOnlyRetryOnThoughtLeak', true),
|
||||
embeddingModel: (cfg.get<string>('embeddingModel', '') || '').trim(),
|
||||
embeddingBlendAlpha: Math.max(0, Math.min(1, cfg.get<number>('embeddingBlendAlpha', 0.5))),
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
+19
-3
@@ -1,8 +1,24 @@
|
||||
import * as os from 'os';
|
||||
import { logInfo, logError } from '../utils';
|
||||
|
||||
/**
 * Default concurrency = max(2, cpus - 1). Leaves one core for the VS Code UI
 * thread and the extension host, scales up on bigger boxes. Static per-process
 * (no dynamic adjustment) — kept simple because, per the original author
 * note, the heavy work (LLM calls) is gated by `missionId` locks elsewhere,
 * not the action queue.
 *
 * When the CPU count is unavailable — `os.cpus()` throws or returns an empty
 * list — 4 cores are assumed, giving a limit of 3.
 *
 * @param cpuCount Optional override for the detected logical CPU count;
 *                 defaults to `os.cpus().length`. Non-finite or non-positive
 *                 values are treated as "unknown".
 * @returns A concurrency limit that is always an integer >= 2.
 */
function defaultConcurrencyLimit(cpuCount?: number): number {
    const ASSUMED_CPUS = 4;

    let detected = cpuCount;
    if (detected === undefined) {
        try {
            detected = os.cpus().length;
        } catch {
            detected = ASSUMED_CPUS;
        }
    }

    // `length` of an empty list is 0, not nullish, so a `?? 4` fallback never
    // applies. Check for a usable count explicitly.
    const cpus = Number.isFinite(detected) && detected > 0 ? Math.floor(detected) : ASSUMED_CPUS;
    return Math.max(2, cpus - 1);
}
|
||||
|
||||
/**
|
||||
* ActionQueueManager: Manages large-scale tasks by processing them
|
||||
* with a concurrency limit to prevent resource exhaustion and I/O bottlenecks
|
||||
* while maintaining high throughput under maximum load.
|
||||
*/
|
||||
export class ActionQueueManager {
|
||||
@@ -10,7 +26,7 @@ export class ActionQueueManager {
|
||||
private activeCount: number = 0;
|
||||
private readonly concurrencyLimit: number;
|
||||
|
||||
constructor(concurrencyLimit: number = 3) {
|
||||
constructor(concurrencyLimit: number = defaultConcurrencyLimit()) {
|
||||
this.concurrencyLimit = concurrencyLimit;
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,129 @@
|
||||
/**
|
||||
* ============================================================
|
||||
* Telemetry — append-only usage events to `.astra/usage.jsonl`
|
||||
*
|
||||
* Why local-file telemetry instead of a webview dashboard or remote endpoint:
|
||||
* - Astra is local-first. No data leaves the machine.
|
||||
* - JSONL is trivial to inspect manually (`tail`, jq) and trivial to ingest
|
||||
* into a future webview chart without schema migrations.
|
||||
* - Append-only means the writer never blocks on history.
|
||||
*
|
||||
* Event shape is intentionally flat — top-level fields only (scalars plus the `memoryLayers` string list), so a future
|
||||
* dashboard can sum/group/filter without parsing nested structures.
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { getAstraDataDir } from './astraPath';
|
||||
import { logError } from '../utils';
|
||||
|
||||
/** Top-level event kinds. Add sparingly — each is a stable contract for the JSONL. */
|
||||
export type TelemetryEventKind =
|
||||
| 'turn' // one user-visible chat turn (input → final answer)
|
||||
| 'continuation' // an auto-continuation round inside a turn
|
||||
| 'retrieval' // brain + memory retrieval summary
|
||||
| 'session-end'; // session closed (used to bound aggregation queries)
|
||||
|
||||
export interface TelemetryEvent {
|
||||
kind: TelemetryEventKind;
|
||||
/** ISO timestamp. Always present so a viewer can plot on a time axis without recomputing. */
|
||||
ts: string;
|
||||
/** Wall-clock milliseconds the event took, when applicable. 0 for instantaneous events. */
|
||||
durationMs?: number;
|
||||
/** Model identifier the request was bound to, when applicable. */
|
||||
model?: string;
|
||||
/** Engine name (lmstudio | ollama), when applicable. */
|
||||
engine?: string;
|
||||
/** Input token estimate that went into this event, when applicable. */
|
||||
inputTokens?: number;
|
||||
/** Output token estimate produced by this event, when applicable. */
|
||||
outputTokens?: number;
|
||||
/** Configured context window for this event, when applicable. */
|
||||
contextLength?: number;
|
||||
/** Continuation round index for `kind: 'continuation'`. */
|
||||
round?: number;
|
||||
/** Stop reason from the engine, when applicable. */
|
||||
stopReason?: string;
|
||||
/** Brain files actually used this turn. */
|
||||
brainFiles?: number;
|
||||
/** Memory layers that contributed chunks this turn. */
|
||||
memoryLayers?: string[];
|
||||
  /** Free-form short text note (a plain string, not a nested object). Keep small — this lives in the JSONL forever. */
|
||||
note?: string;
|
||||
}
|
||||
|
||||
const MAX_FILE_BYTES = 5 * 1024 * 1024; // 5 MB → ~25k events worst case
|
||||
const ROTATE_KEEP = 2;                   // rotated archives kept (usage.1.jsonl … usage.2.jsonl) in addition to the live usage.jsonl
|
||||
|
||||
/** Absolute path of the live (not yet rotated) telemetry log inside the Astra data directory. */
function jsonlPath(): string {
  const dataDir = getAstraDataDir();
  return path.join(dataDir, 'usage.jsonl');
}
|
||||
|
||||
/**
 * Rotate the telemetry log when the live file at `p` has grown past
 * MAX_FILE_BYTES.
 *
 * Archives are shifted one slot up (usage.{N-1}.jsonl → usage.{N}.jsonl),
 * the archive in the highest slot (ROTATE_KEEP) is deleted first, and the
 * live file becomes usage.1.jsonl. Every filesystem step is best-effort:
 * individual unlink/rename failures are ignored, and a failing `stat`
 * (typically "file does not exist yet") makes the whole call a no-op.
 */
function rotateIfNeeded(p: string): void {
  try {
    const { size } = fs.statSync(p);
    if (size <= MAX_FILE_BYTES) return;

    const archivePath = (slot: number): string =>
      path.join(getAstraDataDir(), `usage.${slot}.jsonl`);

    // Walk from the oldest slot down so each rename frees the slot below it.
    let slot = ROTATE_KEEP;
    while (slot >= 1) {
      const target = archivePath(slot);
      const source = slot === 1 ? p : archivePath(slot - 1);
      if (fs.existsSync(source)) {
        const evictOldest = slot === ROTATE_KEEP && fs.existsSync(target);
        if (evictOldest) {
          try {
            fs.unlinkSync(target);
          } catch {
            /* non-fatal */
          }
        }
        try {
          fs.renameSync(source, target);
        } catch {
          /* non-fatal */
        }
      }
      slot -= 1;
    }
  } catch {
    // File doesn't exist yet — first write will create it.
  }
}
|
||||
|
||||
/**
 * Append one event to the rotating JSONL. Best-effort: failures are logged but
 * never thrown, because telemetry must not break a live chat turn.
 *
 * @param event Event payload. `ts` is optional; when it is missing, `undefined`
 *   or an empty string, the current time (ISO-8601) is used, so every written
 *   line carries a usable timestamp.
 */
export function recordTelemetry(event: Omit<TelemetryEvent, 'ts'> & { ts?: string }): void {
  try {
    // Pull `ts` out before spreading: with `{ ts: now, ...event }` an explicit
    // `ts: undefined` would overwrite the generated timestamp and
    // JSON.stringify would then drop the field from the line entirely.
    const { ts, ...rest } = event;
    const stamp = typeof ts === 'string' && ts.length > 0 ? ts : new Date().toISOString();
    const full: TelemetryEvent = { ts: stamp, ...rest };
    const line = JSON.stringify(full) + '\n';
    const p = jsonlPath();
    rotateIfNeeded(p);
    // Asynchronous append so the caller never waits on disk I/O.
    fs.appendFile(p, line, { encoding: 'utf8' }, (err) => {
      if (err) logError('Telemetry append failed.', { error: err.message });
    });
  } catch (e: unknown) {
    // Final safety net — telemetry must never escape.
    const message = e instanceof Error ? e.message : String(e);
    logError('Telemetry recordTelemetry threw.', { error: message });
  }
}
|
||||
|
||||
/**
 * Read the most recent `limit` events from the live usage file and its rotated
 * archives, returned in chronological order (oldest first, newest last).
 *
 * Files are visited newest-first (usage.jsonl, usage.1.jsonl, …) and reading
 * stops as soon as enough events have been collected, so old archives are only
 * touched when the newer files cannot satisfy `limit`. Unreadable files and
 * unparsable lines are skipped.
 *
 * @param limit Maximum number of events to return. Values that are not
 *   greater than zero (including NaN) yield an empty array.
 * @returns Up to `limit` events, oldest first.
 */
export function readRecentTelemetry(limit = 500): TelemetryEvent[] {
  // `slice(-0)` would return the whole array, so reject non-positive limits up front.
  if (!(limit > 0)) return [];

  const dir = getAstraDataDir();
  const newestFirst: string[] = [path.join(dir, 'usage.jsonl')];
  for (let i = 1; i <= ROTATE_KEEP; i++) {
    newestFirst.push(path.join(dir, `usage.${i}.jsonl`));
  }

  // One bucket per file, in newest-file-first order; each bucket is itself
  // chronological because the writer only appends.
  const buckets: TelemetryEvent[][] = [];
  let collected = 0;
  for (const file of newestFirst) {
    if (collected >= limit) break;
    if (!fs.existsSync(file)) continue;
    let raw: string;
    try {
      raw = fs.readFileSync(file, 'utf8');
    } catch {
      continue; // skip unreadable file
    }
    const events: TelemetryEvent[] = [];
    for (const line of raw.split('\n')) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const parsed: unknown = JSON.parse(trimmed);
        // Only object rows can be events; ignore stray scalars.
        if (typeof parsed === 'object' && parsed !== null) events.push(parsed as TelemetryEvent);
      } catch {
        /* skip bad line */
      }
    }
    buckets.push(events);
    collected += events.length;
  }

  // Stitch the buckets back together oldest-file-first so that the tail of
  // the combined list really is the most recent activity.
  const out: TelemetryEvent[] = [];
  for (let i = buckets.length - 1; i >= 0; i--) {
    for (const ev of buckets[i]) out.push(ev);
  }
  return out.slice(-limit);
}
|
||||
+19
-4
@@ -119,11 +119,13 @@ export async function activate(context: vscode.ExtensionContext) {
|
||||
);
|
||||
|
||||
// 3. Initialize Approval subsystem (queue + panel webview + status bar badge)
|
||||
// Astra 2.81: sidebar view container is gone; all webviews open in editor
|
||||
// column 3 instead. We don't register a WebviewViewProvider — panels are
|
||||
// created on-demand via openAsPanel().
|
||||
const approvalQueue = new ApprovalQueue();
|
||||
const approvalPanel = new ApprovalPanelProvider(context.extensionUri, approvalQueue);
|
||||
const approvalStatusBar = new ApprovalStatusBar(approvalQueue);
|
||||
context.subscriptions.push(
|
||||
vscode.window.registerWebviewViewProvider(ApprovalPanelProvider.viewType, approvalPanel),
|
||||
approvalStatusBar,
|
||||
{ dispose: () => approvalQueue.dispose() },
|
||||
vscode.commands.registerCommand(ApprovalStatusBar.focusCommand, () => approvalPanel.focus()),
|
||||
@@ -140,14 +142,16 @@ export async function activate(context: vscode.ExtensionContext) {
|
||||
approvalQueue,
|
||||
});
|
||||
|
||||
// 4. Initialize Sidebar Provider
|
||||
// 4. Initialize Chat Provider (renders into an editor column, not a sidebar view)
|
||||
provider = new SidebarChatProvider(context.extensionUri, context, agent, {
|
||||
lifecycle,
|
||||
activity: activityTracker,
|
||||
loadedModels: () => lmStudioClient.listLoadedCached(),
|
||||
});
|
||||
context.subscriptions.push(
|
||||
vscode.window.registerWebviewViewProvider(SidebarChatProvider.viewType, provider)
|
||||
vscode.commands.registerCommand('g1nation.openChat', () => {
|
||||
provider!.openAsPanel(vscode.ViewColumn.Three);
|
||||
})
|
||||
);
|
||||
|
||||
// 4. Initialize Bridge Server (Port 4825)
|
||||
@@ -559,7 +563,6 @@ export async function activate(context: vscode.ExtensionContext) {
|
||||
telegramBot,
|
||||
});
|
||||
context.subscriptions.push(
|
||||
vscode.window.registerWebviewViewProvider(SettingsPanelProvider.viewType, settingsPanel),
|
||||
// Refresh the settings UI whenever any g1nation.* config changes (toggle, allowedChatIds, …).
|
||||
vscode.workspace.onDidChangeConfiguration((e) => {
|
||||
if (e.affectsConfiguration('g1nation')) void settingsPanel.refresh();
|
||||
@@ -628,6 +631,18 @@ export async function activate(context: vscode.ExtensionContext) {
|
||||
if (!setupComplete) {
|
||||
await runInitialSetup(context);
|
||||
}
|
||||
|
||||
// 7. Auto-open all three Astra webviews as tabs in editor column 3.
|
||||
// The sidebar/activity-bar entry point was removed in 2.81 — all three views
|
||||
// (Chat, Approvals, Settings) now stack as tabs in the third editor column.
|
||||
// Order matters: Chat opens last so it ends up as the active tab.
|
||||
try {
|
||||
approvalPanel.openAsPanel(vscode.ViewColumn.Three);
|
||||
await settingsPanel.openAsPanel(vscode.ViewColumn.Three);
|
||||
provider!.openAsPanel(vscode.ViewColumn.Three);
|
||||
} catch (e) {
|
||||
logError('Failed to auto-open Astra panels.', e);
|
||||
}
|
||||
}
|
||||
|
||||
export async function deactivate() {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import * as vscode from 'vscode';
|
||||
import { ApprovalQueue, Approval } from './approvalQueue';
|
||||
import { wrapPanelAsView } from '../../sidebarProvider';
|
||||
|
||||
/**
|
||||
* A small webview view that surfaces the currently pending approval, separate
|
||||
@@ -14,6 +15,7 @@ export class ApprovalPanelProvider implements vscode.WebviewViewProvider {
|
||||
public static readonly viewType = 'g1nation-approval-panel';
|
||||
|
||||
private _view?: vscode.WebviewView;
|
||||
private _panel?: vscode.WebviewPanel;
|
||||
private _subscription?: vscode.Disposable;
|
||||
|
||||
constructor(
|
||||
@@ -22,6 +24,32 @@ export class ApprovalPanelProvider implements vscode.WebviewViewProvider {
|
||||
) {}
|
||||
|
||||
public resolveWebviewView(view: vscode.WebviewView): void {
|
||||
this._initView(view);
|
||||
}
|
||||
|
||||
/** Open the approvals UI as an editor panel (Column 3 by default). */
|
||||
public openAsPanel(column: vscode.ViewColumn = vscode.ViewColumn.Three): vscode.WebviewPanel {
|
||||
if (this._panel) {
|
||||
this._panel.reveal(column);
|
||||
return this._panel;
|
||||
}
|
||||
const panel = vscode.window.createWebviewPanel(
|
||||
ApprovalPanelProvider.viewType,
|
||||
'Pending Approvals',
|
||||
column,
|
||||
{ enableScripts: true, localResourceRoots: [this._extensionUri], retainContextWhenHidden: true }
|
||||
);
|
||||
this._panel = panel;
|
||||
const adapter = wrapPanelAsView(panel);
|
||||
panel.onDidDispose(() => {
|
||||
if (this._panel === panel) this._panel = undefined;
|
||||
if (this._view === adapter) this._view = undefined;
|
||||
});
|
||||
this._initView(adapter);
|
||||
return panel;
|
||||
}
|
||||
|
||||
private _initView(view: vscode.WebviewView): void {
|
||||
this._view = view;
|
||||
view.webview.options = { enableScripts: true, localResourceRoots: [this._extensionUri] };
|
||||
view.webview.html = this._render(this._queue.current());
|
||||
@@ -40,13 +68,17 @@ export class ApprovalPanelProvider implements vscode.WebviewViewProvider {
|
||||
view.onDidDispose(() => {
|
||||
this._subscription?.dispose();
|
||||
this._subscription = undefined;
|
||||
this._view = undefined;
|
||||
if (this._view === view) this._view = undefined;
|
||||
});
|
||||
}
|
||||
|
||||
/** Bring the panel into focus; used by the status bar badge. */
|
||||
public focus(): void {
|
||||
void vscode.commands.executeCommand(`${ApprovalPanelProvider.viewType}.focus`);
|
||||
if (this._panel) {
|
||||
this._panel.reveal(this._panel.viewColumn ?? vscode.ViewColumn.Three);
|
||||
return;
|
||||
}
|
||||
this.openAsPanel();
|
||||
}
|
||||
|
||||
private _render(approval: Approval | null): string {
|
||||
|
||||
@@ -123,45 +123,23 @@ export class SettingsPanelProvider implements vscode.WebviewViewProvider {
|
||||
}
|
||||
|
||||
public async focus(): Promise<void> {
|
||||
// Reveal the Astra activity-bar container so a focus() doesn't silently
|
||||
// no-op against a collapsed sidebar.
|
||||
try {
|
||||
await vscode.commands.executeCommand('workbench.view.extension.g1nation-sidebar');
|
||||
} catch {
|
||||
// Older VS Code versions may not expose this command.
|
||||
}
|
||||
try {
|
||||
await vscode.commands.executeCommand(`${SettingsPanelProvider.viewType}.focus`);
|
||||
} catch (e: any) {
|
||||
// The view-focus command is auto-generated only when VS Code parsed
|
||||
// the package.json `views` entry. If a stale .vsix is installed
|
||||
// (or the user hasn't reloaded after a fresh install) the command
|
||||
// is missing and we hit `command not found`. Fall back to a
|
||||
// floating panel so the user still gets the same UI.
|
||||
if (this._isCommandNotFound(e)) {
|
||||
logInfo('Settings view command missing — opening as floating panel.');
|
||||
await this.openAsPanel();
|
||||
return;
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
await this.openAsPanel();
|
||||
}
|
||||
|
||||
/**
|
||||
* Open the same settings UI as a stand-alone editor panel. Used when the
|
||||
* sidebar `WebviewView` isn't registered yet (e.g. user installed a fresh
|
||||
* .vsix without reloading) — keeps the feature reachable without forcing
|
||||
* the user back through `vsce package` cycles.
|
||||
* Open the settings UI as a stand-alone editor panel (Column 3 by default).
|
||||
* Astra's sidebar view container was removed in 2.81 — all three webviews
|
||||
* (Chat, Approvals, Settings) now live in the editor area.
|
||||
*/
|
||||
public async openAsPanel(): Promise<void> {
|
||||
public async openAsPanel(column: vscode.ViewColumn = vscode.ViewColumn.Three): Promise<vscode.WebviewPanel> {
|
||||
if (this._panel) {
|
||||
this._panel.reveal(vscode.ViewColumn.Active);
|
||||
return;
|
||||
this._panel.reveal(column);
|
||||
return this._panel;
|
||||
}
|
||||
const panel = vscode.window.createWebviewPanel(
|
||||
'g1nation-settings-panel-floating',
|
||||
SettingsPanelProvider.viewType,
|
||||
'Astra Settings',
|
||||
vscode.ViewColumn.Active,
|
||||
column,
|
||||
{ enableScripts: true, localResourceRoots: [this._deps.extensionUri], retainContextWhenHidden: true }
|
||||
);
|
||||
this._panel = panel;
|
||||
@@ -169,11 +147,7 @@ export class SettingsPanelProvider implements vscode.WebviewViewProvider {
|
||||
panel.onDidDispose(() => { this._panel = undefined; });
|
||||
await this._refreshState();
|
||||
void this._fetchModelsAndRefresh();
|
||||
}
|
||||
|
||||
private _isCommandNotFound(e: unknown): boolean {
|
||||
const msg = (e as any)?.message ?? String(e ?? '');
|
||||
return /command\s+'.+'\s+not found/i.test(msg);
|
||||
return panel;
|
||||
}
|
||||
|
||||
/** Re-pull state from sources of truth and broadcast to the webview. */
|
||||
|
||||
@@ -17,7 +17,10 @@ import { tokenize, countConflictIndicators } from './scoring';
|
||||
import { detectLessonKind } from './lessonHelpers';
|
||||
import { logInfo } from '../utils';
|
||||
|
||||
const INDEX_VERSION = 3;
|
||||
// v4 adds optional per-file `embedding` for hybrid (sparse+dense) retrieval.
|
||||
// Older v3 indexes are auto-rebuilt on first load — no migration needed because
|
||||
// the cache is derivable from the brain itself.
|
||||
const INDEX_VERSION = 4;
|
||||
const INDEX_DIR = '.astra';
|
||||
const INDEX_FILE = 'brain-index.json';
|
||||
/** When the index grows past this many entries, entries not seen in the current scan are pruned (prevents deleted files from accumulating). */
|
||||
@@ -34,6 +37,14 @@ interface IndexEntry {
|
||||
titleTokens: string[]; // tokenize(title)
|
||||
conflictCount: number; // countConflictIndicators(`${title} ${content}`)
|
||||
kind: string; // '' for an ordinary note, else 'lesson' | 'playbook' | 'qa-finding'
|
||||
/**
|
||||
* Dense embedding for hybrid retrieval. Populated lazily by a background
|
||||
* pass after the file is tokenized — TF-IDF queries don't wait on it.
|
||||
* Cleared when mtimeMs/size change because the content moved on.
|
||||
*/
|
||||
embedding?: number[];
|
||||
/** Embedding model the vector was produced with — invalidates the vector when the user switches models. */
|
||||
embeddingModel?: string;
|
||||
}
|
||||
|
||||
interface PersistedIndex {
|
||||
@@ -212,6 +223,93 @@ export function getBrainTokenIndex(brainPath: string, files: string[]): IndexedB
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
 * Collect the cached dense vectors for `filePaths` that were produced with
 * `model`.
 *
 * Only the in-memory index state for `brainPath` is consulted. A file is
 * included when its entry has a non-empty array `embedding` whose
 * `embeddingModel` equals `model`; every other file is left out of the
 * result without error.
 *
 * @returns Map of file path → embedding; empty when any argument is
 *   empty/blank or no index state exists for `brainPath`.
 */
export function getBrainEmbeddings(brainPath: string, filePaths: string[], model: string): Map<string, number[]> {
    const vectors = new Map<string, number[]>();

    const argsUsable =
        Boolean(brainPath) &&
        model.trim().length > 0 &&
        Array.isArray(filePaths) &&
        filePaths.length > 0;
    if (!argsUsable) return vectors;

    const state = _states.get(brainPath);
    if (!state) return vectors;

    for (const filePath of filePaths) {
        const cached = state.index.entries[filePath];
        const vec = cached?.embedding;
        if (!vec) continue;
        if (cached?.embeddingModel !== model) continue;
        if (Array.isArray(vec) && vec.length > 0) {
            vectors.set(filePath, vec);
        }
    }
    return vectors;
}
|
||||
|
||||
/**
 * Background fill: embed every file in `filePaths` that has no usable vector
 * for `model` yet, store the vectors on the in-memory index entries and
 * schedule an index write. Designed to be fire-and-forget — failures are
 * logged and swallowed (the function resolves to 0).
 *
 * The embedding input is built from the cached title + tokens; the file is
 * read from disk only when an entry has no tokens.
 *
 * `embedFn` is called once with all pending texts and must return exactly one
 * vector per text, in the same order. A response of any other length is
 * treated as a failure, because vectors are matched to files by position and
 * a short response would attach vectors to the wrong files.
 *
 * @param brainPath Brain root used as the key of the in-memory index state.
 * @param filePaths Candidate files; paths unknown to the index are ignored.
 * @param model     Embedding model name recorded alongside each vector.
 * @param embedFn   Batch embedding function (texts → vectors).
 * @returns Number of files whose embedding was actually stored (0 when
 *   nothing was pending, the model is blank, or the call failed).
 */
export async function backfillBrainEmbeddings(
    brainPath: string,
    filePaths: string[],
    model: string,
    embedFn: (texts: string[]) => Promise<number[][]>,
): Promise<number> {
    if (!brainPath || !model.trim() || !Array.isArray(filePaths) || filePaths.length === 0) return 0;
    const st = _states.get(brainPath);
    if (!st) return 0;

    const texts: string[] = [];
    const keys: string[] = [];
    // De-duplicate so a path listed twice is not embedded twice.
    for (const fp of new Set(filePaths)) {
        const entry = st.index.entries[fp];
        if (!entry) continue;
        // An empty array is not a usable vector (readers reject it), so it
        // must not count as "already embedded".
        const hasCurrentVector =
            Array.isArray(entry.embedding) &&
            entry.embedding.length > 0 &&
            entry.embeddingModel === model;
        if (hasCurrentVector) continue;

        // Prefer cached tokens (cheaper than re-reading the file); fall back
        // to the file content only when the tokens are missing.
        let text = '';
        if (Array.isArray(entry.tokens) && entry.tokens.length > 0) {
            text = `${entry.title}\n${entry.tokens.join(' ')}`;
        } else {
            try { text = fs.readFileSync(fp, 'utf8'); } catch { continue; }
        }
        if (!text.trim()) continue;
        texts.push(text);
        keys.push(fp);
    }
    if (texts.length === 0) return 0;

    try {
        const vectors = await embedFn(texts);
        if (!Array.isArray(vectors) || vectors.length !== keys.length) {
            const got = Array.isArray(vectors) ? vectors.length : 0;
            throw new Error(`Embedding count mismatch: expected ${keys.length}, got ${got}.`);
        }
        let embedded = 0;
        for (let i = 0; i < keys.length; i++) {
            const v = vectors[i];
            if (!Array.isArray(v) || v.length === 0) continue;
            // The entry may have been pruned while the embedding call was in flight.
            const entry = st.index.entries[keys[i]];
            if (!entry) continue;
            entry.embedding = v;
            entry.embeddingModel = model;
            embedded++;
        }
        if (embedded > 0) {
            st.dirty = true;
            logInfo('Brain embeddings backfilled.', { brainPath, model, embedded });
            scheduleWrite(st, brainPath);
        }
        return embedded;
    } catch (e: unknown) {
        const message = e instanceof Error ? e.message : String(e);
        logInfo('Brain embedding backfill failed (TF-IDF still works).', { brainPath, model, error: message });
        return 0;
    }
}
|
||||
|
||||
/** Drop the in-memory index (and pending write) for one brain, or all brains. The disk file is left as-is. */
|
||||
export function clearBrainTokenIndex(brainPath?: string): void {
|
||||
if (brainPath === undefined) {
|
||||
|
||||
@@ -101,6 +101,7 @@ export function assembleContext(chunks: RetrievalChunk[]): string {
|
||||
'brain-trace': '📚 Second Brain Knowledge',
|
||||
'brain-memory': '📚 Brain Knowledge',
|
||||
'long-term-memory': '🧠 Long-Term Memory (사용자 규칙/결정)',
|
||||
'medium-term-memory': '🗂️ Medium-Term Memory (최근 세션 요약)',
|
||||
'project-memory': '📂 Project Memory (프로젝트 컨텍스트)',
|
||||
'procedural-memory': '📋 Procedural Memory (반복 절차)',
|
||||
'episodic-memory': '📖 Episodic Memory (과거 대화 흐름)',
|
||||
|
||||
@@ -0,0 +1,167 @@
|
||||
/**
|
||||
* ============================================================
|
||||
* Embeddings — local hybrid (sparse + dense) retrieval support
|
||||
*
|
||||
* TF-IDF is fast and zero-cost but misses synonyms / paraphrase. A small local
|
||||
* embedding model (BGE-small, multilingual-e5-small, nomic-embed-text, …)
|
||||
* loaded in LM Studio or Ollama bridges that gap without sending anything
|
||||
* off the machine.
|
||||
*
|
||||
* Design choices:
|
||||
* - Opt-in via g1nation.embeddingModel (empty = disabled). We don't auto-
|
||||
* pick a model because the user has to load it in LM Studio/Ollama first.
|
||||
* - Calls are best-effort: a missing model / network blip falls back to
|
||||
* pure TF-IDF without breaking the query.
|
||||
* - We never block retrieval on embedding work. Missing-file embeddings are
|
||||
* populated by a separate fire-and-forget pass after the TF-IDF answer
|
||||
* ships, so the *next* query benefits.
|
||||
*
|
||||
* Numerical format:
|
||||
* - Vectors are `number[]` (not Float32Array) so they JSON-serialize for
|
||||
* the brain-index cache without per-element conversion. The hot loop
|
||||
* (cosine) is small enough that the extra precision is irrelevant to
|
||||
* throughput on typical brain sizes.
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
import { resolveEngine, buildApiUrl, logError, logInfo } from '../utils';
|
||||
|
||||
/** Maximum characters of a single text chunk fed to the embedding model. */
|
||||
const EMBED_INPUT_CAP = 4000;
|
||||
/** Maximum texts per embedding API call. */
|
||||
const BATCH_SIZE = 16;
|
||||
/** Request timeout for one embedding batch. */
|
||||
const REQ_TIMEOUT_MS = 30000;
|
||||
|
||||
export interface EmbeddingCallOptions {
|
||||
/** OpenAI-compatible base URL (e.g. http://127.0.0.1:1234 for LM Studio). */
|
||||
baseUrl: string;
|
||||
/** Embedding model name as registered in LM Studio / Ollama. Empty disables. */
|
||||
model: string;
|
||||
/** AbortSignal for cancellation propagation. */
|
||||
signal?: AbortSignal;
|
||||
}
|
||||
|
||||
/**
 * Pull the embedding vectors out of one endpoint response.
 *
 * Accepted shapes:
 *  - OpenAI-compatible: `{ data: [{ embedding: [...] }, ...] }`
 *  - Ollama (batch):    `{ embeddings: [[...], ...] }`
 *  - Ollama (single):   `{ embedding: [...] }`
 *
 * Rows that do not carry an array are skipped; the caller validates the count.
 */
function extractEmbeddingVectors(json: unknown): number[][] {
    const payload = (json ?? {}) as { data?: unknown; embeddings?: unknown; embedding?: unknown };
    const vectors: number[][] = [];
    if (Array.isArray(payload.data)) {
        for (const row of payload.data) {
            const v = (row as { embedding?: unknown } | null | undefined)?.embedding;
            if (Array.isArray(v)) vectors.push(v as number[]);
        }
    } else if (Array.isArray(payload.embeddings)) {
        for (const v of payload.embeddings) {
            if (Array.isArray(v)) vectors.push(v as number[]);
        }
    } else if (Array.isArray(payload.embedding)) {
        vectors.push(payload.embedding as number[]);
    }
    return vectors;
}

/**
 * Embed a batch of texts. Returns exactly one vector per input, in input
 * order. Throws if a request fails, times out, is aborted, or the endpoint
 * returns a different number of vectors than inputs — callers wrap with
 * try/catch and fall back to TF-IDF.
 *
 * Texts are clipped with `clipForEmbedding` and sent in chunks of BATCH_SIZE.
 * Each request is limited to REQ_TIMEOUT_MS; the timeout applies whether or
 * not the caller supplies `opts.signal`, and a caller abort cancels the
 * in-flight request.
 *
 * The engine is chosen by `resolveEngine(opts.baseUrl)` and only affects the
 * URL built by `buildApiUrl`; the request body is the same for every engine.
 *
 * @param texts Inputs to embed; an empty list resolves to `[]` without a request.
 * @param opts  Endpoint, model and optional cancellation signal.
 * @throws Error when the model is blank, the endpoint responds with a non-2xx
 *   status, or the response does not contain one vector per input.
 */
export async function embedTexts(texts: string[], opts: EmbeddingCallOptions): Promise<number[][]> {
    if (!opts.model.trim()) throw new Error('Embedding model not configured.');
    if (!texts || texts.length === 0) return [];
    const engine = resolveEngine(opts.baseUrl);
    const url = buildApiUrl(opts.baseUrl, engine, 'embeddings');
    const out: number[][] = [];
    for (let i = 0; i < texts.length; i += BATCH_SIZE) {
        const batch = texts.slice(i, i + BATCH_SIZE).map((t) => clipForEmbedding(t));
        const body = { model: opts.model, input: batch };

        // Always own a controller so the timeout is enforced even when the
        // caller passes a signal; the caller's abort is forwarded into it.
        const controller = new AbortController();
        const forwardAbort = (): void => controller.abort();
        if (opts.signal?.aborted) {
            controller.abort();
        } else {
            opts.signal?.addEventListener('abort', forwardAbort, { once: true });
        }
        const timer = setTimeout(() => controller.abort(), REQ_TIMEOUT_MS);
        try {
            const response = await fetch(url, {
                method: 'POST',
                headers: { 'Content-Type': 'application/json' },
                body: JSON.stringify(body),
                signal: controller.signal,
            });
            if (!response.ok) {
                const errText = await response.text().catch(() => '');
                throw new Error(`Embedding endpoint returned ${response.status}: ${errText.slice(0, 200)}`);
            }
            const json: unknown = await response.json();
            const vectors = extractEmbeddingVectors(json);
            // Callers match vectors to inputs by position, so a short or
            // long response must fail loudly instead of shifting the mapping.
            if (vectors.length !== batch.length) {
                throw new Error(`Embedding endpoint returned ${vectors.length} vectors for ${batch.length} inputs.`);
            }
            for (const v of vectors) out.push(v);
        } finally {
            clearTimeout(timer);
            opts.signal?.removeEventListener('abort', forwardAbort);
        }
    }
    return out;
}
|
||||
|
||||
/**
 * Cosine similarity of two vectors, in the range [-1, 1].
 *
 * Intended for equal-length vectors; if the lengths differ, only the shared
 * leading components are compared. Returns 0 — "no signal" — when either
 * vector is missing, empty or all-zero, and also when the computation is not
 * a finite number (for example a NaN or Infinity component), so a bad vector
 * can never inject NaN into downstream score sorting.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns Similarity clamped to [-1, 1], or 0 when it cannot be computed.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
    if (!a || !b || a.length === 0 || b.length === 0) return 0;
    const n = Math.min(a.length, b.length);
    let dot = 0, na = 0, nb = 0;
    for (let i = 0; i < n; i++) {
        const va = a[i], vb = b[i];
        dot += va * vb;
        na += va * va;
        nb += vb * vb;
    }
    if (na === 0 || nb === 0) return 0;
    const cos = dot / (Math.sqrt(na) * Math.sqrt(nb));
    // NaN/Infinity arise from non-finite components or overflow; treat as no signal.
    if (!Number.isFinite(cos)) return 0;
    // Floating-point rounding can land a hair outside the mathematical range.
    return Math.max(-1, Math.min(1, cos));
}
|
||||
|
||||
/**
 * Clip a text to at most EMBED_INPUT_CAP UTF-16 code units before it is sent
 * to the embedding model.
 *
 * If the cut would fall between the two halves of a surrogate pair, one more
 * code unit is dropped so the result never ends in a lone high surrogate
 * (which is not valid text for the JSON request body).
 *
 * @param text Input text; falsy input yields an empty string.
 * @returns The text unchanged when it fits, otherwise a prefix of it.
 */
function clipForEmbedding(text: string): string {
    if (!text) return '';
    if (text.length <= EMBED_INPUT_CAP) return text;
    let end = EMBED_INPUT_CAP;
    const lastUnit = text.charCodeAt(end - 1);
    // 0xD800–0xDBFF is the high-surrogate range; its partner sits just past the cut.
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
    return text.slice(0, end);
}
|
||||
|
||||
/**
 * Tiny LRU for query embeddings: typing the same query twice (or retrying)
 * shouldn't re-hit the embedding endpoint. Keyed on model + text.
 *
 * Capped at QUERY_CACHE_MAX entries; the least recently used entry is evicted.
 * Recency is tracked through Map insertion order: both a cache hit and a
 * write move the key to the newest position. Strictly process-local (no disk
 * persistence) because the query strings are short and the gains across
 * restarts are marginal.
 */
const QUERY_CACHE_MAX = 32;
const _queryCache = new Map<string, number[]>();

/** Build the cache key. NUL separates the parts so `model`/`text` containing `|` cannot collide. */
function queryCacheKey(model: string, text: string): string { return `${model}\u0000${text}`; }

/**
 * Look up a cached query embedding and mark it as most recently used.
 *
 * @returns The cached vector, or `undefined` on a miss.
 */
export function getCachedQueryEmbedding(model: string, text: string): number[] | undefined {
    const k = queryCacheKey(model, text);
    const v = _queryCache.get(k);
    if (!v) return undefined;
    // refresh recency
    _queryCache.delete(k);
    _queryCache.set(k, v);
    return v;
}

/**
 * Store (or replace) a query embedding as the most recently used entry,
 * evicting the least recently used one when the cache exceeds QUERY_CACHE_MAX.
 */
export function setCachedQueryEmbedding(model: string, text: string, vec: number[]): void {
    const k = queryCacheKey(model, text);
    // `Map.set` on an existing key keeps its old insertion position, so delete
    // first — otherwise a freshly updated entry could be the next one evicted.
    _queryCache.delete(k);
    _queryCache.set(k, vec);
    if (_queryCache.size > QUERY_CACHE_MAX) {
        const oldest = _queryCache.keys().next().value;
        if (oldest !== undefined) _queryCache.delete(oldest);
    }
}
|
||||
|
||||
/**
 * Produce the embedding for one query string, consulting the in-process
 * query cache first.
 *
 * Resolves to `undefined` when the model or the text is blank, when the
 * endpoint call throws (the error is logged, not rethrown), or when no
 * non-empty vector comes back — callers treat that as "semantic scoring
 * unavailable for this turn, fall back to TF-IDF". A freshly computed vector
 * is stored in the cache before it is returned.
 */
export async function embedQuery(text: string, opts: EmbeddingCallOptions): Promise<number[] | undefined> {
    const modelBlank = !opts.model.trim();
    if (modelBlank || !text.trim()) return undefined;

    const hit = getCachedQueryEmbedding(opts.model, text);
    if (hit) return hit;

    try {
        const vectors = await embedTexts([text], opts);
        const first = vectors[0];
        if (!first || first.length === 0) return undefined;
        setCachedQueryEmbedding(opts.model, text, first);
        logInfo('Query embedding computed.', { model: opts.model, dim: first.length });
        return first;
    } catch (e: any) {
        logError('Query embedding failed.', { model: opts.model, error: e?.message ?? String(e) });
        return undefined;
    }
}
|
||||
+163
-10
@@ -19,15 +19,32 @@ import { findBrainFiles, summarizeText } from '../utils';
|
||||
import { isInside } from '../lib/paths';
|
||||
import { MemoryManager } from '../memory';
|
||||
import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types';
|
||||
import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
|
||||
import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt, extractBestSection } from './scoring';
|
||||
import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
|
||||
import { getBrainTokenIndex } from './brainIndex';
|
||||
import { getBrainTokenIndex, getBrainEmbeddings } from './brainIndex';
|
||||
import { extractLessonEssence } from './lessonHelpers';
|
||||
import { cosineSimilarity } from './embeddings';
|
||||
|
||||
export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring';
|
||||
export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget';
|
||||
export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex';
|
||||
export * from './types';
|
||||
|
||||
/** Compact summary of a past chat session for medium-term memory retrieval. */
|
||||
export interface RecentSessionSummary {
|
||||
id: string;
|
||||
title: string;
|
||||
firstUserMsg: string;
|
||||
lastAssistantExcerpt: string;
|
||||
/**
|
||||
* Optional LLM-compressed recap stored at session end (~200 chars).
|
||||
* When present, retrieval uses this instead of the firstUserMsg+tail
|
||||
* fragment because it actually captures the decision/outcome.
|
||||
*/
|
||||
summary?: string;
|
||||
timestamp: number;
|
||||
}
|
||||
|
||||
interface RetrievalOptions {
|
||||
brain: BrainProfile;
|
||||
memoryManager: MemoryManager;
|
||||
@@ -44,6 +61,26 @@ interface RetrievalOptions {
|
||||
* silently dropped by the caller (see `agentKnowledgeMap.resolveScopeForAgent`).
|
||||
*/
|
||||
scopeFolders?: string[];
|
||||
/**
|
||||
* Compact summaries of recently-touched chat sessions (excluding the
|
||||
* active one). Scored against the query and the top `mediumTermLimit`
|
||||
* are injected as medium-term memory chunks. Caller pre-computes these
|
||||
* to avoid threading vscode/ExtensionContext through this module.
|
||||
*/
|
||||
recentSessions?: RecentSessionSummary[];
|
||||
/** Max number of medium-term session chunks to include after scoring. */
|
||||
mediumTermLimit?: number;
|
||||
/**
|
||||
* Optional query embedding for hybrid (sparse+dense) brain search. When
|
||||
* provided, each candidate file's cached embedding is cosine-matched and
|
||||
* blended with the TF-IDF score by `embeddingBlendAlpha`. Caller computes
|
||||
* this once per turn so we don't pay the embedding RTT inside scoring.
|
||||
*/
|
||||
queryEmbedding?: number[];
|
||||
/** Embedding model name (used as a cache key on the brain index side). */
|
||||
embeddingModel?: string;
|
||||
/** Blend weight: 0 = TF-IDF only, 1 = cosine only. Default 0.5. */
|
||||
embeddingBlendAlpha?: number;
|
||||
}
|
||||
|
||||
export class RetrievalOrchestrator {
|
||||
@@ -60,7 +97,7 @@ export class RetrievalOrchestrator {
|
||||
fusionLog.push(`Query tokens: [${queryTokens.slice(0, 10).join(', ')}]`);
|
||||
fusionLog.push(`Expanded tokens: [${expandedTokens.slice(0, 15).join(', ')}]`);
|
||||
|
||||
// ── ① Brain File Search (TF-IDF enhanced) ──
|
||||
// ── ① Brain File Search (TF-IDF enhanced, optionally hybrid with embeddings) ──
|
||||
const scopeFolders = options.scopeFolders ?? [];
|
||||
const brainChunks = this.searchBrainFiles(
|
||||
query,
|
||||
@@ -68,7 +105,10 @@ export class RetrievalOrchestrator {
|
||||
options.brain,
|
||||
options.brainFileLimit || 8,
|
||||
options.includeRawConversations || false,
|
||||
scopeFolders
|
||||
scopeFolders,
|
||||
options.queryEmbedding,
|
||||
options.embeddingModel,
|
||||
options.embeddingBlendAlpha
|
||||
);
|
||||
allChunks.push(...brainChunks);
|
||||
fusionLog.push(
|
||||
@@ -87,6 +127,15 @@ export class RetrievalOrchestrator {
|
||||
allChunks.push(...memoryChunks);
|
||||
fusionLog.push(`Memory search: ${memoryChunks.length} chunks found`);
|
||||
|
||||
// ── ②-b Medium-Term Memory (recent sessions) ──
|
||||
const mediumChunks = this.scoreRecentSessions(
|
||||
expandedTokens,
|
||||
options.recentSessions || [],
|
||||
options.mediumTermLimit ?? 0
|
||||
);
|
||||
allChunks.push(...mediumChunks);
|
||||
fusionLog.push(`Medium-term sessions: ${mediumChunks.length} chunks selected`);
|
||||
|
||||
// ── ③ Result Fusion — normalize scores across sources ──
|
||||
this.normalizeScores(allChunks);
|
||||
fusionLog.push(`Total chunks before budget: ${allChunks.length}`);
|
||||
@@ -129,7 +178,10 @@ export class RetrievalOrchestrator {
|
||||
brain: BrainProfile,
|
||||
limit: number,
|
||||
includeRaw: boolean,
|
||||
scopeFolders: string[] = []
|
||||
scopeFolders: string[] = [],
|
||||
queryEmbedding?: number[],
|
||||
embeddingModel?: string,
|
||||
embeddingBlendAlpha?: number,
|
||||
): RetrievalChunk[] {
|
||||
try {
|
||||
const scoped = (file: string) => scopeFolders.length === 0
|
||||
@@ -155,6 +207,34 @@ export class RetrievalOrchestrator {
|
||||
}))
|
||||
);
|
||||
|
||||
// Hybrid blend: when the caller provided a query embedding and an
|
||||
// embedding model, fetch the cached file vectors and add a cosine
|
||||
// similarity term to each score. We normalise TF-IDF scores by the
|
||||
// top observed value so the two terms live on the same scale before
|
||||
// blending. Files without a cached embedding keep their pure TF-IDF
|
||||
// score so adding/missing embeddings doesn't hurt retrieval.
|
||||
if (queryEmbedding && embeddingModel && (embeddingBlendAlpha ?? 0) > 0) {
|
||||
const alpha = Math.max(0, Math.min(1, embeddingBlendAlpha!));
|
||||
const filePaths = indexed.map((d) => d.filePath);
|
||||
const embeddings = getBrainEmbeddings(brain.localBrainPath, filePaths, embeddingModel);
|
||||
if (embeddings.size > 0) {
|
||||
const maxTfidf = scored.reduce((m, s) => s.score > m ? s.score : m, 0) || 1;
|
||||
let hits = 0;
|
||||
for (const s of scored) {
|
||||
const fp = indexed[s.index].filePath;
|
||||
const vec = embeddings.get(fp);
|
||||
if (!vec) continue;
|
||||
const cos = cosineSimilarity(queryEmbedding, vec); // [-1, 1] in theory; positive for typical embedding spaces
|
||||
const tfidfNorm = s.score / maxTfidf;
|
||||
s.score = (1 - alpha) * tfidfNorm + alpha * Math.max(0, cos);
|
||||
hits++;
|
||||
}
|
||||
if (hits > 0) {
|
||||
// Re-sort downstream is handled by the .filter().sort() that follows.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Always consider lesson cards for the top slots even if they didn't crack the raw-score top-`limit`:
|
||||
// they're short, high-signal, and we want them surfaced when relevant. We keep the regular top-`limit`
|
||||
// and additively pull in up to a few lesson cards (deduped by index).
|
||||
@@ -180,12 +260,20 @@ export class RetrievalOrchestrator {
|
||||
// Only the chosen files are actually read off disk (for excerpt extraction).
|
||||
let content = '';
|
||||
try { content = fs.readFileSync(doc.filePath, 'utf8'); } catch { /* deleted just now — skip */ continue; }
|
||||
// Lesson cards: hand back the whole card (they're meant to be short) so the Prevention Checklist
|
||||
// survives; fall back to a generous excerpt for long ones. Regular notes: the usual 400-char excerpt.
|
||||
// Lesson cards: extract just the high-signal sections (Mistake / Root Cause / Fix /
|
||||
// Prevention Checklist) instead of dumping the whole 2500-char card. Old lessons
|
||||
// without those headings fall back to a query-targeted excerpt. Cuts retrieval tokens
|
||||
// by ~70% per lesson without losing the guardrail content.
|
||||
//
|
||||
// Regular notes: pick the best heading-bounded section for the query (markdown
|
||||
// section retrieval) so that long notes don't dump their intro/setup blocks just
|
||||
// because they happen to be in the top 400 chars. Falls back to keyword-window
|
||||
// extraction inside the section, or whole-doc extraction when there are no
|
||||
// headings at all.
|
||||
const excerpt = isLesson
|
||||
? (content.length <= 2500 ? content.trim() : extractBestExcerpt(content, expandedTokens, 1500))
|
||||
: extractBestExcerpt(content, expandedTokens, 400);
|
||||
const cap = isLesson ? 2500 : 400;
|
||||
? extractLessonEssence(content, 1200) || extractBestExcerpt(content, expandedTokens, 1200)
|
||||
: extractBestSection(content, expandedTokens, 600);
|
||||
const cap = isLesson ? 1200 : 600;
|
||||
topResults.push({
|
||||
id: `brain-${s.index}`,
|
||||
source: 'brain-memory' as const,
|
||||
@@ -287,6 +375,70 @@ export class RetrievalOrchestrator {
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// ─── Medium-Term: Recent Sessions ───
|
||||
|
||||
/**
 * Medium-term recall: rank the caller-supplied recent-session summaries
 * against the current query and return at most `limit` of them as chunks.
 *
 * Scoring is a plain token-overlap count (every token of the session text
 * that appears in the expanded query counts once per occurrence) plus a
 * small recency bonus: `(7 - ageDays) / 70` for sessions younger than seven
 * days, i.e. at most +0.1, and nothing for older or undated sessions.
 *
 * NOTE: because only `score > 0` is required, a session younger than seven
 * days is kept even when it shares no token with the query.
 *
 * The scored text is `title + summary` when a summary exists, otherwise
 * `title + firstUserMsg + lastAssistantExcerpt`. The emitted chunk body
 * follows the same preference.
 *
 * @param expandedTokens Expanded query tokens; tokens shorter than 2 chars are ignored.
 * @param sessions       Recent session summaries to consider.
 * @param limit          Maximum number of chunks to return; `<= 0` disables the stage.
 * @returns Chunks tagged `medium-term-memory`, best score first.
 */
private scoreRecentSessions(
    expandedTokens: string[],
    sessions: RecentSessionSummary[],
    limit: number,
): RetrievalChunk[] {
    if (!sessions || sessions.length === 0 || limit <= 0) {
        return [];
    }

    const queryTerms = new Set<string>();
    for (const token of expandedTokens) {
        if (token.length >= 2) queryTerms.add(token);
    }

    const ranked: Array<{ s: RecentSessionSummary; score: number }> = [];
    for (const s of sessions) {
        // A compressed summary is a better matching target than raw head/tail fragments.
        const haystack = s.summary
            ? `${s.title}\n${s.summary}`
            : `${s.title}\n${s.firstUserMsg}\n${s.lastAssistantExcerpt}`;

        let matches = 0;
        for (const tok of tokenize(haystack)) {
            if (queryTerms.has(tok)) matches += 1;
        }

        // Recency bonus: linear decay from +0.1 (now) to 0 (seven days old).
        let recencyBoost = 0;
        if (s.timestamp) {
            const ageDays = Math.max(0, (Date.now() - s.timestamp) / 86400000);
            if (ageDays < 7) recencyBoost = (7 - ageDays) / 70;
        }

        const total = matches + recencyBoost;
        if (total > 0) ranked.push({ s, score: total });
    }

    ranked.sort((left, right) => right.score - left.score);
    const picked = ranked.slice(0, limit);
    if (picked.length === 0) {
        return [];
    }

    return picked.map((entry, idx): RetrievalChunk => {
        const { s, score } = entry;
        const dateStr = s.timestamp ? new Date(s.timestamp).toISOString().slice(0, 10) : '';
        const header = `**${s.title}**${dateStr ? ` (${dateStr})` : ''}`;

        // Fall back to the raw fragments when no summary was produced for the session.
        const parts = s.summary
            ? [header, s.summary]
            : [
                header,
                s.firstUserMsg ? `사용자 요청: ${s.firstUserMsg}` : '',
                s.lastAssistantExcerpt ? `이전 답변 마지막 부분: …${s.lastAssistantExcerpt}` : '',
            ].filter(Boolean);
        const body = parts.join('\n');

        return {
            id: `mtm-${idx}-${s.id}`,
            source: 'medium-term-memory',
            title: s.title || '(untitled session)',
            content: body,
            score,
            tokenEstimate: estimateTokens(body),
            metadata: { category: 'medium-term', lastUpdated: s.timestamp },
        };
    });
}
|
||||
|
||||
// ─── Score Normalization ───
|
||||
|
||||
/**
|
||||
@@ -315,6 +467,7 @@ export class RetrievalOrchestrator {
|
||||
'project-memory': 0.85,
|
||||
'long-term-memory': 0.8,
|
||||
'procedural-memory': 0.95, // Procedural is highly specific
|
||||
'medium-term-memory': 0.78, // recent sessions: useful when the user references "last time / yesterday"
|
||||
'episodic-memory': 0.7,
|
||||
'project-scan': 0.6,
|
||||
'recent-knowledge': 0.75
|
||||
|
||||
@@ -47,6 +47,54 @@ function parseFrontmatterType(content: string): string {
|
||||
return m ? m[1].trim().toLowerCase() : '';
|
||||
}
|
||||
|
||||
/**
 * Return the body of one markdown section of a lesson card.
 *
 * The section starts right after the first match of `headingRe` and runs up
 * to (not including) the next line that begins with 1–6 `#` characters
 * followed by whitespace, or to the end of the text if there is none.
 *
 * @param content   Full markdown text to search.
 * @param headingRe Pattern matching the wanted heading line.
 * @returns The trimmed section body, or `''` when the heading is not found
 *          (or the match carries no index, as with a global pattern).
 */
function extractSection(content: string, headingRe: RegExp): string {
    const headingMatch = content.match(headingRe);
    if (headingMatch === null || headingMatch.index === undefined) {
        return '';
    }

    const bodyStart = headingMatch.index + headingMatch[0].length;
    const rest = content.slice(bodyStart);
    const nextHeadingAt = rest.search(/\n#{1,6}\s/);

    return (nextHeadingAt < 0 ? rest : rest.slice(0, nextHeadingAt)).trim();
}
|
||||
|
||||
/**
 * Reduce a lesson card to its guardrail sections: Mistake / Risk, Root Cause,
 * Fix and Prevention Checklist (English or Korean heading variants, any
 * heading level, case-insensitive). Everything else in the card is dropped.
 *
 * Each section that is found is re-emitted under a canonical `## …` heading,
 * in the fixed order above, separated by blank lines. A section whose whole
 * body is a single `<…>` placeholder is treated as unfilled and skipped.
 *
 * When none of the sections is present the original text is returned instead
 * (trimmed), so cards that predate the template still yield something.
 *
 * Length handling: text longer than `maxLen` is cut at `maxLen` characters,
 * trimmed, and suffixed with `"\n…"`, so a clipped result can be up to two
 * characters longer than `maxLen`.
 *
 * @param content Raw lesson card markdown.
 * @param maxLen  Clip threshold in characters (default 1200).
 * @returns The slimmed card, or `''` for empty input.
 */
export function extractLessonEssence(content: string, maxLen = 1200): string {
    if (!content) return '';

    // Shared clipping rule for both the assembled essence and the raw fallback.
    const clip = (text: string): string => text.slice(0, maxLen).trim() + '\n…';

    const wanted: ReadonlyArray<readonly [string, RegExp]> = [
        ['## Mistake / Risk', /^#{1,6}\s*(?:mistake\s*\/?\s*risk|mistake|risk|실수|문제)\s*$/im],
        ['## Root Cause', /^#{1,6}\s*(?:root\s*cause|근본\s*원인|원인)\s*$/im],
        ['## Fix', /^#{1,6}\s*(?:fix|해결|수정)\s*$/im],
        ['## Prevention Checklist', /^#{1,6}\s*(?:prevention\s*checklist|prevention|체크리스트|예방\s*체크리스트)\s*$/im],
    ];

    const kept: string[] = [];
    for (const [label, pattern] of wanted) {
        const body = extractSection(content, pattern);
        // A body that is nothing but "<placeholder>" was never filled in.
        const isPlaceholder = /^<[^>]+>$/.test(body);
        if (body && !isPlaceholder) {
            kept.push(`${label}\n${body}`);
        }
    }

    if (kept.length === 0) {
        return content.length <= maxLen ? content.trim() : clip(content);
    }

    const assembled = kept.join('\n\n');
    return assembled.length > maxLen ? clip(assembled) : assembled;
}
|
||||
|
||||
/** Extract the "## Prevention Checklist" bullet list from a lesson card, if present. */
|
||||
export function extractPreventionChecklist(content: string): string[] {
|
||||
if (!content) return [];
|
||||
|
||||
@@ -316,6 +316,121 @@ export function scoreTfIdfPreTokenized(
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Split markdown content into heading-bounded sections.
 *
 * Every ATX heading line (1–6 `#` characters followed by whitespace and
 * text) starts a new section, regardless of its depth: a section's `body`
 * is the text up to the next heading of *any* level. `heading` holds the
 * heading line itself (trimmed, level preserved).
 *
 * A leading front-matter block (`--- … ---`, optionally preceded by a BOM)
 * is dropped because it is not query-relevant. Text above the first heading
 * is returned as a preamble section with an empty `heading`.
 *
 * A document with no headings returns one synthetic section
 * `{ heading: '', body: content }` so callers can treat the result uniformly.
 * Empty input returns `[]`.
 *
 * Why this exists: retrieval was returning whole-file excerpts that were
 * often the file's intro/setup, not the part that matched the query.
 * Section-level retrieval lets callers pick the relevant heading directly.
 */
export interface MarkdownSection {
    heading: string;
    body: string;
}

export function splitMarkdownSections(content: string): MarkdownSection[] {
    if (!content) return [];

    // Strip front-matter. The opening delimiter may follow a byte-order mark.
    let text = content;
    const opening = /^\uFEFF?---\s*\n/.exec(text);
    if (opening) {
        // Start looking for the closing delimiter at the newline that ends the
        // opening line, so an empty block ("---\n---") is closed correctly
        // instead of running on to some later horizontal rule.
        const closeAt = text.indexOf('\n---', opening[0].length - 1);
        if (closeAt >= 0) text = text.slice(closeAt + 4).replace(/^\s*\n/, '');
    }

    const lines = text.split('\n');
    const headingLines: number[] = [];
    for (let i = 0; i < lines.length; i++) {
        if (/^#{1,6}\s+\S/.test(lines[i])) headingLines.push(i);
    }

    if (headingLines.length === 0) {
        return [{ heading: '', body: text.trim() }];
    }

    const sections: MarkdownSection[] = [];

    // Capture any leading content above the first heading as a "preamble" section.
    if (headingLines[0] > 0) {
        const preamble = lines.slice(0, headingLines[0]).join('\n').trim();
        if (preamble) sections.push({ heading: '', body: preamble });
    }

    for (let i = 0; i < headingLines.length; i++) {
        const start = headingLines[i];
        const end = i + 1 < headingLines.length ? headingLines[i + 1] : lines.length;
        sections.push({
            heading: lines[start].trim(),
            body: lines.slice(start + 1, end).join('\n').trim(),
        });
    }
    return sections;
}
|
||||
|
||||
/**
 * Pick the best heading-bounded section of a markdown document for a query,
 * then fall back to keyword-window extraction inside that section if the
 * section itself is still too long.
 *
 * Strategy:
 *   1. Split into sections by heading (`splitMarkdownSections`).
 *   2. Score each section by how many of its tokens appear in the expanded
 *      query; heading hits are weighted 3× so "## Foo" beats a body mention
 *      of "foo". Sections whose heading + body total fewer than 24 chars are
 *      skipped — they are usually empty placeholder headings.
 *   3. If the winning section's body fits, return heading + body as-is.
 *   4. Otherwise run `extractBestExcerpt` inside the body and prepend the
 *      heading.
 *
 * Fallbacks:
 *   - A document without headings (a single synthetic section) goes straight
 *     to `extractBestExcerpt`.
 *   - When no eligible section contains any query term, a whole-document
 *     `extractBestExcerpt` is returned. A section must score above zero to
 *     be picked; previously the first eligible section won even with zero
 *     hits, which surfaced intros/preambles for unrelated queries.
 *
 * @param content     Markdown document text.
 * @param queryTokens Query tokens (expanded again internally via `expandQuery`).
 * @param maxLength   Upper bound on the returned excerpt length (default 600);
 *                    section-based results are sliced to this length.
 * @returns The selected excerpt.
 */
export function extractBestSection(
    content: string,
    queryTokens: string[],
    maxLength = 600
): string {
    const sections = splitMarkdownSections(content);
    if (sections.length === 0) return content.slice(0, maxLength);
    if (sections.length === 1 && !sections[0].heading) {
        return extractBestExcerpt(sections[0].body || content, queryTokens, maxLength);
    }

    const expandedSet = new Set(expandQuery(queryTokens));
    const countHits = (text: string): number => {
        if (!text) return 0;
        let hits = 0;
        for (const t of tokenize(text)) if (expandedSet.has(t)) hits++;
        return hits;
    };

    // Start at 0 so only sections with at least one query hit can win.
    let bestIdx = -1;
    let bestScore = 0;
    for (let i = 0; i < sections.length; i++) {
        const s = sections[i];
        if ((s.heading.length + s.body.length) < 24) continue;
        const score = countHits(s.heading) * 3 + countHits(s.body);
        if (score > bestScore) {
            bestIdx = i;
            bestScore = score;
        }
    }

    if (bestIdx < 0) {
        // No section contained any query terms — fall back to a whole-doc excerpt.
        return extractBestExcerpt(content, queryTokens, maxLength);
    }

    const picked = sections[bestIdx];
    const headingLine = picked.heading ? `${picked.heading}\n` : '';
    // Leave the body at least 64 chars even under a very long heading; the
    // final slice still enforces maxLength.
    const room = Math.max(64, maxLength - headingLine.length);
    const bodyText = picked.body.length <= room
        ? picked.body
        : extractBestExcerpt(picked.body, queryTokens, room);
    return (headingLine + bodyText).slice(0, maxLength).trim();
}
|
||||
|
||||
/**
|
||||
* 텍스트에서 가장 관련성 높은 구간(excerpt)을 추출합니다.
|
||||
* 단순 paragraph 단위가 아니라, 키워드 밀도가 높은 윈도우를 찾습니다.
|
||||
|
||||
+10
-9
@@ -7,15 +7,16 @@
|
||||
* ============================================================
|
||||
*/
|
||||
|
||||
export type RetrievalSource =
|
||||
| 'brain-trace' // Second Brain Trace
|
||||
| 'brain-memory' // findRelevantBrainMemory (legacy)
|
||||
| 'long-term-memory' // Long-Term Memory
|
||||
| 'project-memory' // Project Memory
|
||||
| 'procedural-memory' // Procedural Memory
|
||||
| 'episodic-memory' // Episodic Memory
|
||||
| 'project-scan' // Local Project Path scan
|
||||
| 'recent-knowledge'; // Recent Project Knowledge record
|
||||
export type RetrievalSource =
|
||||
| 'brain-trace' // Second Brain Trace
|
||||
| 'brain-memory' // findRelevantBrainMemory (legacy)
|
||||
| 'long-term-memory' // Long-Term Memory
|
||||
| 'medium-term-memory' // Recent session summaries (memoryMediumTermSessions)
|
||||
| 'project-memory' // Project Memory
|
||||
| 'procedural-memory' // Procedural Memory
|
||||
| 'episodic-memory' // Episodic Memory
|
||||
| 'project-scan' // Local Project Path scan
|
||||
| 'recent-knowledge'; // Recent Project Knowledge record
|
||||
|
||||
export type ConflictSeverity = 'NONE' | 'LOW' | 'MEDIUM' | 'HIGH';
|
||||
|
||||
|
||||
+69
-2
@@ -64,6 +64,7 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
|
||||
static readonly activeChronicleProjectStateKey = 'g1nation.activeChronicleProjectId';
|
||||
static readonly lastAutoChronicleSignatureStateKey = 'g1nation.lastAutoChronicleSignature';
|
||||
_view?: vscode.WebviewView;
|
||||
_panel?: vscode.WebviewPanel;
|
||||
public brainEnabled = true;
|
||||
_currentSessionBrainId: string | null = null;
|
||||
_currentNegativePrompt: string = '';
|
||||
@@ -93,6 +94,36 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
|
||||
context: vscode.WebviewViewResolveContext,
|
||||
_token: vscode.CancellationToken,
|
||||
) {
|
||||
this._initView(webviewView);
|
||||
}
|
||||
|
||||
/**
 * Show the chat in a standalone editor panel (third column unless told
 * otherwise).
 *
 * If a panel already exists it is revealed in `column` and returned.
 * Otherwise a new panel is created, wrapped with `wrapPanelAsView` and fed
 * through `_initView`, so the provider keeps talking to `this._view`.
 * When the panel is disposed, `_panel` and `_view` are cleared — but only if
 * they still point at this panel / its adapter.
 *
 * @param column Editor column to open or reveal the panel in.
 * @returns The existing or newly created panel.
 */
public openAsPanel(column: vscode.ViewColumn = vscode.ViewColumn.Three): vscode.WebviewPanel {
    const existing = this._panel;
    if (existing) {
        existing.reveal(column);
        return existing;
    }

    const created = vscode.window.createWebviewPanel(
        SidebarChatProvider.viewType,
        'Astra Chat',
        column,
        { enableScripts: true, localResourceRoots: [this._extensionUri], retainContextWhenHidden: true }
    );
    this._panel = created;

    const viewAdapter = wrapPanelAsView(created);
    created.onDidDispose(() => {
        // Only drop references that still belong to this panel.
        if (this._panel === created) {
            this._panel = undefined;
        }
        if (this._view === viewAdapter) {
            this._view = undefined;
        }
    });

    this._initView(viewAdapter);
    return created;
}
|
||||
|
||||
private _initView(webviewView: vscode.WebviewView) {
|
||||
this._view = webviewView;
|
||||
|
||||
webviewView.webview.options = {
|
||||
@@ -108,8 +139,8 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
|
||||
// 5초 이내에 이미 갱신했으면 건너뜀
|
||||
if (now - _lastVisibilityRefresh < 5000) return;
|
||||
_lastVisibilityRefresh = now;
|
||||
|
||||
logInfo('Sidebar became visible, restoring state...');
|
||||
|
||||
logInfo('Astra view became visible, restoring state...');
|
||||
void this._sendModels();
|
||||
void this._sendBrainProfiles();
|
||||
void this._sendAgentsList();
|
||||
@@ -2043,3 +2074,39 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn
|
||||
.replace('__SCRIPT_URI__', scriptUri);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Present a {@link vscode.WebviewPanel} through the
 * {@link vscode.WebviewView} surface, so code written against the view API
 * can be mounted in an editor column.
 *
 * `visible`, `title`, `webview`, `viewType` and `onDidDispose` are forwarded
 * to the panel; `show()` reveals the panel in its current column (falling
 * back to column three). `description` and `badge` are plain placeholders
 * initialised to `undefined`.
 *
 * `onDidChangeVisibility` is synthesised from `onDidChangeViewState`: the
 * last seen `panel.visible` value is remembered and the event fires only
 * when that flag actually flips. The backing emitter is disposed together
 * with the panel.
 *
 * @param panel Panel to adapt.
 * @returns An object usable wherever a `WebviewView` is expected.
 */
export function wrapPanelAsView(panel: vscode.WebviewPanel): vscode.WebviewView {
    const visibilityChanged = new vscode.EventEmitter<void>();
    let wasVisible = panel.visible;

    panel.onDidChangeViewState(() => {
        if (panel.visible === wasVisible) {
            return;
        }
        wasVisible = panel.visible;
        visibilityChanged.fire();
    });
    panel.onDidDispose(() => visibilityChanged.dispose());

    // Built as a loose object and cast on return.
    const view: any = {
        viewType: panel.viewType,
        webview: panel.webview,
        get visible() {
            return panel.visible;
        },
        get title() {
            return panel.title;
        },
        set title(v: string | undefined) {
            panel.title = v ?? '';
        },
        description: undefined as string | undefined,
        badge: undefined as vscode.ViewBadge | undefined,
        onDidChangeVisibility: visibilityChanged.event,
        onDidDispose: panel.onDidDispose,
        show(preserveFocus?: boolean) {
            panel.reveal(panel.viewColumn ?? vscode.ViewColumn.Three, preserveFocus);
        },
    };
    return view as vscode.WebviewView;
}
|
||||
|
||||
+9
-9
@@ -61,18 +61,18 @@ export function resolveEngine(baseUrl: string): EngineKind {
|
||||
return 'ollama';
|
||||
}
|
||||
|
||||
export function buildApiUrl(baseUrl: string, engine: EngineKind, endpoint: 'models' | 'chat'): string {
|
||||
export function buildApiUrl(baseUrl: string, engine: EngineKind, endpoint: 'models' | 'chat' | 'embeddings'): string {
|
||||
const normalized = normalizeBaseUrl(baseUrl);
|
||||
if (engine === 'lmstudio') {
|
||||
if (normalized.endsWith('/v1')) {
|
||||
return endpoint === 'models' ? `${normalized}/models` : `${normalized}/chat/completions`;
|
||||
}
|
||||
return endpoint === 'models' ? `${normalized}/v1/models` : `${normalized}/v1/chat/completions`;
|
||||
const root = normalized.endsWith('/v1') ? normalized : `${normalized}/v1`;
|
||||
if (endpoint === 'models') return `${root}/models`;
|
||||
if (endpoint === 'embeddings') return `${root}/embeddings`;
|
||||
return `${root}/chat/completions`;
|
||||
}
|
||||
if (normalized.endsWith('/api')) {
|
||||
return endpoint === 'models' ? `${normalized}/tags` : `${normalized}/chat`;
|
||||
}
|
||||
return endpoint === 'models' ? `${normalized}/api/tags` : `${normalized}/api/chat`;
|
||||
const apiRoot = normalized.endsWith('/api') ? normalized : `${normalized}/api`;
|
||||
if (endpoint === 'models') return `${apiRoot}/tags`;
|
||||
if (endpoint === 'embeddings') return `${apiRoot}/embed`;
|
||||
return `${apiRoot}/chat`;
|
||||
}
|
||||
|
||||
export function summarizeText(text: string, maxLength: number = 400): string {
|
||||
|
||||
Reference in New Issue
Block a user