From 089abf22db413ddb856cef309d8007a8aab91157 Mon Sep 17 00:00:00 2001 From: g1nation Date: Wed, 13 May 2026 19:23:57 +0900 Subject: [PATCH] refactor: optimize core engine and retrieval logic for v2.80.43 --- media/sidebar.js | 12 + src/agent.ts | 402 +++++++++++++++++- src/config.ts | 17 +- src/core/queue.ts | 22 +- src/core/telemetry.ts | 129 ++++++ src/extension.ts | 23 +- .../approval/approvalPanelProvider.ts | 36 +- .../settings/settingsPanelProvider.ts | 46 +- src/retrieval/brainIndex.ts | 100 ++++- src/retrieval/contextBudget.ts | 1 + src/retrieval/embeddings.ts | 167 ++++++++ src/retrieval/index.ts | 173 +++++++- src/retrieval/lessonHelpers.ts | 48 +++ src/retrieval/scoring.ts | 115 +++++ src/retrieval/types.ts | 19 +- src/sidebarProvider.ts | 71 +++- src/utils.ts | 18 +- 17 files changed, 1311 insertions(+), 88 deletions(-) create mode 100644 src/core/telemetry.ts create mode 100644 src/retrieval/embeddings.ts diff --git a/media/sidebar.js b/media/sidebar.js index 88a548b..0ed2830 100644 --- a/media/sidebar.js +++ b/media/sidebar.js @@ -518,6 +518,18 @@ chat.scrollTop = chat.scrollHeight; } break; + case 'streamReplace': + // Progressive answering: the backend streamed raw tokens + // live (including hidden reasoning, pre-sanitize text); + // once everything is finalized it sends the cleaned full + // text via streamReplace so the bubble ends up correct + // regardless of what slipped through during streaming. + if (streamBody) { + streamBody._parent._raw = String(msg.value ?? 
''); + streamBody.innerHTML = fmt(streamBody._parent._raw); + chat.scrollTop = chat.scrollHeight; + } + break; case 'streamEnd': if (streamBody) { streamBody.classList.remove('stream-active'); diff --git a/src/agent.ts b/src/agent.ts index 6c6847b..5b3bc8a 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -30,6 +30,7 @@ import { StatusBarManager, AgentStatus } from './core/statusBar'; import { lockManager } from './core/lock'; import { actionQueue } from './core/queue'; import { ConflictResolver } from './core/conflict'; +import { recordTelemetry } from './core/telemetry'; import { buildSecondBrainTrace, enforceProjectClaimPolicyInAnswer, @@ -40,6 +41,8 @@ import { import { MemoryManager } from './memory'; import { RetrievalOrchestrator } from './retrieval'; import { buildLessonChecklistBlock, isQaRegressionFeedback, findUnaddressedChecklistItems } from './retrieval/lessonHelpers'; +import { embedQuery, embedTexts } from './retrieval/embeddings'; +import { backfillBrainEmbeddings } from './retrieval/brainIndex'; import { resolveScopeForAgent } from './skills/agentKnowledgeMap'; import { extractVisibleFinal, @@ -117,6 +120,51 @@ const AGENT_PROMPTS: Record = { 3. Deliver a logical, consistent, and polished response.` }; +/** + * Compact recent chat sessions for medium-term memory retrieval. + * + * Returns up to `limit + 5` recently-touched sessions (excluding the active + * one) as small summaries: title + first user message + tail of the last + * assistant message. The retrieval orchestrator then scores these against the + * current query and selects the top `limit` matches inside the shared budget. + * + * We pull a few more than `limit` so TF-IDF scoring has room to rerank — the + * persisted list is timestamp-ordered, which isn't the same as topical fit. 
+ */ +function compactRecentSessions( + rawSessions: any[], + activeSessionId: string | null, + limit: number, +): Array<{ id: string; title: string; firstUserMsg: string; lastAssistantExcerpt: string; summary?: string; timestamp: number }> { + if (!Array.isArray(rawSessions) || rawSessions.length === 0 || limit <= 0) return []; + const pool = rawSessions.length > limit + 5 ? limit + 5 : rawSessions.length; + const out: Array<{ id: string; title: string; firstUserMsg: string; lastAssistantExcerpt: string; summary?: string; timestamp: number }> = []; + for (let i = 0; i < rawSessions.length && out.length < pool; i++) { + const s = rawSessions[i]; + if (!s || typeof s !== 'object') continue; + const id = String(s.id ?? ''); + if (!id || id === activeSessionId) continue; + const history: any[] = Array.isArray(s.history) ? s.history : []; + if (history.length === 0) continue; + const firstUser = history.find((m) => m?.role === 'user'); + const lastAssistant = [...history].reverse().find((m) => m?.role === 'assistant'); + const firstUserMsg = String(firstUser?.content ?? '').replace(/\s+/g, ' ').trim().slice(0, 200); + const lastTxt = String(lastAssistant?.content ?? '').replace(/\s+/g, ' ').trim(); + const lastAssistantExcerpt = lastTxt.length <= 200 ? lastTxt : lastTxt.slice(-200); + const summary = typeof s.summary === 'string' ? s.summary.trim().slice(0, 600) : undefined; + if (!firstUserMsg && !lastAssistantExcerpt && !summary) continue; + out.push({ + id, + title: String(s.title ?? '').trim() || firstUserMsg.slice(0, 50), + firstUserMsg, + lastAssistantExcerpt, + summary, + timestamp: typeof s.timestamp === 'number' ? s.timestamp : 0, + }); + } + return out; +} + // Local-path detectors used to decide whether a user prompt refers to a file/dir on disk. // POSIX: /Volumes/, /Users/, /home/, /opt/, ... or ~/ — backtick excluded (markdown code spans). 
const POSIX_ABS_PATH_SRC = "(?:\\/(?:Volumes|Users|home|opt|srv|mnt|data|workspace)\\/|~\\/)[^\\s`\"'<>|*?]+"; @@ -328,6 +376,10 @@ export class AgentExecutor { if (!this.webview) return; + // Telemetry: wall-clock start of the user-visible turn. Only meaningful + // at loopDepth===0 (action-loop recursions roll up into the same turn). + const turnStartMs = loopDepth === 0 ? Date.now() : 0; + try { // 0. Safety Check: Rollback any dangling transaction from previous runs if (this.transactionManager.isActive()) { @@ -471,9 +523,19 @@ export class AgentExecutor { const secondBrainTraceCtx = secondBrainTrace ? `\n\n${renderSecondBrainTraceContext(secondBrainTrace)}` : ''; + const retrievalStartMs = Date.now(); const memoryCtx = isCasualConversation ? '' - : this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile); + : await this.buildMemoryContext(prompt || '', activeBrain, options.agentSkillFile); + if (loopDepth === 0 && !isCasualConversation && this._lastRetrievalInfo) { + recordTelemetry({ + kind: 'retrieval', + durationMs: Date.now() - retrievalStartMs, + brainFiles: this._lastRetrievalInfo.usedBrainFiles.length, + memoryLayers: this._lastRetrievalInfo.usedMemoryLayers, + note: `chunks=${this._lastRetrievalInfo.selectedChunks}/${this._lastRetrievalInfo.totalChunks} lessons=${this._lastRetrievalInfo.lessonFiles.length}`, + }); + } const knowledgeContextForPrompt = isCasualConversation ? '' : `${brainContext}${brainInventoryCtx}`; @@ -677,6 +739,16 @@ export class AgentExecutor { this.options.onStreamLifecycle?.start(); } + // Progressive answering: live-stream tokens to the webview during + // the user-visible first turn (loopDepth === 0). The bubble fills + // as the model generates instead of dropping all at once at the end, + // and any auto-continuation rounds keep posting deltas through the + // same channel. 
Post-processing (reasoning strip / sanitize / + // policy enforcement) emits a final `streamReplace` so the bubble + // ends up matching the cleaned answer regardless of what slipped + // through live. + const postLiveDeltas = loopDepth === 0; + if (useLmStudioSdk) { apiUrl = `${ollamaUrl} (sdk)`; logInfo('Streaming chat via LM Studio SDK.', { model: actualModel }); @@ -691,7 +763,10 @@ export class AgentExecutor { }); for await (const { token, stopReason } of stream) { if (this.isStaleRun(runId)) return; - if (token) aiResponseText += token; + if (token) { + aiResponseText += token; + if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token }); + } if (stopReason) finishStopReason = stopReason; } } catch (err: any) { @@ -747,6 +822,7 @@ export class AgentExecutor { const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || ''; if (token) { aiResponseText += token; + if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token }); } const fr = engine === 'lmstudio' ? json.choices?.[0]?.finish_reason @@ -778,6 +854,7 @@ export class AgentExecutor { const token = engine === 'lmstudio' ? json.choices?.[0]?.delta?.content || '' : json.message?.content || json.response || ''; if (token) { aiResponseText += token; + if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token }); } const fr = engine === 'lmstudio' ? 
json.choices?.[0]?.finish_reason @@ -829,7 +906,10 @@ export class AgentExecutor { let retryText = ''; for await (const { token, stopReason } of retryStream) { if (this.isStaleRun(runId)) return; - if (token) retryText += token; + if (token) { + retryText += token; + if (postLiveDeltas) this.webview.postMessage({ type: 'streamChunk', value: token }); + } if (stopReason) finishStopReason = stopReason; } if (retryText.trim()) { @@ -922,6 +1002,7 @@ export class AgentExecutor { && !this.isStaleRun(runId) ) { continuationCount++; + const continuationStartMs = Date.now(); this.webview.postMessage({ type: 'autoContinue', value: `답변이 길어 이어서 정리하는 중입니다... (${continuationCount}/${config.maxAutoContinuations})` }); try { const contMsgs: ChatMessage[] = [ @@ -929,11 +1010,24 @@ export class AgentExecutor { { role: 'user', content: buildContinuationUserPrompt(originalUserPrompt, cleaned.visible) }, ]; lastMaxOutputTokens = computeOutputBudget(estimateMessagesTokens(contMsgs), ctxLimits).maxOutputTokens; - const cr = await this.callNonStreaming({ - baseUrl: ollamaUrl, modelName: actualModel, engine, messages: contMsgs, - temperature, maxTokens: lastMaxOutputTokens, contextLength: ctxLimits.contextLength, - signal: this.abortController?.signal, + // Stream the continuation through the same channel as the main turn so + // the user sees the answer keep growing instead of freezing for 10–30s + // while we silently call non-streaming. The trailing streamReplace + // (after sanitize / merge) corrects any overlap the model re-emits. 
+ const cr = await this.streamChatOnce({ + runId, useLmStudioSdk, engine, ollamaUrl, modelName: actualModel, + messages: contMsgs, + temperature, + maxTokens: lastMaxOutputTokens, + contextLength: ctxLimits.contextLength, + contextOverflowPolicy: config.contextOverflowPolicy, + signal: this.abortController!.signal, + postLiveDeltas, }); + if (cr.aborted) { + logInfo('Auto-continuation aborted mid-stream.', { model: actualModel, round: continuationCount }); + break; + } finishStopReason = cr.stopReason; const ccl = extractVisibleFinal(cr.text); if (!ccl.visible.trim()) { @@ -944,6 +1038,15 @@ export class AgentExecutor { cleaned = { ...cleaned, visible: mergeContinuationParts(cleaned.visible, ccl.visible), wasThoughtOnly: false }; lastOutputTokens = estimateTokens(ccl.visible); logInfo('Auto-continued the answer.', { model: actualModel, round: continuationCount, addedChars: ccl.visible.length, totalChars: cleaned.visible.length, contStopReason: cr.stopReason, contMaxTokens: lastMaxOutputTokens }); + recordTelemetry({ + kind: 'continuation', + durationMs: Date.now() - continuationStartMs, + model: actualModel, engine, + outputTokens: lastOutputTokens, + round: continuationCount, + stopReason: cr.stopReason, + note: `addedChars=${ccl.visible.length} mergedAdd=${cleaned.visible.length - before.length}`, + }); // Guard against a continuation that adds (almost) nothing new after dedup — stop instead of spinning. if (cleaned.visible.length - before.length < 20) { logInfo('Continuation added negligible new text — stopping.', { model: actualModel, round: continuationCount }); @@ -1099,7 +1202,32 @@ export class AgentExecutor { value: { ...this._lastRetrievalInfo, hasAgentSelected: !!options.agentSkillFile, unaddressedChecklist }, }); } - this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent }); + // Progressive answering: the bubble was filled live with raw tokens + // during streaming (and during any auto-continuation rounds). 
Now + // that we have the cleaned + merged + policy-enforced text, swap the + // bubble's content for the final version so the user sees the + // correct answer regardless of what slipped through live — + // hidden reasoning, mid-stream artifacts, continuation-overlap re- + // emits, truncation notice. Action-loop turns (loopDepth > 0) still + // append via streamChunk because the bubble has multiple action + // segments and we don't have a single "final" to replace with. + if (loopDepth === 0) { + this.webview.postMessage({ type: 'streamReplace', value: finalAssistantContent }); + recordTelemetry({ + kind: 'turn', + durationMs: Date.now() - turnStartMs, + model: actualModel, engine, + inputTokens, + outputTokens, + contextLength: ctxLimits.contextLength, + stopReason: finishStopReason, + brainFiles: this._lastRetrievalInfo?.usedBrainFiles.length ?? 0, + memoryLayers: this._lastRetrievalInfo?.usedMemoryLayers ?? [], + note: `continuations=${continuationCount} historyDropped=${reqMessages.length - budgetedHistory.length}`, + }); + } else { + this.webview.postMessage({ type: 'streamChunk', value: finalAssistantContent }); + } } catch (error: any) { this.statusBarManager.updateStatus(AgentStatus.Error, error.message); @@ -2309,7 +2437,7 @@ export class AgentExecutor { }); } - private buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): string { + private async buildMemoryContext(currentPrompt: string, activeBrain: BrainProfile, agentSkillFile?: string): Promise { const config = getConfig(); this._lastRetrievalInfo = null; this._lastLessonContents = []; @@ -2331,6 +2459,44 @@ export class AgentExecutor { // keeping the legacy behavior intact. const scope = resolveScopeForAgent(agentSkillFile, activeBrain.localBrainPath); + // Scale retrieval/memory budget with the configured context window so + // that raising g1nation.contextLength actually gives the RAG pipeline + // more room. 
At 32K context we keep the legacy 8K total (≈3.2K + // retrieval); at 230K we allocate ~57K total (≈23K retrieval). Capped + // at 80K so scoring stays fast on huge contexts. + const scaledTotalBudget = Math.min( + 80000, + Math.max(8000, Math.floor(config.contextLength * 0.25)) + ); + + // Pull recent session summaries for the medium-term layer. We read + // from the sidebar's persisted store directly (same key it writes to) + // to avoid threading another callback through the agent constructor. + const rawSessions = this.context.globalState.get('chat_sessions', []) || []; + const recentSessions = compactRecentSessions( + rawSessions, + this.currentTaskId, + Math.max(0, config.memoryMediumTermSessions ?? 0) + ); + + // Hybrid retrieval (optional): when the user has configured an + // embedding model, fetch a query embedding so searchBrainFiles can + // blend cosine similarity with TF-IDF. Time-bounded — if the + // embedding endpoint is slow or down, we fall through with no + // embedding and the retriever stays in pure-TF-IDF mode. + let queryEmbedding: number[] | undefined; + if (config.embeddingModel) { + const EMBED_QUERY_TIMEOUT_MS = 4000; + try { + queryEmbedding = await Promise.race([ + embedQuery(currentPrompt, { baseUrl: config.ollamaUrl, model: config.embeddingModel }), + new Promise((resolve) => setTimeout(() => resolve(undefined), EMBED_QUERY_TIMEOUT_MS)), + ]); + } catch { + queryEmbedding = undefined; + } + } + // Use the Unified RAG Pipeline const result = this.retrievalOrchestrator.retrieve(currentPrompt, { brain: activeBrain, @@ -2338,13 +2504,36 @@ export class AgentExecutor { workspacePath, chatHistory: visibleHistory, contextBudget: { - totalBudget: 8000, + totalBudget: scaledTotalBudget, retrievalRatio: 0.4 }, brainFileLimit: config.memoryLongTermFiles, - scopeFolders: scope.folders + scopeFolders: scope.folders, + recentSessions, + mediumTermLimit: config.memoryMediumTermSessions ?? 
0, + queryEmbedding, + embeddingModel: config.embeddingModel || undefined, + embeddingBlendAlpha: config.embeddingBlendAlpha, }); + // Fire-and-forget background embedding for the files we just scored. + // Embeds only files that lack a vector for the current model — so + // steady-state turns do no embedding work. The next turn benefits. + if (config.embeddingModel) { + const scoredFilePaths = result.selectedChunks + .filter((c) => c.source === 'brain-memory' && c.metadata.filePath) + .map((c) => c.metadata.filePath!) + .filter((p, i, arr) => arr.indexOf(p) === i); + if (scoredFilePaths.length > 0) { + void backfillBrainEmbeddings( + activeBrain.localBrainPath, + scoredFilePaths, + config.embeddingModel, + (texts) => embedTexts(texts, { baseUrl: config.ollamaUrl, model: config.embeddingModel }), + ); + } + } + // Stash what actually fed this turn so handlePrompt can show it under the answer. const brainRoot = activeBrain.localBrainPath; const rel = (p?: string) => (p ? (path.relative(brainRoot, p) || p) : ''); @@ -2406,11 +2595,74 @@ export class AgentExecutor { workspacePath ); logInfo('Memory extraction completed for session end.', { taskId: this.currentTaskId }); + recordTelemetry({ + kind: 'session-end', + note: `taskId=${this.currentTaskId} messages=${this.chatHistory.filter((m) => !m.internal).length}`, + }); + // Fire-and-forget LLM compression: turns the raw transcript into a + // 2–3 sentence summary that medium-term retrieval can use instead + // of just "first user msg + last assistant 200 chars". Cheap call + // (~256 output tokens), runs in the background so it never blocks + // the next chat turn. + void this.compressSessionSummary(this.currentTaskId, this.chatHistory.slice()); } catch (error: any) { logError('Memory extraction failed on session end.', { error: error?.message || String(error) }); } } + /** + * Compress a finished session into a short summary and persist it to the + * session record. 
The summary is later read by `compactRecentSessions` so + * the medium-term memory layer carries a real recap instead of a fragment. + * + * Skips sessions with fewer than 3 visible messages — they're typically + * single-question pings where the raw first message is already a good + * summary. Failures are logged and swallowed: a missing summary just + * falls back to the legacy "first user msg" representation. + */ + private async compressSessionSummary(taskId: string, history: ChatMessage[]): Promise { + const visible = history.filter((m) => !m.internal && (m.role === 'user' || m.role === 'assistant')); + if (visible.length < 3) return; + const cfg = getConfig(); + const transcript = visible + .map((m) => `${m.role.toUpperCase()}: ${String(m.content).replace(/\s+/g, ' ').slice(0, 400)}`) + .join('\n\n'); + const messages: ChatMessage[] = [ + { + role: 'system', + content: [ + 'You compress chat transcripts into a 2-3 sentence summary.', + 'Capture: (1) the user\'s topic or task, (2) the main decision or answer reached, (3) any open issue.', + 'Reply in the user\'s primary language (mirror Korean ↔ English exactly as in the transcript).', + 'Reply with ONLY the summary text. 
No headers, no quotes, no preamble.', + ].join(' '), + internal: true, + }, + { role: 'user', content: `[TRANSCRIPT]\n${transcript}\n[END]` }, + ]; + try { + const result = await this.callNonStreaming({ + baseUrl: cfg.ollamaUrl, + modelName: cfg.defaultModel, + engine: resolveEngine(cfg.ollamaUrl), + messages, + temperature: 0.3, + maxTokens: 256, + contextLength: cfg.contextLength, + }); + const summary = (result.text || '').trim().replace(/^["'`]+|["'`]+$/g, ''); + if (!summary || summary.length < 12) return; + const sessions = this.context.globalState.get('chat_sessions', []) || []; + const idx = sessions.findIndex((s) => String(s?.id) === String(taskId)); + if (idx < 0) return; + sessions[idx].summary = summary; + await this.context.globalState.update('chat_sessions', sessions); + logInfo('Session summary stored for medium-term recall.', { taskId, length: summary.length }); + } catch (e: any) { + logError('Session summary compression failed.', { taskId, error: e?.message ?? String(e) }); + } + } + private async createStreamingRequest(params: { baseUrl: string; modelName: string; @@ -2568,6 +2820,134 @@ export class AgentExecutor { } } + /** + * Single streaming call used by progressive answering (live-delta main + * stream + auto-continuation rounds). Mirrors the main streaming block in + * handlePrompt but without the empty-stream recovery / non-streaming + * fallback machinery — those only matter for the very first generation. + * + * When `postLiveDeltas` is true, every token is also forwarded to the + * webview as a `streamChunk`, giving the user a real-time view of the + * answer (and of continuation rounds) instead of one big drop at the end. + * + * Returns the accumulated text and the final stop reason. Aborts and + * stale runs surface as `aborted: true` and an empty/partial text — the + * caller decides what to do with that. 
+ */ + private async streamChatOnce(params: { + runId: number; + useLmStudioSdk: boolean; + engine: 'lmstudio' | 'ollama'; + ollamaUrl: string; + modelName: string; + messages: ChatMessage[]; + temperature: number; + maxTokens: number; + contextLength: number; + contextOverflowPolicy: 'stopAtLimit' | 'truncateMiddle' | 'rollingWindow'; + signal: AbortSignal; + postLiveDeltas: boolean; + }): Promise<{ text: string; stopReason?: string; aborted: boolean }> { + let accumulated = ''; + let finishStopReason: string | undefined; + const post = (token: string) => { + if (params.postLiveDeltas && token) { + this.webview?.postMessage({ type: 'streamChunk', value: token }); + } + }; + + if (params.useLmStudioSdk) { + try { + const stream = this.options.lmStudioStreamer!.stream({ + modelName: params.modelName, + messages: params.messages.map((m) => ({ role: m.role, content: m.content })), + temperature: params.temperature, + maxTokens: params.maxTokens, + contextOverflowPolicy: params.contextOverflowPolicy, + signal: params.signal, + }); + for await (const { token, stopReason } of stream) { + if (this.isStaleRun(params.runId)) { + return { text: accumulated, stopReason: finishStopReason, aborted: true }; + } + if (token) { + accumulated += token; + post(token); + } + if (stopReason) finishStopReason = stopReason; + } + } catch (err: any) { + if (err?.name === 'AbortError' || params.signal.aborted) { + return { text: accumulated, stopReason: finishStopReason, aborted: true }; + } + const msg = err?.message ?? 
String(err); + if (/context\s*length|contextlengthreached|exceed|too\s*long/i.test(msg)) { + finishStopReason = 'contextLengthReached'; + } + logError('streamChatOnce SDK path failed.', { engine: params.engine, error: msg }); + throw err; + } + return { text: accumulated, stopReason: finishStopReason, aborted: false }; + } + + const request = await this.createStreamingRequest({ + baseUrl: params.ollamaUrl, + modelName: params.modelName, + reqMessages: params.messages, + temperature: params.temperature, + maxTokens: params.maxTokens, + contextLength: params.contextLength, + }); + const reader = request.response.body?.getReader(); + if (!reader) throw new Error('Response body is not readable.'); + const decoder = new TextDecoder(); + let buffer = ''; + const consumeJsonLine = (line: string) => { + const trimmed = line.trim(); + if (!trimmed || trimmed === 'data: [DONE]') return; + try { + const raw = trimmed.startsWith('data: ') ? trimmed.slice(6) : trimmed; + const json = JSON.parse(raw); + const token = params.engine === 'lmstudio' + ? json.choices?.[0]?.delta?.content || '' + : json.message?.content || json.response || ''; + if (token) { + accumulated += token; + post(token); + } + const fr = params.engine === 'lmstudio' + ? json.choices?.[0]?.finish_reason + : (json.done_reason ?? (json.done === true ? 'stop' : undefined)); + if (fr) finishStopReason = fr; + } catch (e: any) { + logError('streamChatOnce: failed to parse chunk.', { engine: params.engine, chunk: summarizeText(trimmed, 200), error: e?.message ?? 
String(e) }); + } + }; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) break; + if (this.isStaleRun(params.runId)) { + return { text: accumulated, stopReason: finishStopReason, aborted: true }; + } + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split('\n'); + buffer = lines.pop() || ''; + for (const line of lines) consumeJsonLine(line); + } + if (buffer.trim()) consumeJsonLine(buffer); + } catch (err: any) { + if (err?.name === 'AbortError') { + return { text: accumulated, stopReason: finishStopReason, aborted: true }; + } + logError('streamChatOnce REST path failed.', { engine: params.engine, error: err?.message ?? String(err) }); + throw err; + } finally { + try { reader.releaseLock(); } catch { /* already released on abort */ } + } + return { text: accumulated, stopReason: finishStopReason, aborted: false }; + } + private normalizeMessages(messages: ChatMessage[]) { return messages.map((message) => { const normalizedContent = typeof message.content === 'string' diff --git a/src/config.ts b/src/config.ts index 61df6f9..e4c7535 100644 --- a/src/config.ts +++ b/src/config.ts @@ -45,6 +45,19 @@ export interface IAgentConfig { maxAutoContinuations: number; /** 모델이 내부 사고만 출력하고 답변이 없으면 "최종 답변만" 지시로 1회 재생성. */ finalOnlyRetryOnThoughtLeak: boolean; + // ─── Hybrid Semantic Search ─── + /** + * Embedding model name as registered in LM Studio / Ollama. Empty disables + * semantic search and the retriever falls back to TF-IDF only. The user + * must load this model in the engine before enabling it here. + */ + embeddingModel: string; + /** + * Blend between TF-IDF (sparse) and embedding cosine (dense) scoring. + * 0 = TF-IDF only (status quo), 1 = embedding only. + * Default 0.5 = equal weight, a reasonable starting point. 
+ */ + embeddingBlendAlpha: number; } // ─── 경로 정규화 유틸리티 ─── @@ -125,7 +138,9 @@ export function getConfig(): IAgentConfig { smallModelContextCap: Math.max(0, cfg.get('smallModelContextCap', 0)), autoContinueOnOutputLimit: cfg.get('autoContinueOnOutputLimit', true), maxAutoContinuations: Math.max(0, Math.min(10, cfg.get('maxAutoContinuations', 4))), - finalOnlyRetryOnThoughtLeak: cfg.get('finalOnlyRetryOnThoughtLeak', true) + finalOnlyRetryOnThoughtLeak: cfg.get('finalOnlyRetryOnThoughtLeak', true), + embeddingModel: (cfg.get('embeddingModel', '') || '').trim(), + embeddingBlendAlpha: Math.max(0, Math.min(1, cfg.get('embeddingBlendAlpha', 0.5))), }; } diff --git a/src/core/queue.ts b/src/core/queue.ts index 637d57f..74ce1bf 100644 --- a/src/core/queue.ts +++ b/src/core/queue.ts @@ -1,8 +1,24 @@ +import * as os from 'os'; import { logInfo, logError } from '../utils'; /** - * ActionQueueManager: Manages large-scale tasks by processing them - * with a concurrency limit to prevent resource exhaustion and I/O bottlenecks + * Default concurrency = max(2, cpus - 1). Leaves one core for the VS Code UI + * thread and the extension host, scales up on bigger boxes. Static per-process + * (no dynamic adjustment) — kept simple because the heavy work (LLM calls) + * is gated by `missionId` locks elsewhere, not the action queue. + */ +function defaultConcurrencyLimit(): number { + try { + const cpus = os.cpus()?.length ?? 4; + return Math.max(2, cpus - 1); + } catch { + return 3; + } +} + +/** + * ActionQueueManager: Manages large-scale tasks by processing them + * with a concurrency limit to prevent resource exhaustion and I/O bottlenecks * while maintaining high throughput under maximum load. 
*/ export class ActionQueueManager { @@ -10,7 +26,7 @@ export class ActionQueueManager { private activeCount: number = 0; private readonly concurrencyLimit: number; - constructor(concurrencyLimit: number = 3) { + constructor(concurrencyLimit: number = defaultConcurrencyLimit()) { this.concurrencyLimit = concurrencyLimit; } diff --git a/src/core/telemetry.ts b/src/core/telemetry.ts new file mode 100644 index 0000000..ca6f51b --- /dev/null +++ b/src/core/telemetry.ts @@ -0,0 +1,129 @@ +/** + * ============================================================ + * Telemetry — append-only usage events to `.astra/usage.jsonl` + * + * Why local-file telemetry instead of a webview dashboard or remote endpoint: + * - Astra is local-first. No data leaves the machine. + * - JSONL is trivial to inspect manually (`tail`, jq) and trivial to ingest + * into a future webview chart without schema migrations. + * - Append-only means the writer never blocks on history. + * + * Event shape is intentionally flat — top-level scalar fields only, so a future + * dashboard can sum/group/filter without parsing nested structures. + * ============================================================ + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import { getAstraDataDir } from './astraPath'; +import { logError } from '../utils'; + +/** Top-level event kinds. Add sparingly — each is a stable contract for the JSONL. */ +export type TelemetryEventKind = + | 'turn' // one user-visible chat turn (input → final answer) + | 'continuation' // an auto-continuation round inside a turn + | 'retrieval' // brain + memory retrieval summary + | 'session-end'; // session closed (used to bound aggregation queries) + +export interface TelemetryEvent { + kind: TelemetryEventKind; + /** ISO timestamp. Always present so a viewer can plot on a time axis without recomputing. */ + ts: string; + /** Wall-clock milliseconds the event took, when applicable. 0 for instantaneous events. 
*/ + durationMs?: number; + /** Model identifier the request was bound to, when applicable. */ + model?: string; + /** Engine name (lmstudio | ollama), when applicable. */ + engine?: string; + /** Input token estimate that went into this event, when applicable. */ + inputTokens?: number; + /** Output token estimate produced by this event, when applicable. */ + outputTokens?: number; + /** Configured context window for this event, when applicable. */ + contextLength?: number; + /** Continuation round index for `kind: 'continuation'`. */ + round?: number; + /** Stop reason from the engine, when applicable. */ + stopReason?: string; + /** Brain files actually used this turn. */ + brainFiles?: number; + /** Memory layers that contributed chunks this turn. */ + memoryLayers?: string[]; + /** Free-form structured details. Keep small — this lives in the JSONL forever. */ + note?: string; +} + +const MAX_FILE_BYTES = 5 * 1024 * 1024; // 5 MB → ~25k events worst case +const ROTATE_KEEP = 2; // keep usage.jsonl + usage.1.jsonl + +function jsonlPath(): string { + return path.join(getAstraDataDir(), 'usage.jsonl'); +} + +function rotateIfNeeded(p: string): void { + try { + const stat = fs.statSync(p); + if (stat.size <= MAX_FILE_BYTES) return; + // Shift usage.{N-1}.jsonl → usage.{N}.jsonl, drop the oldest. + for (let i = ROTATE_KEEP; i >= 1; i--) { + const older = path.join(getAstraDataDir(), `usage.${i}.jsonl`); + const newer = i === 1 ? p : path.join(getAstraDataDir(), `usage.${i - 1}.jsonl`); + if (fs.existsSync(newer)) { + if (i === ROTATE_KEEP && fs.existsSync(older)) { + try { fs.unlinkSync(older); } catch { /* non-fatal */ } + } + try { fs.renameSync(newer, older); } catch { /* non-fatal */ } + } + } + } catch { + // File doesn't exist yet — first write will create it. + } +} + +/** + * Append one event to the rotating JSONL. Best-effort: failures are logged but + * never thrown, because telemetry must not break a live chat turn. 
+ */ +export function recordTelemetry(event: Omit & { ts?: string }): void { + try { + const full: TelemetryEvent = { ts: new Date().toISOString(), ...event }; + const line = JSON.stringify(full) + '\n'; + const p = jsonlPath(); + rotateIfNeeded(p); + fs.appendFile(p, line, { encoding: 'utf8' }, (err) => { + if (err) logError('Telemetry append failed.', { error: err.message }); + }); + } catch (e: any) { + // Final safety net — telemetry must never escape. + logError('Telemetry recordTelemetry threw.', { error: e?.message ?? String(e) }); + } +} + +/** + * Read the last `limit` events from the current and prior usage files. Used by + * a future Settings panel chart; here so the viewer doesn't have to parse paths + * or worry about rotation. + */ +export function readRecentTelemetry(limit = 500): TelemetryEvent[] { + const dir = getAstraDataDir(); + const files: string[] = []; + const head = path.join(dir, 'usage.jsonl'); + if (fs.existsSync(head)) files.push(head); + for (let i = 1; i <= ROTATE_KEEP; i++) { + const p = path.join(dir, `usage.${i}.jsonl`); + if (fs.existsSync(p)) files.push(p); + } + const out: TelemetryEvent[] = []; + for (const f of files) { + try { + const raw = fs.readFileSync(f, 'utf8'); + for (const line of raw.split('\n')) { + const trimmed = line.trim(); + if (!trimmed) continue; + try { out.push(JSON.parse(trimmed) as TelemetryEvent); } catch { /* skip bad line */ } + } + } catch { /* skip unreadable file */ } + if (out.length >= limit * 2) break; // rough upper bound to bound work + } + return out.slice(-limit); +} diff --git a/src/extension.ts b/src/extension.ts index 951cbb4..f4e762e 100644 --- a/src/extension.ts +++ b/src/extension.ts @@ -119,11 +119,13 @@ export async function activate(context: vscode.ExtensionContext) { ); // 3. Initialize Approval subsystem (queue + panel webview + status bar badge) + // Astra 2.81: sidebar view container is gone; all webviews open in editor + // column 3 instead. 
We don't register a WebviewViewProvider — panels are + // created on-demand via openAsPanel(). const approvalQueue = new ApprovalQueue(); const approvalPanel = new ApprovalPanelProvider(context.extensionUri, approvalQueue); const approvalStatusBar = new ApprovalStatusBar(approvalQueue); context.subscriptions.push( - vscode.window.registerWebviewViewProvider(ApprovalPanelProvider.viewType, approvalPanel), approvalStatusBar, { dispose: () => approvalQueue.dispose() }, vscode.commands.registerCommand(ApprovalStatusBar.focusCommand, () => approvalPanel.focus()), @@ -140,14 +142,16 @@ export async function activate(context: vscode.ExtensionContext) { approvalQueue, }); - // 4. Initialize Sidebar Provider + // 4. Initialize Chat Provider (renders into an editor column, not a sidebar view) provider = new SidebarChatProvider(context.extensionUri, context, agent, { lifecycle, activity: activityTracker, loadedModels: () => lmStudioClient.listLoadedCached(), }); context.subscriptions.push( - vscode.window.registerWebviewViewProvider(SidebarChatProvider.viewType, provider) + vscode.commands.registerCommand('g1nation.openChat', () => { + provider!.openAsPanel(vscode.ViewColumn.Three); + }) ); // 4. Initialize Bridge Server (Port 4825) @@ -559,7 +563,6 @@ export async function activate(context: vscode.ExtensionContext) { telegramBot, }); context.subscriptions.push( - vscode.window.registerWebviewViewProvider(SettingsPanelProvider.viewType, settingsPanel), // Refresh the settings UI whenever any g1nation.* config changes (toggle, allowedChatIds, …). vscode.workspace.onDidChangeConfiguration((e) => { if (e.affectsConfiguration('g1nation')) void settingsPanel.refresh(); @@ -628,6 +631,18 @@ export async function activate(context: vscode.ExtensionContext) { if (!setupComplete) { await runInitialSetup(context); } + + // 7. Auto-open all three Astra webviews as tabs in editor column 3. 
+ // The sidebar/activity-bar entry point was removed in 2.81 — all three views + // (Chat, Approvals, Settings) now stack as tabs in the third editor column. + // Order matters: Chat opens last so it ends up as the active tab. + try { + approvalPanel.openAsPanel(vscode.ViewColumn.Three); + await settingsPanel.openAsPanel(vscode.ViewColumn.Three); + provider!.openAsPanel(vscode.ViewColumn.Three); + } catch (e) { + logError('Failed to auto-open Astra panels.', e); + } } export async function deactivate() { diff --git a/src/features/approval/approvalPanelProvider.ts b/src/features/approval/approvalPanelProvider.ts index 3fac639..e549893 100644 --- a/src/features/approval/approvalPanelProvider.ts +++ b/src/features/approval/approvalPanelProvider.ts @@ -1,5 +1,6 @@ import * as vscode from 'vscode'; import { ApprovalQueue, Approval } from './approvalQueue'; +import { wrapPanelAsView } from '../../sidebarProvider'; /** * A small webview view that surfaces the currently pending approval, separate @@ -14,6 +15,7 @@ export class ApprovalPanelProvider implements vscode.WebviewViewProvider { public static readonly viewType = 'g1nation-approval-panel'; private _view?: vscode.WebviewView; + private _panel?: vscode.WebviewPanel; private _subscription?: vscode.Disposable; constructor( @@ -22,6 +24,32 @@ export class ApprovalPanelProvider implements vscode.WebviewViewProvider { ) {} public resolveWebviewView(view: vscode.WebviewView): void { + this._initView(view); + } + + /** Open the approvals UI as an editor panel (Column 3 by default). 
*/ + public openAsPanel(column: vscode.ViewColumn = vscode.ViewColumn.Three): vscode.WebviewPanel { + if (this._panel) { + this._panel.reveal(column); + return this._panel; + } + const panel = vscode.window.createWebviewPanel( + ApprovalPanelProvider.viewType, + 'Pending Approvals', + column, + { enableScripts: true, localResourceRoots: [this._extensionUri], retainContextWhenHidden: true } + ); + this._panel = panel; + const adapter = wrapPanelAsView(panel); + panel.onDidDispose(() => { + if (this._panel === panel) this._panel = undefined; + if (this._view === adapter) this._view = undefined; + }); + this._initView(adapter); + return panel; + } + + private _initView(view: vscode.WebviewView): void { this._view = view; view.webview.options = { enableScripts: true, localResourceRoots: [this._extensionUri] }; view.webview.html = this._render(this._queue.current()); @@ -40,13 +68,17 @@ export class ApprovalPanelProvider implements vscode.WebviewViewProvider { view.onDidDispose(() => { this._subscription?.dispose(); this._subscription = undefined; - this._view = undefined; + if (this._view === view) this._view = undefined; }); } /** Bring the panel into focus; used by the status bar badge. */ public focus(): void { - void vscode.commands.executeCommand(`${ApprovalPanelProvider.viewType}.focus`); + if (this._panel) { + this._panel.reveal(this._panel.viewColumn ?? vscode.ViewColumn.Three); + return; + } + this.openAsPanel(); } private _render(approval: Approval | null): string { diff --git a/src/features/settings/settingsPanelProvider.ts b/src/features/settings/settingsPanelProvider.ts index b2cbb0e..92e553a 100644 --- a/src/features/settings/settingsPanelProvider.ts +++ b/src/features/settings/settingsPanelProvider.ts @@ -123,45 +123,23 @@ export class SettingsPanelProvider implements vscode.WebviewViewProvider { } public async focus(): Promise { - // Reveal the Astra activity-bar container so a focus() doesn't silently - // no-op against a collapsed sidebar. 
- try { - await vscode.commands.executeCommand('workbench.view.extension.g1nation-sidebar'); - } catch { - // Older VS Code versions may not expose this command. - } - try { - await vscode.commands.executeCommand(`${SettingsPanelProvider.viewType}.focus`); - } catch (e: any) { - // The view-focus command is auto-generated only when VS Code parsed - // the package.json `views` entry. If a stale .vsix is installed - // (or the user hasn't reloaded after a fresh install) the command - // is missing and we hit `command not found`. Fall back to a - // floating panel so the user still gets the same UI. - if (this._isCommandNotFound(e)) { - logInfo('Settings view command missing — opening as floating panel.'); - await this.openAsPanel(); - return; - } - throw e; - } + await this.openAsPanel(); } /** - * Open the same settings UI as a stand-alone editor panel. Used when the - * sidebar `WebviewView` isn't registered yet (e.g. user installed a fresh - * .vsix without reloading) — keeps the feature reachable without forcing - * the user back through `vsce package` cycles. + * Open the settings UI as a stand-alone editor panel (Column 3 by default). + * Astra's sidebar view container was removed in 2.81 — all three webviews + * (Chat, Approvals, Settings) now live in the editor area. 
*/ - public async openAsPanel(): Promise { + public async openAsPanel(column: vscode.ViewColumn = vscode.ViewColumn.Three): Promise { if (this._panel) { - this._panel.reveal(vscode.ViewColumn.Active); - return; + this._panel.reveal(column); + return this._panel; } const panel = vscode.window.createWebviewPanel( - 'g1nation-settings-panel-floating', + SettingsPanelProvider.viewType, 'Astra Settings', - vscode.ViewColumn.Active, + column, { enableScripts: true, localResourceRoots: [this._deps.extensionUri], retainContextWhenHidden: true } ); this._panel = panel; @@ -169,11 +147,7 @@ export class SettingsPanelProvider implements vscode.WebviewViewProvider { panel.onDidDispose(() => { this._panel = undefined; }); await this._refreshState(); void this._fetchModelsAndRefresh(); - } - - private _isCommandNotFound(e: unknown): boolean { - const msg = (e as any)?.message ?? String(e ?? ''); - return /command\s+'.+'\s+not found/i.test(msg); + return panel; } /** Re-pull state from sources of truth and broadcast to the webview. */ diff --git a/src/retrieval/brainIndex.ts b/src/retrieval/brainIndex.ts index d48d98b..864dc6b 100644 --- a/src/retrieval/brainIndex.ts +++ b/src/retrieval/brainIndex.ts @@ -17,7 +17,10 @@ import { tokenize, countConflictIndicators } from './scoring'; import { detectLessonKind } from './lessonHelpers'; import { logInfo } from '../utils'; -const INDEX_VERSION = 3; +// v4 adds optional per-file `embedding` for hybrid (sparse+dense) retrieval. +// Older v3 indexes are auto-rebuilt on first load — no migration needed because +// the cache is derivable from the brain itself. +const INDEX_VERSION = 4; const INDEX_DIR = '.astra'; const INDEX_FILE = 'brain-index.json'; /** 인덱스가 이 개수를 넘으면 이번 스캔에서 못 본 항목을 정리합니다 (삭제된 파일 누적 방지). 
*/ @@ -34,6 +37,14 @@ interface IndexEntry { titleTokens: string[]; // tokenize(title) conflictCount: number; // countConflictIndicators(`${title} ${content}`) kind: string; // '' for an ordinary note, else 'lesson' | 'playbook' | 'qa-finding' + /** + * Dense embedding for hybrid retrieval. Populated lazily by a background + * pass after the file is tokenized — TF-IDF queries don't wait on it. + * Cleared when mtimeMs/size change because the content moved on. + */ + embedding?: number[]; + /** Embedding model the vector was produced with — invalidates the vector when the user switches models. */ + embeddingModel?: string; } interface PersistedIndex { @@ -212,6 +223,93 @@ export function getBrainTokenIndex(brainPath: string, files: string[]): IndexedB return out; } +/** + * Pull (filePath, embedding) for every file in `filePaths` that has a current + * cached vector under `model`. Caller uses this to rank top TF-IDF candidates + * by cosine similarity. Files missing an embedding are silently omitted. + */ +export function getBrainEmbeddings(brainPath: string, filePaths: string[], model: string): Map { + const out = new Map(); + if (!brainPath || !model.trim() || !Array.isArray(filePaths) || filePaths.length === 0) return out; + const st = _states.get(brainPath); + if (!st) return out; + for (const fp of filePaths) { + const entry = st.index.entries[fp]; + if (!entry?.embedding || entry.embeddingModel !== model) continue; + if (!Array.isArray(entry.embedding) || entry.embedding.length === 0) continue; + out.set(fp, entry.embedding); + } + return out; +} + +/** + * Background fill: for each file under `filePaths`, embed its content with + * `embedFn` if no current vector exists for `model`. Calls `embedFn` in + * caller-controlled batches (caller can chunk filePaths as wanted), and saves + * the disk index. Designed to be fire-and-forget — failures are logged and + * swallowed. 
+ * + * Returns the count of newly embedded files (0 when everything was cached + * already or the model is empty). + */ +export async function backfillBrainEmbeddings( + brainPath: string, + filePaths: string[], + model: string, + embedFn: (texts: string[]) => Promise, +): Promise { + if (!brainPath || !model.trim() || !Array.isArray(filePaths) || filePaths.length === 0) return 0; + const st = _states.get(brainPath); + if (!st) return 0; + const stale: string[] = []; + for (const fp of filePaths) { + const entry = st.index.entries[fp]; + if (!entry) continue; + if (entry.embedding && entry.embeddingModel === model) continue; + stale.push(fp); + } + if (stale.length === 0) return 0; + // Build embedding inputs from cached tokens (much cheaper than re-reading + // the file). We re-read content only when the cached tokens are missing + // somehow — defensive, but the index always has them after tokenization. + const texts: string[] = []; + const keys: string[] = []; + for (const fp of stale) { + const entry = st.index.entries[fp]; + if (!entry) continue; + let text = ''; + if (Array.isArray(entry.tokens) && entry.tokens.length > 0) { + text = `${entry.title}\n${entry.tokens.join(' ')}`; + } else { + try { text = fs.readFileSync(fp, 'utf8'); } catch { continue; } + } + if (!text.trim()) continue; + texts.push(text); + keys.push(fp); + } + if (texts.length === 0) return 0; + try { + const vectors = await embedFn(texts); + for (let i = 0; i < vectors.length && i < keys.length; i++) { + const v = vectors[i]; + if (!Array.isArray(v) || v.length === 0) continue; + const entry = st.index.entries[keys[i]]; + if (!entry) continue; + entry.embedding = v; + entry.embeddingModel = model; + st.dirty = true; + } + if (st.dirty) { + logInfo('Brain embeddings backfilled.', { brainPath, model, embedded: vectors.length }); + scheduleWrite(st, brainPath); + } + return vectors.length; + } catch (e: any) { + logInfo('Brain embedding backfill failed (TF-IDF still works).', { brainPath, 
model, error: e?.message ?? String(e) }); + return 0; + } +} + /** Drop the in-memory index (and pending write) for one brain, or all brains. The disk file is left as-is. */ export function clearBrainTokenIndex(brainPath?: string): void { if (brainPath === undefined) { diff --git a/src/retrieval/contextBudget.ts b/src/retrieval/contextBudget.ts index 3aae427..e3e0e0b 100644 --- a/src/retrieval/contextBudget.ts +++ b/src/retrieval/contextBudget.ts @@ -101,6 +101,7 @@ export function assembleContext(chunks: RetrievalChunk[]): string { 'brain-trace': '📚 Second Brain Knowledge', 'brain-memory': '📚 Brain Knowledge', 'long-term-memory': '🧠 Long-Term Memory (사용자 규칙/결정)', + 'medium-term-memory': '🗂️ Medium-Term Memory (최근 세션 요약)', 'project-memory': '📂 Project Memory (프로젝트 컨텍스트)', 'procedural-memory': '📋 Procedural Memory (반복 절차)', 'episodic-memory': '📖 Episodic Memory (과거 대화 흐름)', diff --git a/src/retrieval/embeddings.ts b/src/retrieval/embeddings.ts new file mode 100644 index 0000000..8b44dd4 --- /dev/null +++ b/src/retrieval/embeddings.ts @@ -0,0 +1,167 @@ +/** + * ============================================================ + * Embeddings — local hybrid (sparse + dense) retrieval support + * + * TF-IDF is fast and zero-cost but misses synonyms / paraphrase. A small local + * embedding model (BGE-small, multilingual-e5-small, nomic-embed-text, …) + * loaded in LM Studio or Ollama bridges that gap without sending anything + * off the machine. + * + * Design choices: + * - Opt-in via g1nation.embeddingModel (empty = disabled). We don't auto- + * pick a model because the user has to load it in LM Studio/Ollama first. + * - Calls are best-effort: a missing model / network blip falls back to + * pure TF-IDF without breaking the query. + * - We never block retrieval on embedding work. Missing-file embeddings are + * populated by a separate fire-and-forget pass after the TF-IDF answer + * ships, so the *next* query benefits. 
+ * + * Numerical format: + * - Vectors are `number[]` (not Float32Array) so they JSON-serialize for + * the brain-index cache without per-element conversion. The hot loop + * (cosine) is small enough that the extra precision is irrelevant to + * throughput on typical brain sizes. + * ============================================================ + */ + +import { resolveEngine, buildApiUrl, logError, logInfo } from '../utils'; + +/** Maximum characters of a single text chunk fed to the embedding model. */ +const EMBED_INPUT_CAP = 4000; +/** Maximum texts per embedding API call. */ +const BATCH_SIZE = 16; +/** Request timeout for one embedding batch. */ +const REQ_TIMEOUT_MS = 30000; + +export interface EmbeddingCallOptions { + /** OpenAI-compatible base URL (e.g. http://127.0.0.1:1234 for LM Studio). */ + baseUrl: string; + /** Embedding model name as registered in LM Studio / Ollama. Empty disables. */ + model: string; + /** AbortSignal for cancellation propagation. */ + signal?: AbortSignal; +} + +/** + * Embed a batch of texts. Returns one vector per input. Throws if the call + * fails — callers wrap with try/catch and fall back to TF-IDF. + * + * Engine selection mirrors the chat path: LM Studio takes precedence when the + * URL points at port 1234 or includes the /v1/ prefix, otherwise Ollama. + */ +export async function embedTexts(texts: string[], opts: EmbeddingCallOptions): Promise { + if (!opts.model.trim()) throw new Error('Embedding model not configured.'); + if (!texts || texts.length === 0) return []; + const engine = resolveEngine(opts.baseUrl); + const url = buildApiUrl(opts.baseUrl, engine, 'embeddings'); + const out: number[][] = []; + for (let i = 0; i < texts.length; i += BATCH_SIZE) { + const batch = texts.slice(i, i + BATCH_SIZE).map((t) => clipForEmbedding(t)); + const body = engine === 'lmstudio' + ? 
{ model: opts.model, input: batch } + : { model: opts.model, input: batch }; // Ollama 0.1.30+ also accepts array input + const controller = opts.signal ? undefined : new AbortController(); + const timer = controller ? setTimeout(() => controller.abort(), REQ_TIMEOUT_MS) : undefined; + try { + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify(body), + signal: opts.signal ?? controller?.signal, + }); + if (!response.ok) { + const errText = await response.text().catch(() => ''); + throw new Error(`Embedding endpoint returned ${response.status}: ${errText.slice(0, 200)}`); + } + const json = await response.json() as any; + // OpenAI-compatible: { data: [{ embedding: [...] }, ...] } + // Ollama: { embedding: [...] } (single) or { embeddings: [[...], ...] } (newer) + if (Array.isArray(json?.data)) { + for (const row of json.data) { + if (Array.isArray(row?.embedding)) out.push(row.embedding as number[]); + } + } else if (Array.isArray(json?.embeddings)) { + for (const v of json.embeddings) { + if (Array.isArray(v)) out.push(v as number[]); + } + } else if (Array.isArray(json?.embedding)) { + out.push(json.embedding as number[]); + } + } finally { + if (timer) clearTimeout(timer); + } + } + return out; +} + +/** Cosine similarity for equal-length vectors. Returns 0 when either vector is empty / zero. */ +export function cosineSimilarity(a: number[], b: number[]): number { + if (!a || !b || a.length === 0 || b.length === 0) return 0; + const n = Math.min(a.length, b.length); + let dot = 0, na = 0, nb = 0; + for (let i = 0; i < n; i++) { + const va = a[i], vb = b[i]; + dot += va * vb; + na += va * va; + nb += vb * vb; + } + if (na === 0 || nb === 0) return 0; + return dot / (Math.sqrt(na) * Math.sqrt(nb)); +} + +/** Clip a text to a length the embedding model will accept without truncation surprises. 
*/ +function clipForEmbedding(text: string): string { + if (!text) return ''; + return text.length <= EMBED_INPUT_CAP ? text : text.slice(0, EMBED_INPUT_CAP); +} + +/** + * Tiny LRU for query embeddings: typing the same query twice (or retrying) + * shouldn't re-hit the embedding endpoint. Keyed on `model + text`. + * + * Capped at QUERY_CACHE_MAX entries; oldest evicted. Strictly process-local + * (no disk persistence) because the query strings are short and the gains + * across restarts are marginal. + */ +const QUERY_CACHE_MAX = 32; +const _queryCache = new Map(); +function queryCacheKey(model: string, text: string): string { return `${model}|${text}`; } +export function getCachedQueryEmbedding(model: string, text: string): number[] | undefined { + const k = queryCacheKey(model, text); + const v = _queryCache.get(k); + if (!v) return undefined; + // refresh recency + _queryCache.delete(k); + _queryCache.set(k, v); + return v; +} +export function setCachedQueryEmbedding(model: string, text: string, vec: number[]): void { + const k = queryCacheKey(model, text); + _queryCache.set(k, vec); + if (_queryCache.size > QUERY_CACHE_MAX) { + const oldest = _queryCache.keys().next().value; + if (oldest !== undefined) _queryCache.delete(oldest); + } +} + +/** + * Embed a single query string, using the in-process LRU. Returns `undefined` + * if the embedding endpoint fails — callers treat that as "semantic + * scoring unavailable for this turn, fall back to TF-IDF". 
+ */ +export async function embedQuery(text: string, opts: EmbeddingCallOptions): Promise { + if (!opts.model.trim() || !text.trim()) return undefined; + const cached = getCachedQueryEmbedding(opts.model, text); + if (cached) return cached; + try { + const [vec] = await embedTexts([text], opts); + if (vec && vec.length > 0) { + setCachedQueryEmbedding(opts.model, text, vec); + logInfo('Query embedding computed.', { model: opts.model, dim: vec.length }); + return vec; + } + } catch (e: any) { + logError('Query embedding failed.', { model: opts.model, error: e?.message ?? String(e) }); + } + return undefined; +} diff --git a/src/retrieval/index.ts b/src/retrieval/index.ts index 9c3b470..c1ae52f 100644 --- a/src/retrieval/index.ts +++ b/src/retrieval/index.ts @@ -19,15 +19,32 @@ import { findBrainFiles, summarizeText } from '../utils'; import { isInside } from '../lib/paths'; import { MemoryManager } from '../memory'; import { RetrievalChunk, RetrievalResult, ContextBudgetConfig } from './types'; -import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring'; +import { tokenize, expandQuery, scoreTfIdfPreTokenized, extractBestExcerpt, extractBestSection } from './scoring'; import { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget'; -import { getBrainTokenIndex } from './brainIndex'; +import { getBrainTokenIndex, getBrainEmbeddings } from './brainIndex'; +import { extractLessonEssence } from './lessonHelpers'; +import { cosineSimilarity } from './embeddings'; export { tokenize, expandQuery, scoreTfIdf, scoreTfIdfPreTokenized, extractBestExcerpt } from './scoring'; export { selectWithinBudget, assembleContext, estimateTokens } from './contextBudget'; export { getBrainTokenIndex, clearBrainTokenIndex } from './brainIndex'; export * from './types'; +/** Compact summary of a past chat session for medium-term memory retrieval. 
*/ +export interface RecentSessionSummary { + id: string; + title: string; + firstUserMsg: string; + lastAssistantExcerpt: string; + /** + * Optional LLM-compressed recap stored at session end (~200 chars). + * When present, retrieval uses this instead of the firstUserMsg+tail + * fragment because it actually captures the decision/outcome. + */ + summary?: string; + timestamp: number; +} + interface RetrievalOptions { brain: BrainProfile; memoryManager: MemoryManager; @@ -44,6 +61,26 @@ interface RetrievalOptions { * silently dropped by the caller (see `agentKnowledgeMap.resolveScopeForAgent`). */ scopeFolders?: string[]; + /** + * Compact summaries of recently-touched chat sessions (excluding the + * active one). Scored against the query and the top `mediumTermLimit` + * are injected as medium-term memory chunks. Caller pre-computes these + * to avoid threading vscode/ExtensionContext through this module. + */ + recentSessions?: RecentSessionSummary[]; + /** Max number of medium-term session chunks to include after scoring. */ + mediumTermLimit?: number; + /** + * Optional query embedding for hybrid (sparse+dense) brain search. When + * provided, each candidate file's cached embedding is cosine-matched and + * blended with the TF-IDF score by `embeddingBlendAlpha`. Caller computes + * this once per turn so we don't pay the embedding RTT inside scoring. + */ + queryEmbedding?: number[]; + /** Embedding model name (used as a cache key on the brain index side). */ + embeddingModel?: string; + /** Blend weight: 0 = TF-IDF only, 1 = cosine only. Default 0.5. 
*/ + embeddingBlendAlpha?: number; } export class RetrievalOrchestrator { @@ -60,7 +97,7 @@ export class RetrievalOrchestrator { fusionLog.push(`Query tokens: [${queryTokens.slice(0, 10).join(', ')}]`); fusionLog.push(`Expanded tokens: [${expandedTokens.slice(0, 15).join(', ')}]`); - // ── ① Brain File Search (TF-IDF enhanced) ── + // ── ① Brain File Search (TF-IDF enhanced, optionally hybrid with embeddings) ── const scopeFolders = options.scopeFolders ?? []; const brainChunks = this.searchBrainFiles( query, @@ -68,7 +105,10 @@ export class RetrievalOrchestrator { options.brain, options.brainFileLimit || 8, options.includeRawConversations || false, - scopeFolders + scopeFolders, + options.queryEmbedding, + options.embeddingModel, + options.embeddingBlendAlpha ); allChunks.push(...brainChunks); fusionLog.push( @@ -87,6 +127,15 @@ export class RetrievalOrchestrator { allChunks.push(...memoryChunks); fusionLog.push(`Memory search: ${memoryChunks.length} chunks found`); + // ── ②-b Medium-Term Memory (recent sessions) ── + const mediumChunks = this.scoreRecentSessions( + expandedTokens, + options.recentSessions || [], + options.mediumTermLimit ?? 
0 + ); + allChunks.push(...mediumChunks); + fusionLog.push(`Medium-term sessions: ${mediumChunks.length} chunks selected`); + // ── ③ Result Fusion — normalize scores across sources ── this.normalizeScores(allChunks); fusionLog.push(`Total chunks before budget: ${allChunks.length}`); @@ -129,7 +178,10 @@ export class RetrievalOrchestrator { brain: BrainProfile, limit: number, includeRaw: boolean, - scopeFolders: string[] = [] + scopeFolders: string[] = [], + queryEmbedding?: number[], + embeddingModel?: string, + embeddingBlendAlpha?: number, ): RetrievalChunk[] { try { const scoped = (file: string) => scopeFolders.length === 0 @@ -155,6 +207,34 @@ export class RetrievalOrchestrator { })) ); + // Hybrid blend: when the caller provided a query embedding and an + // embedding model, fetch the cached file vectors and add a cosine + // similarity term to each score. We normalise TF-IDF scores by the + // top observed value so the two terms live on the same scale before + // blending. Files without a cached embedding keep their pure TF-IDF + // score so adding/missing embeddings doesn't hurt retrieval. + if (queryEmbedding && embeddingModel && (embeddingBlendAlpha ?? 0) > 0) { + const alpha = Math.max(0, Math.min(1, embeddingBlendAlpha!)); + const filePaths = indexed.map((d) => d.filePath); + const embeddings = getBrainEmbeddings(brain.localBrainPath, filePaths, embeddingModel); + if (embeddings.size > 0) { + const maxTfidf = scored.reduce((m, s) => s.score > m ? s.score : m, 0) || 1; + let hits = 0; + for (const s of scored) { + const fp = indexed[s.index].filePath; + const vec = embeddings.get(fp); + if (!vec) continue; + const cos = cosineSimilarity(queryEmbedding, vec); // [-1, 1] in theory; positive for typical embedding spaces + const tfidfNorm = s.score / maxTfidf; + s.score = (1 - alpha) * tfidfNorm + alpha * Math.max(0, cos); + hits++; + } + if (hits > 0) { + // Re-sort downstream is handled by the .filter().sort() that follows. 
+ } + } + } + // Always consider lesson cards for the top slots even if they didn't crack the raw-score top-`limit`: // they're short, high-signal, and we want them surfaced when relevant. We keep the regular top-`limit` // and additively pull in up to a few lesson cards (deduped by index). @@ -180,12 +260,20 @@ export class RetrievalOrchestrator { // Only the chosen files are actually read off disk (for excerpt extraction). let content = ''; try { content = fs.readFileSync(doc.filePath, 'utf8'); } catch { /* deleted just now — skip */ continue; } - // Lesson cards: hand back the whole card (they're meant to be short) so the Prevention Checklist - // survives; fall back to a generous excerpt for long ones. Regular notes: the usual 400-char excerpt. + // Lesson cards: extract just the high-signal sections (Mistake / Root Cause / Fix / + // Prevention Checklist) instead of dumping the whole 2500-char card. Old lessons + // without those headings fall back to a query-targeted excerpt. Cuts retrieval tokens + // by ~70% per lesson without losing the guardrail content. + // + // Regular notes: pick the best heading-bounded section for the query (markdown + // section retrieval) so that long notes don't dump their intro/setup blocks just + // because they happen to be in the top 400 chars. Falls back to keyword-window + // extraction inside the section, or whole-doc extraction when there are no + // headings at all. const excerpt = isLesson - ? (content.length <= 2500 ? content.trim() : extractBestExcerpt(content, expandedTokens, 1500)) - : extractBestExcerpt(content, expandedTokens, 400); - const cap = isLesson ? 2500 : 400; + ? extractLessonEssence(content, 1200) || extractBestExcerpt(content, expandedTokens, 1200) + : extractBestSection(content, expandedTokens, 600); + const cap = isLesson ? 
1200 : 600; topResults.push({ id: `brain-${s.index}`, source: 'brain-memory' as const, @@ -287,6 +375,70 @@ export class RetrievalOrchestrator { return chunks; } + // ─── Medium-Term: Recent Sessions ─── + + /** + * Score the user-provided session summaries against the current query + * (lightweight token overlap — sessions are small so we skip the TF-IDF + * machinery) and return up to `limit` as chunks. Each chunk packs the + * title + first user message + last assistant excerpt — enough for the + * model to recall the thread without re-injecting the whole transcript. + * + * Why include recent sessions at all: short-term covers "this conversation", + * long-term covers "stable brain notes", but there's a gap for "what we + * worked on yesterday/last week" that the user expects me to remember. + */ + private scoreRecentSessions( + expandedTokens: string[], + sessions: RecentSessionSummary[], + limit: number, + ): RetrievalChunk[] { + if (!sessions || sessions.length === 0 || limit <= 0) return []; + const qSet = new Set(expandedTokens.filter((t) => t.length >= 2)); + const scored = sessions.map((s) => { + // Prefer the LLM-compressed summary when present — it's a real + // 2-3 sentence recap of the session, so query matches against it + // are far more meaningful than against an arbitrary head/tail. + const text = s.summary + ? `${s.title}\n${s.summary}` + : `${s.title}\n${s.firstUserMsg}\n${s.lastAssistantExcerpt}`; + const docTokens = tokenize(text); + let overlap = 0; + for (const t of docTokens) if (qSet.has(t)) overlap++; + // Tiny recency boost so equal-overlap sessions prefer the more + // recent one (most users mean "what we just discussed"). +0.1 max + // for sessions <7 days old, decays to 0 beyond that. + const ageDays = s.timestamp ? Math.max(0, (Date.now() - s.timestamp) / 86400000) : 999; + const recency = ageDays < 7 ? 
(7 - ageDays) / 70 : 0; + return { s, score: overlap + recency }; + }).filter((x) => x.score > 0); + scored.sort((a, b) => b.score - a.score); + const picked = scored.slice(0, limit); + if (picked.length === 0) return []; + return picked.map(({ s, score }, idx) => { + const dateStr = s.timestamp ? new Date(s.timestamp).toISOString().slice(0, 10) : ''; + // Prefer the LLM-compressed summary; fall back to the raw fragments + // when the session ended before the summarizer could run (or was + // too short to summarize, < 3 visible messages). + const body = s.summary + ? [`**${s.title}**${dateStr ? ` (${dateStr})` : ''}`, s.summary].join('\n') + : [ + `**${s.title}**${dateStr ? ` (${dateStr})` : ''}`, + s.firstUserMsg ? `사용자 요청: ${s.firstUserMsg}` : '', + s.lastAssistantExcerpt ? `이전 답변 마지막 부분: …${s.lastAssistantExcerpt}` : '', + ].filter(Boolean).join('\n'); + return { + id: `mtm-${idx}-${s.id}`, + source: 'medium-term-memory', + title: s.title || '(untitled session)', + content: body, + score, + tokenEstimate: estimateTokens(body), + metadata: { category: 'medium-term', lastUpdated: s.timestamp }, + }; + }); + } + // ─── Score Normalization ─── /** @@ -315,6 +467,7 @@ export class RetrievalOrchestrator { 'project-memory': 0.85, 'long-term-memory': 0.8, 'procedural-memory': 0.95, // Procedural is highly specific + 'medium-term-memory': 0.78, // recent sessions: useful when the user references "last time / yesterday" 'episodic-memory': 0.7, 'project-scan': 0.6, 'recent-knowledge': 0.75 diff --git a/src/retrieval/lessonHelpers.ts b/src/retrieval/lessonHelpers.ts index 0e7278a..611dbcb 100644 --- a/src/retrieval/lessonHelpers.ts +++ b/src/retrieval/lessonHelpers.ts @@ -47,6 +47,54 @@ function parseFrontmatterType(content: string): string { return m ? m[1].trim().toLowerCase() : ''; } +/** + * Pull a specific markdown section ("## NAME ... up to the next heading") from a lesson card. + * Returns trimmed body text, or '' if the heading isn't found. 
/**
 * Pull one markdown section out of a lesson card: the text that follows the
 * first heading matched by `headingRe`, up to (not including) the next
 * heading of any level.
 *
 * @param content   Full markdown text of the lesson card.
 * @param headingRe Pattern matching the heading line to look for.
 * @returns The trimmed section body, or '' if the heading isn't found.
 */
function extractSection(content: string, headingRe: RegExp): string {
    const m = content.match(headingRe);
    // `index` is undefined for global-flag matches; treat that as "not found".
    if (!m || m.index === undefined) return '';
    const after = content.slice(m.index + m[0].length);
    const stop = after.search(/\n#{1,6}\s/);
    const section = stop >= 0 ? after.slice(0, stop) : after;
    return section.trim();
}

/** Marker appended whenever lesson text had to be cut short. */
const LESSON_TRUNCATION_MARK = '\n…';

/**
 * Sections kept by `extractLessonEssence`, in output order: the canonical
 * heading to emit and the (English / Korean, case-insensitive) pattern that
 * recognises that heading in a card. None of the patterns carries the `g`
 * flag, so they hold no match state and can be shared across calls.
 */
const LESSON_ESSENCE_SECTIONS: ReadonlyArray<readonly [string, RegExp]> = [
    ['## Mistake / Risk', /^#{1,6}\s*(?:mistake\s*\/?\s*risk|mistake|risk|실수|문제)\s*$/im],
    ['## Root Cause', /^#{1,6}\s*(?:root\s*cause|근본\s*원인|원인)\s*$/im],
    ['## Fix', /^#{1,6}\s*(?:fix|해결|수정)\s*$/im],
    ['## Prevention Checklist', /^#{1,6}\s*(?:prevention\s*checklist|prevention|체크리스트|예방\s*체크리스트)\s*$/im],
];

/**
 * Clip `text` so the result, including the truncation marker, never exceeds
 * `maxLen` characters. Text that already fits is returned trimmed, unmarked.
 */
function clipLessonText(text: string, maxLen: number): string {
    if (text.length <= maxLen) return text.trim();
    // Degenerate budgets: no room for the marker, so just hard-cut.
    if (maxLen <= LESSON_TRUNCATION_MARK.length) return text.slice(0, Math.max(0, maxLen)).trim();
    return text.slice(0, maxLen - LESSON_TRUNCATION_MARK.length).trim() + LESSON_TRUNCATION_MARK;
}

/**
 * Slim a lesson card down to the sections that matter for guardrails:
 * Mistake / Risk, Root Cause, Fix, and Prevention Checklist. Everything else
 * (Situation, Applies-To, narrative) is dropped. The output re-emits the
 * canonical `##` headings so the model still sees the structure.
 *
 * Sections whose body is only a `<placeholder>` tag are treated as unfilled
 * template slots and skipped. When no recognised section is found (e.g. old
 * lessons that don't follow the current template) the original content is
 * returned instead.
 *
 * @param content Full markdown text of the lesson card.
 * @param maxLen  Upper bound on the returned length. Longer text is cut and
 *                suffixed with "\n…"; the suffix is counted inside the bound.
 * @returns The condensed card, or '' for empty input.
 */
export function extractLessonEssence(content: string, maxLen = 1200): string {
    if (!content) return '';
    const kept: string[] = [];
    for (const [heading, pattern] of LESSON_ESSENCE_SECTIONS) {
        const body = extractSection(content, pattern);
        // A body that is a single <...> tag is an unfilled template placeholder.
        if (body && !/^<[^>]+>$/.test(body)) kept.push(`${heading}\n${body}`);
    }
    if (kept.length === 0) return clipLessonText(content, maxLen);
    return clipLessonText(kept.join('\n\n'), maxLen);
}
/** One heading-delimited slice of a markdown document. */
export interface MarkdownSection {
    /** The heading line itself (e.g. `## Setup`), or '' for a preamble / heading-less document. */
    heading: string;
    /** Trimmed text between this heading and the next heading of any level. */
    body: string;
}

/**
 * Split markdown content into a flat list of sections, one per ATX heading
 * (`#` through `######`).
 *
 * - `heading` is the heading line itself (level preserved); `body` is the
 *   text up to the next heading of ANY level, so sub-sections become their
 *   own entries rather than being nested in their parent.
 * - A leading byte-order mark and a leading `--- … ---` front-matter block
 *   are dropped.
 * - Text above the first heading is returned as a section with `heading: ''`.
 * - A document with no headings returns one synthetic section
 *   `{ heading: '', body }` so callers can treat the result uniformly.
 * - Lines inside fenced code blocks (``` or ~~~) are never treated as
 *   headings, so a shell comment such as `# install deps` stays in the body.
 *
 * @param content Raw markdown text.
 * @returns Sections in document order; `[]` for empty input.
 */
export function splitMarkdownSections(content: string): MarkdownSection[] {
    if (!content) return [];
    // Strip a leading BOM so it can hide neither front-matter nor a first-line heading.
    let text = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content;
    // Strip front-matter: an opening `---` line followed later by a line starting with `---`.
    if (/^---\s*\n/.test(text)) {
        const end = text.indexOf('\n---', 3);
        if (end >= 0) text = text.slice(end + 4).replace(/^\s*\n/, '');
    }
    const lines = text.split('\n');
    const fenceRe = /^\s{0,3}(`{3,}|~{3,})/;
    const headingRe = /^#{1,6}\s+\S/;
    const headingLines: number[] = [];
    let openFence: string | null = null; // '`' or '~' while inside a fenced block
    for (let i = 0; i < lines.length; i++) {
        const fence = fenceRe.exec(lines[i]);
        if (fence) {
            const marker = fence[1].charAt(0);
            if (openFence === null) openFence = marker;
            else if (openFence === marker) openFence = null;
            continue;
        }
        if (openFence !== null) continue;
        if (headingRe.test(lines[i])) headingLines.push(i);
    }
    if (headingLines.length === 0) {
        return [{ heading: '', body: text.trim() }];
    }
    const sections: MarkdownSection[] = [];
    // Capture any leading content above the first heading as a "preamble" section.
    if (headingLines[0] > 0) {
        const preamble = lines.slice(0, headingLines[0]).join('\n').trim();
        if (preamble) sections.push({ heading: '', body: preamble });
    }
    for (let i = 0; i < headingLines.length; i++) {
        const start = headingLines[i];
        const end = i + 1 < headingLines.length ? headingLines[i + 1] : lines.length;
        sections.push({
            heading: lines[start].trim(),
            body: lines.slice(start + 1, end).join('\n').trim(),
        });
    }
    return sections;
}
/**
 * Pick the heading-bounded section of a markdown document that best matches
 * a query, shrinking it with keyword-window extraction if it is too long.
 *
 * Strategy:
 *   1. Split into sections by heading (`splitMarkdownSections`).
 *   2. Score each section by how many of its tokens appear in the expanded
 *      query; heading hits count 3× so "## Foo" beats a body mention of "foo".
 *   3. If the winning section fits, return heading + body as-is.
 *   4. Otherwise run `extractBestExcerpt` inside the section body and
 *      prepend the heading.
 *
 * Fallbacks:
 *   - A document with no headings goes straight to `extractBestExcerpt`.
 *   - If no section scores at least one hit, the whole document is passed
 *     to `extractBestExcerpt` rather than returning an arbitrary section.
 *
 * Caps:
 *   - Heading-based results are sliced to `maxLength` as a safety net.
 *   - Sections whose heading + body total under 24 chars are skipped; they
 *     are usually empty placeholder headings.
 *
 * @param content     Raw markdown text.
 * @param queryTokens Already-tokenized query terms.
 * @param maxLength   Maximum length of the returned excerpt.
 * @returns The selected excerpt.
 */
export function extractBestSection(
    content: string,
    queryTokens: string[],
    maxLength = 600
): string {
    const sections = splitMarkdownSections(content);
    if (sections.length === 0) return content.slice(0, maxLength);
    if (sections.length === 1 && !sections[0].heading) {
        return extractBestExcerpt(sections[0].body || content, queryTokens, maxLength);
    }
    const expandedSet = new Set(expandQuery(queryTokens));
    const countHits = (text: string): number => {
        if (!text) return 0;
        let hits = 0;
        for (const t of tokenize(text)) if (expandedSet.has(t)) hits++;
        return hits;
    };
    // Start at score 0 so only sections with at least one query hit can win.
    // (Starting at -1 let the first section win with zero hits, which made
    // the whole-document fallback below effectively unreachable.)
    let best = { idx: -1, score: 0 };
    for (let i = 0; i < sections.length; i++) {
        const s = sections[i];
        if ((s.heading.length + s.body.length) < 24) continue;
        const score = countHits(s.heading) * 3 + countHits(s.body);
        if (score > best.score) best = { idx: i, score };
    }
    if (best.idx < 0) {
        // No section contained any query terms — fall back to a whole-doc excerpt.
        return extractBestExcerpt(content, queryTokens, maxLength);
    }
    const picked = sections[best.idx];
    const headingLine = picked.heading ? `${picked.heading}\n` : '';
    // Leave the excerpt at least 64 chars even under a very long heading;
    // the final slice still enforces maxLength.
    const room = Math.max(64, maxLength - headingLine.length);
    if (picked.body.length <= room) {
        return (headingLine + picked.body).slice(0, maxLength).trim();
    }
    const inner = extractBestExcerpt(picked.body, queryTokens, room);
    return (headingLine + inner).slice(0, maxLength).trim();
}
/**
 * Identifies which retrieval backend produced a context item.
 */
export type RetrievalSource =
    /** Second Brain Trace */
    | 'brain-trace'
    /** findRelevantBrainMemory (legacy) */
    | 'brain-memory'
    /** Long-Term Memory */
    | 'long-term-memory'
    /** Recent session summaries (memoryMediumTermSessions) */
    | 'medium-term-memory'
    /** Project Memory */
    | 'project-memory'
    /** Procedural Memory */
    | 'procedural-memory'
    /** Episodic Memory */
    | 'episodic-memory'
    /** Local Project Path scan */
    | 'project-scan'
    /** Recent Project Knowledge record */
    | 'recent-knowledge';
+ * Open the chat as a standalone editor panel (Column 3 by default). + * Reuses the same view-init logic via a WebviewPanel→WebviewView adapter + * so the rest of the provider keeps using `this._view` unchanged. + */ + public openAsPanel(column: vscode.ViewColumn = vscode.ViewColumn.Three): vscode.WebviewPanel { + if (this._panel) { + this._panel.reveal(column); + return this._panel; + } + const panel = vscode.window.createWebviewPanel( + SidebarChatProvider.viewType, + 'Astra Chat', + column, + { enableScripts: true, localResourceRoots: [this._extensionUri], retainContextWhenHidden: true } + ); + this._panel = panel; + const adapter = wrapPanelAsView(panel); + panel.onDidDispose(() => { + if (this._panel === panel) this._panel = undefined; + if (this._view === adapter) this._view = undefined; + }); + this._initView(adapter); + return panel; + } + + private _initView(webviewView: vscode.WebviewView) { this._view = webviewView; webviewView.webview.options = { @@ -108,8 +139,8 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn // 5초 이내에 이미 갱신했으면 건너뜀 if (now - _lastVisibilityRefresh < 5000) return; _lastVisibilityRefresh = now; - - logInfo('Sidebar became visible, restoring state...'); + + logInfo('Astra view became visible, restoring state...'); void this._sendModels(); void this._sendBrainProfiles(); void this._sendAgentsList(); @@ -2043,3 +2074,39 @@ export class SidebarChatProvider implements vscode.WebviewViewProvider, BridgeIn .replace('__SCRIPT_URI__', scriptUri); } } + +/** + * Adapter that makes a {@link vscode.WebviewPanel} quack like a + * {@link vscode.WebviewView}, so providers written against the view API can + * mount inside an editor column without their internals knowing the difference. + * + * `onDidChangeVisibility` is synthesized from `onDidChangeViewState` — panels + * fire that event for both visibility *and* column moves, but the listener + * here only re-fires when the visible flag actually toggles. 
/**
 * Present a {@link vscode.WebviewPanel} through the {@link vscode.WebviewView}
 * surface, so a provider written against the view API can be mounted in an
 * editor column.
 *
 * `onDidChangeVisibility` is derived from the panel's `onDidChangeViewState`
 * and fires only when `panel.visible` differs from the last observed value.
 * `show()` reveals the panel in its current column (third column if it has
 * none). `description` and `badge` are plain placeholders.
 *
 * @param panel The panel to adapt.
 * @returns An object typed as a `WebviewView`, backed by `panel`.
 */
export function wrapPanelAsView(panel: vscode.WebviewPanel): vscode.WebviewView {
    const visibilityChanged = new vscode.EventEmitter<void>();
    let wasVisible = panel.visible;

    panel.onDidChangeViewState(() => {
        const nowVisible = panel.visible;
        if (nowVisible === wasVisible) return;
        wasVisible = nowVisible;
        visibilityChanged.fire();
    });
    // Release the synthesized emitter together with the panel.
    panel.onDidDispose(() => visibilityChanged.dispose());

    const adapter: any = {
        viewType: panel.viewType,
        webview: panel.webview,
        get visible() { return panel.visible; },
        get title() { return panel.title; },
        set title(v: string | undefined) { panel.title = v ?? ''; },
        description: undefined as string | undefined,
        badge: undefined as vscode.ViewBadge | undefined,
        onDidChangeVisibility: visibilityChanged.event,
        onDidDispose: panel.onDidDispose,
        show(preserveFocus?: boolean) {
            panel.reveal(panel.viewColumn ?? vscode.ViewColumn.Three, preserveFocus);
        },
    };
    return adapter as vscode.WebviewView;
}

/**
 * Build the full URL of an engine endpoint from a user-supplied base URL.
 *
 * The base URL is passed through `normalizeBaseUrl`; the engine's API root
 * (`/v1` for `lmstudio`, `/api` for anything else) is appended only if the
 * normalized URL does not already end with it.
 *
 * Resulting paths: LM Studio — `/models`, `/embeddings`,
 * `/chat/completions`; otherwise — `/tags`, `/embed`, `/chat`.
 *
 * @param baseUrl  Base URL as configured.
 * @param engine   Which engine the URL is for.
 * @param endpoint Logical endpoint to address.
 * @returns The complete endpoint URL.
 */
export function buildApiUrl(baseUrl: string, engine: EngineKind, endpoint: 'models' | 'chat' | 'embeddings'): string {
    const normalized = normalizeBaseUrl(baseUrl);
    const isLmStudio = engine === 'lmstudio';
    const rootSuffix = isLmStudio ? '/v1' : '/api';
    const root = normalized.endsWith(rootSuffix) ? normalized : `${normalized}${rootSuffix}`;

    switch (endpoint) {
        case 'models':
            return isLmStudio ? `${root}/models` : `${root}/tags`;
        case 'embeddings':
            return isLmStudio ? `${root}/embeddings` : `${root}/embed`;
        default:
            return isLmStudio ? `${root}/chat/completions` : `${root}/chat`;
    }
}