refactor: optimize core engine and retrieval logic for v2.80.43

2026-05-13 19:23:57 +09:00
parent c4260466b9
commit 089abf22db
17 changed files with 1311 additions and 88 deletions
@@ -0,0 +1,167 @@
+/**
+ * ============================================================
+ * Embeddings — local hybrid (sparse + dense) retrieval support
+ *
+ * TF-IDF is fast and zero-cost but misses synonyms / paraphrase. A small local
+ * embedding model (BGE-small, multilingual-e5-small, nomic-embed-text, …)
+ * loaded in LM Studio or Ollama bridges that gap without sending anything
+ * off the machine.
+ *
+ * Design choices:
+ *   - Opt-in via g1nation.embeddingModel (empty = disabled). We don't auto-
+ *     pick a model because the user has to load it in LM Studio/Ollama first.
+ *   - Calls are best-effort: a missing model / network blip falls back to
+ *     pure TF-IDF without breaking the query.
+ *   - We never block retrieval on embedding work. Missing-file embeddings are
+ *     populated by a separate fire-and-forget pass after the TF-IDF answer
+ *     ships, so the *next* query benefits.
+ *
+ * Numerical format:
+ *   - Vectors are `number[]` (not Float32Array) so they JSON-serialize for
+ *     the brain-index cache without per-element conversion. The hot loop
+ *     (cosine) is cheap enough that the wider 64-bit elements make no
+ *     practical difference to throughput on typical brain sizes.
+ * ============================================================
+ */
+
+import { resolveEngine, buildApiUrl, logError, logInfo } from '../utils';
+
+/** Upper bound, in characters, on any single text handed to the embedding model. */
+const EMBED_INPUT_CAP = 4_000;
+/** Upper bound on the number of texts sent in one embedding API request. */
+const BATCH_SIZE = 16;
+/** Timeout, in milliseconds, for one embedding batch request. */
+const REQ_TIMEOUT_MS = 30_000;
+
+export interface EmbeddingCallOptions {
+    /** Base URL of the OpenAI-compatible server (e.g. http://127.0.0.1:1234 for LM Studio). */
+    baseUrl: string;
+    /** Embedding model name as registered in LM Studio / Ollama; an empty string disables embeddings. */
+    model: string;
+    /** Optional AbortSignal used to propagate cancellation into the request. */
+    signal?: AbortSignal;
+}
+
+/**
+ * Embed a list of texts and return exactly one vector per input, in input order.
+ *
+ * Texts are clipped with `clipForEmbedding` and sent in batches of at most
+ * `BATCH_SIZE` as a JSON POST to the URL produced by `buildApiUrl` for the
+ * engine that `resolveEngine` reports for `opts.baseUrl`.
+ *
+ * Every batch is bounded by `REQ_TIMEOUT_MS`. When the caller supplies
+ * `opts.signal`, its cancellation is forwarded to the request *in addition to*
+ * the timeout, so a hung endpoint can never stall a cancellable call forever.
+ *
+ * @param texts - Texts to embed. An empty (or missing) list yields `[]`.
+ * @param opts - Endpoint, model name and optional cancellation signal.
+ * @returns One embedding vector per input text.
+ * @throws Error when the model name is blank, the endpoint answers with a
+ *   non-2xx status, the request is aborted / times out, or the response does
+ *   not contain exactly one vector per text in the batch. Callers wrap this
+ *   in try/catch and fall back to TF-IDF.
+ */
+export async function embedTexts(texts: string[], opts: EmbeddingCallOptions): Promise<number[][]> {
+    if (!opts.model.trim()) throw new Error('Embedding model not configured.');
+    if (!texts || texts.length === 0) return [];
+    const engine = resolveEngine(opts.baseUrl);
+    const url = buildApiUrl(opts.baseUrl, engine, 'embeddings');
+    const out: number[][] = [];
+    for (let i = 0; i < texts.length; i += BATCH_SIZE) {
+        const batch = texts.slice(i, i + BATCH_SIZE).map((t) => clipForEmbedding(t));
+        // Both engines take the same payload (Ollama 0.1.30+ also accepts array input).
+        const body = { model: opts.model, input: batch };
+        // A local controller always owns the request so the timeout applies even
+        // when the caller passes their own signal; caller aborts are forwarded.
+        const controller = new AbortController();
+        const onCallerAbort = (): void => controller.abort();
+        if (opts.signal?.aborted) {
+            controller.abort();
+        } else {
+            opts.signal?.addEventListener('abort', onCallerAbort, { once: true });
+        }
+        const timer = setTimeout(() => controller.abort(), REQ_TIMEOUT_MS);
+        try {
+            const response = await fetch(url, {
+                method: 'POST',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify(body),
+                signal: controller.signal,
+            });
+            if (!response.ok) {
+                const errText = await response.text().catch(() => '');
+                throw new Error(`Embedding endpoint returned ${response.status}: ${errText.slice(0, 200)}`);
+            }
+            const vectors = extractEmbeddings(await response.json());
+            // A short or malformed response would silently misalign vectors with
+            // their source texts, so treat it as a failure instead.
+            if (vectors.length !== batch.length) {
+                throw new Error(`Embedding endpoint returned ${vectors.length} vectors for ${batch.length} inputs.`);
+            }
+            out.push(...vectors);
+        } finally {
+            clearTimeout(timer);
+            opts.signal?.removeEventListener('abort', onCallerAbort);
+        }
+    }
+    return out;
+}
+
+/**
+ * Pull the embedding vectors out of a parsed response body.
+ *
+ * Recognized shapes:
+ *   - OpenAI-compatible: `{ data: [{ embedding: [...] }, ...] }`
+ *   - Ollama (newer):    `{ embeddings: [[...], ...] }`
+ *   - Ollama (single):   `{ embedding: [...] }`
+ *
+ * Entries that are not arrays are dropped; an unrecognized body yields `[]`.
+ */
+function extractEmbeddings(json: unknown): number[][] {
+    const payload = (json ?? {}) as { data?: unknown; embeddings?: unknown; embedding?: unknown };
+    if (Array.isArray(payload.data)) {
+        return payload.data
+            .map((row: unknown) => (row as { embedding?: unknown } | null)?.embedding)
+            .filter((v): v is number[] => Array.isArray(v));
+    }
+    if (Array.isArray(payload.embeddings)) {
+        return payload.embeddings.filter((v): v is number[] => Array.isArray(v));
+    }
+    if (Array.isArray(payload.embedding)) {
+        return [payload.embedding as number[]];
+    }
+    return [];
+}
+
+/**
+ * Cosine similarity of two vectors.
+ *
+ * Only the first `min(a.length, b.length)` components are compared. The result
+ * is 0 when either vector is missing or empty, or when either compared prefix
+ * has zero magnitude.
+ */
+export function cosineSimilarity(a: number[], b: number[]): number {
+    if (!a || !b) return 0;
+    if (a.length === 0 || b.length === 0) return 0;
+
+    const len = Math.min(a.length, b.length);
+    let dotProduct = 0;
+    let sumSqA = 0;
+    let sumSqB = 0;
+    for (let idx = 0; idx < len; idx += 1) {
+        const x = a[idx];
+        const y = b[idx];
+        dotProduct += x * y;
+        sumSqA += x * x;
+        sumSqB += y * y;
+    }
+
+    // A zero-magnitude vector has no direction; report "no similarity".
+    if (sumSqA === 0 || sumSqB === 0) return 0;
+    const magnitude = Math.sqrt(sumSqA) * Math.sqrt(sumSqB);
+    return dotProduct / magnitude;
+}
+
+/**
+ * Clip a text to at most `EMBED_INPUT_CAP` UTF-16 code units before it is
+ * sent to the embedding model.
+ *
+ * If the cut would fall between the two halves of a surrogate pair, the
+ * dangling high surrogate is dropped as well so the clipped text never ends
+ * in half of a character.
+ *
+ * @param text - Text to clip; a falsy value yields the empty string.
+ * @returns The text unchanged when it already fits, otherwise its clipped prefix.
+ */
+function clipForEmbedding(text: string): string {
+    if (!text) return '';
+    if (text.length <= EMBED_INPUT_CAP) return text;
+    const lastKept = text.charCodeAt(EMBED_INPUT_CAP - 1);
+    // 0xD800–0xDBFF is the high-surrogate range: its partner would be cut off.
+    const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
+    return text.slice(0, endsOnHighSurrogate ? EMBED_INPUT_CAP - 1 : EMBED_INPUT_CAP);
+}
+
+/**
+ * Tiny LRU for query embeddings: typing the same query twice (or retrying)
+ * shouldn't re-hit the embedding endpoint. Keyed on the (model, text) pair.
+ *
+ * Capped at QUERY_CACHE_MAX entries; the least recently used entry is evicted
+ * first. Recency is tracked through the Map's insertion order, so both reads
+ * and writes re-insert the touched key. Strictly process-local (no disk
+ * persistence) because the query strings are short and the gains across
+ * restarts are marginal.
+ */
+const QUERY_CACHE_MAX = 32;
+const _queryCache = new Map<string, number[]>();
+/**
+ * Build the cache key for a (model, text) pair. JSON-encoding the pair keeps
+ * the key unambiguous even when either part contains the separator character.
+ */
+function queryCacheKey(model: string, text: string): string { return JSON.stringify([model, text]); }
+/**
+ * Look up a cached query embedding and mark it as most recently used.
+ *
+ * @returns The cached vector, or `undefined` when the pair is not cached.
+ */
+export function getCachedQueryEmbedding(model: string, text: string): number[] | undefined {
+    const k = queryCacheKey(model, text);
+    const v = _queryCache.get(k);
+    if (v === undefined) return undefined;
+    // refresh recency
+    _queryCache.delete(k);
+    _queryCache.set(k, v);
+    return v;
+}
+/**
+ * Store a query embedding as the most recently used entry, evicting the least
+ * recently used entries while the cache exceeds QUERY_CACHE_MAX.
+ */
+export function setCachedQueryEmbedding(model: string, text: string, vec: number[]): void {
+    const k = queryCacheKey(model, text);
+    // Map.set on an existing key keeps its old position, so delete first to
+    // move an overwritten entry to the most-recent end.
+    _queryCache.delete(k);
+    _queryCache.set(k, vec);
+    while (_queryCache.size > QUERY_CACHE_MAX) {
+        const oldest = _queryCache.keys().next().value;
+        if (oldest === undefined) break;
+        _queryCache.delete(oldest);
+    }
+}
+
+/**
+ * Embed one query string, consulting the in-process query cache first.
+ *
+ * Resolves to `undefined` when the model name or the text is blank, when the
+ * embedding call throws, or when no non-empty vector comes back — callers
+ * treat that as "semantic scoring unavailable for this turn, fall back to
+ * TF-IDF". A freshly computed vector is cached before it is returned.
+ */
+export async function embedQuery(text: string, opts: EmbeddingCallOptions): Promise<number[] | undefined> {
+    const { model } = opts;
+    if (!model.trim() || !text.trim()) return undefined;
+
+    const hit = getCachedQueryEmbedding(model, text);
+    if (hit) return hit;
+
+    try {
+        const vectors = await embedTexts([text], opts);
+        const first = vectors[0];
+        if (!first || first.length === 0) return undefined;
+        setCachedQueryEmbedding(model, text, first);
+        logInfo('Query embedding computed.', { model, dim: first.length });
+        return first;
+    } catch (e: any) {
+        // Best-effort: record the failure and let the caller fall back.
+        logError('Query embedding failed.', { model, error: e?.message ?? String(e) });
+        return undefined;
+    }
+}