/**
 * ============================================================
 * Embeddings — local hybrid (sparse + dense) retrieval support
 *
 * TF-IDF is fast and zero-cost but misses synonyms / paraphrase. A small local
 * embedding model (BGE-small, multilingual-e5-small, nomic-embed-text, …)
 * loaded in LM Studio or Ollama bridges that gap without sending anything
 * off the machine.
 *
 * Design choices:
 *  - Opt-in via g1nation.embeddingModel (empty = disabled). We don't auto-
 *    pick a model because the user has to load it in LM Studio/Ollama first.
 *  - Calls are best-effort: a missing model / network blip falls back to
 *    pure TF-IDF without breaking the query.
 *  - We never block retrieval on embedding work. Missing-file embeddings are
 *    populated by a separate fire-and-forget pass after the TF-IDF answer
 *    ships, so the *next* query benefits.
 *
 * Numerical format:
 *  - Vectors are `number[]` (not Float32Array) so they JSON-serialize for
 *    the brain-index cache without per-element conversion. The hot loop
 *    (cosine) is small enough that the extra precision is irrelevant to
 *    throughput on typical brain sizes.
 * ============================================================
 */

import { resolveEngine, buildApiUrl, logError, logInfo } from '../utils';

/** Maximum characters of a single text chunk fed to the embedding model. */
const EMBED_INPUT_CAP = 4000;

/** Maximum texts per embedding API call. */
const BATCH_SIZE = 16;

/** Request timeout for one embedding batch. */
const REQ_TIMEOUT_MS = 30000;

export interface EmbeddingCallOptions {
  /** OpenAI-compatible base URL (e.g. http://127.0.0.1:1234 for LM Studio). */
  baseUrl: string;
  /** Embedding model name as registered in LM Studio / Ollama. Empty disables. */
  model: string;
  /** AbortSignal for cancellation propagation. */
  signal?: AbortSignal;
  /**
   * Task kind for asymmetric embedding models. nomic-embed v1.5 and the e5
   * family *must* be given different prefixes for queries and documents to
   * reach their intended quality. `embedTexts` defaults to 'document';
   * `embedQuery` always uses 'query'.
   */
  kind?: 'query' | 'document';
}

/**
 * Per-model task prefix. Without the prefix, nomic / e5 models lose a lot of
 * semantic-matching quality (a requirement stated on their model cards).
 * Unknown models get no prefix.
 *
 * @param model Embedding model name; matched case-insensitively.
 * @param kind  Whether the text is a search query or an indexed document.
 * @returns The prefix to prepend to the text, or '' when none applies.
 */
export function taskPrefix(model: string, kind: 'query' | 'document'): string {
  if (/nomic/i.test(model)) {
    return kind === 'query' ? 'search_query: ' : 'search_document: ';
  }
  // "e5" must be a standalone token so names that merely contain the letters
  // (e.g. "tile5x") do not match.
  if (/(^|[^a-z0-9])e5([^a-z0-9]|$)/i.test(model)) {
    return kind === 'query' ? 'query: ' : 'passage: ';
  }
  return '';
}

/**
 * Pull the embedding vectors out of a parsed response body.
 *
 * Recognised shapes:
 *  - OpenAI-compatible: `{ data: [{ embedding: [...] }, ...] }`
 *  - Ollama (newer):    `{ embeddings: [[...], ...] }`
 *  - Ollama (single):   `{ embedding: [...] }`
 *
 * Entries that are not arrays are skipped; the caller checks the final count.
 */
function extractVectors(json: unknown): number[][] {
  if (typeof json !== 'object' || json === null) return [];
  const payload = json as { data?: unknown; embeddings?: unknown; embedding?: unknown };

  if (Array.isArray(payload.data)) {
    const vectors: number[][] = [];
    for (const row of payload.data) {
      const embedding = (row as { embedding?: unknown } | null | undefined)?.embedding;
      if (Array.isArray(embedding)) vectors.push(embedding as number[]);
    }
    return vectors;
  }
  if (Array.isArray(payload.embeddings)) {
    return payload.embeddings.filter((v): v is number[] => Array.isArray(v));
  }
  if (Array.isArray(payload.embedding)) {
    return [payload.embedding as number[]];
  }
  return [];
}

/**
 * Embed a batch of texts. Returns one vector per input, in input order.
 * Throws if the call fails — callers wrap with try/catch and fall back to
 * TF-IDF.
 *
 * The engine and endpoint URL are derived from `opts.baseUrl` via
 * `resolveEngine` / `buildApiUrl`.
 *
 * @param texts Texts to embed; each is clipped to EMBED_INPUT_CAP characters
 *              and prefixed according to `taskPrefix`.
 * @param opts  Endpoint, model, optional cancellation signal and task kind.
 * @returns One embedding per input text (empty array for empty input).
 * @throws Error when no model is configured, the endpoint answers with a
 *         non-2xx status, the request is aborted or times out, or the
 *         endpoint returns a different number of vectors than inputs.
 */
export async function embedTexts(texts: string[], opts: EmbeddingCallOptions): Promise<number[][]> {
  if (!opts.model.trim()) throw new Error('Embedding model not configured.');
  if (!texts || texts.length === 0) return [];

  const engine = resolveEngine(opts.baseUrl);
  const url = buildApiUrl(opts.baseUrl, engine, 'embeddings');
  const prefix = taskPrefix(opts.model, opts.kind ?? 'document');
  const out: number[][] = [];

  for (let i = 0; i < texts.length; i += BATCH_SIZE) {
    const batch = texts.slice(i, i + BATCH_SIZE).map((t) => prefix + clipForEmbedding(t));
    // Same payload for both engines: Ollama 0.1.30+ also accepts array input.
    const body = { model: opts.model, input: batch };

    // Always own a controller so the timeout applies even when the caller
    // supplies a signal; the caller's abort is forwarded onto it.
    const controller = new AbortController();
    const forwardAbort = (): void => controller.abort();
    if (opts.signal?.aborted) {
      controller.abort();
    } else {
      opts.signal?.addEventListener('abort', forwardAbort, { once: true });
    }
    const timer = setTimeout(() => controller.abort(), REQ_TIMEOUT_MS);

    try {
      const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(body),
        signal: controller.signal,
      });
      if (!response.ok) {
        const errText = await response.text().catch(() => '');
        throw new Error(`Embedding endpoint returned ${response.status}: ${errText.slice(0, 200)}`);
      }

      const vectors = extractVectors(await response.json());
      // A short or malformed reply would silently shift every later vector
      // onto the wrong text, so treat it as a failure.
      if (vectors.length !== batch.length) {
        throw new Error(
          `Embedding endpoint returned ${vectors.length} vectors for ${batch.length} inputs.`,
        );
      }
      out.push(...vectors);
    } finally {
      clearTimeout(timer);
      opts.signal?.removeEventListener('abort', forwardAbort);
    }
  }
  return out;
}

/**
 * Cosine similarity of two vectors.
 *
 * Intended for equal-length vectors; if the lengths differ, only the first
 * `min(a.length, b.length)` components are compared.
 *
 * @returns The cosine in [-1, 1], or 0 when either vector is missing, empty,
 *          or has zero magnitude over the compared components.
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (!a || !b || a.length === 0 || b.length === 0) return 0;
  const n = Math.min(a.length, b.length);
  let dot = 0;
  let na = 0;
  let nb = 0;
  for (let i = 0; i < n; i++) {
    const va = a[i];
    const vb = b[i];
    dot += va * vb;
    na += va * va;
    nb += vb * vb;
  }
  if (na === 0 || nb === 0) return 0;
  return dot / (Math.sqrt(na) * Math.sqrt(nb));
}

/**
 * Clip a text to at most EMBED_INPUT_CAP UTF-16 code units so the embedding
 * model accepts it without truncation surprises. Empty / missing text
 * becomes ''.
 */
function clipForEmbedding(text: string): string {
  if (!text) return '';
  if (text.length <= EMBED_INPUT_CAP) return text;
  let end = EMBED_INPUT_CAP;
  // Do not cut a surrogate pair in half: a trailing lone high surrogate is
  // not valid text, so drop it as well.
  const last = text.charCodeAt(end - 1);
  if (last >= 0xd800 && last <= 0xdbff) end -= 1;
  return text.slice(0, end);
}

/**
 * Tiny LRU for query embeddings: typing the same query twice (or retrying)
 * shouldn't re-hit the embedding endpoint. Keyed on `model + text`.
 *
 * Capped at QUERY_CACHE_MAX entries; oldest evicted. Strictly process-local
 * (no disk persistence) because the query strings are short and the gains
 * across restarts are marginal.
 */
const QUERY_CACHE_MAX = 32;

/** Insertion order of this Map doubles as recency order (oldest first). */
const _queryCache = new Map<string, number[]>();

/** Build the cache key for a (model, query text) pair. */
function queryCacheKey(model: string, text: string): string {
  return `${model}|${text}`;
}

/**
 * Look up a cached query embedding and mark it as most recently used.
 *
 * @returns The cached vector, or `undefined` on a miss.
 */
export function getCachedQueryEmbedding(model: string, text: string): number[] | undefined {
  const k = queryCacheKey(model, text);
  const v = _queryCache.get(k);
  if (v === undefined) return undefined;
  // Refresh recency: re-inserting moves the key to the end of the Map.
  _queryCache.delete(k);
  _queryCache.set(k, v);
  return v;
}

/**
 * Store a query embedding as the most recently used entry, evicting the
 * least recently used one when the cache exceeds QUERY_CACHE_MAX.
 */
export function setCachedQueryEmbedding(model: string, text: string, vec: number[]): void {
  const k = queryCacheKey(model, text);
  // Delete first: Map.set on an existing key keeps its old position, which
  // would leave a just-updated entry first in line for eviction.
  _queryCache.delete(k);
  _queryCache.set(k, vec);
  if (_queryCache.size > QUERY_CACHE_MAX) {
    const oldest = _queryCache.keys().next().value;
    if (oldest !== undefined) _queryCache.delete(oldest);
  }
}

/**
 * Embed a single query string, using the in-process LRU. Returns `undefined`
 * if no model is configured, the query is blank, or the embedding endpoint
 * fails — callers treat that as "semantic scoring unavailable for this turn,
 * fall back to TF-IDF". Never throws; failures are logged.
 *
 * @param text Query text to embed.
 * @param opts Endpoint options; `kind` is always overridden to 'query'.
 */
export async function embedQuery(text: string, opts: EmbeddingCallOptions): Promise<number[] | undefined> {
  if (!opts.model.trim() || !text.trim()) return undefined;

  const cached = getCachedQueryEmbedding(opts.model, text);
  if (cached) return cached;

  try {
    const [vec] = await embedTexts([text], { ...opts, kind: 'query' });
    if (vec && vec.length > 0) {
      setCachedQueryEmbedding(opts.model, text, vec);
      logInfo('Query embedding computed.', { model: opts.model, dim: vec.length });
      return vec;
    }
  } catch (e: unknown) {
    logError('Query embedding failed.', {
      model: opts.model,
      error: e instanceof Error ? e.message : String(e),
    });
  }
  return undefined;
}