67927b1d4e
골든셋(24질의) 측정으로 기존 하이브리드 구현의 결함 3건을 잡고 기본 활성화. 측정 결과: recall@3 83.3%→87.5%, MRR 0.802→0.806, recall@1 회귀 없음 (α=0.5). 수정 (측정으로 검증): - 임베딩 입력을 토큰 재조합(tokens.join)→원문 슬라이스로 교체 + nomic/e5 task prefix (search_query:/search_document:). 토큰 죽 입력은 하이브리드를 전 지표 하락시켰음 (recall@1 75%→54%). @r2 리비전 키로 구벡터 자동 무효화. - 블렌드 스케일 버그: 벡터 있는 후보만 정규화돼 벡터 없는 후보의 raw 점수가 상위 독식 → 전 후보 정규화 + cosine 후보군 내 min-max 정규화. - 헤딩-only 청크도 헤딩 텍스트로 임베딩 (벡터 공백 제거). 추가: - embeddingBootstrap: 활성화 시 엔진 모델 목록에서 임베딩 모델 자동 감지 → embeddingModel 자동 설정 + "전체 색인" 버튼 알림. 다국어 모델(e5/bge-m3) 우선. 사용자가 의도적으로 비우면 재설정 안 함 (globalState 가드). - 벡터 저장 시 소수 4자리 양자화 — 캐시 360MB→~150MB (코사인 순위 영향 없음). - tests/retrievalEvalEmbedding.test.ts: env-gated 하이브리드 측정 하니스 (alpha sweep). - scripts/compact_brain_index.mjs: 기존 full-precision 캐시 1회 압축 도구. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
185 lines
8.0 KiB
TypeScript
185 lines
8.0 KiB
TypeScript
/**
|
|
* ============================================================
|
|
* Embeddings — local hybrid (sparse + dense) retrieval support
|
|
*
|
|
* TF-IDF is fast and zero-cost but misses synonyms / paraphrase. A small local
|
|
* embedding model (BGE-small, multilingual-e5-small, nomic-embed-text, …)
|
|
* loaded in LM Studio or Ollama bridges that gap without sending anything
|
|
* off the machine.
|
|
*
|
|
* Design choices:
|
|
* - Opt-in via g1nation.embeddingModel (empty = disabled). We don't auto-
|
|
* pick a model because the user has to load it in LM Studio/Ollama first.
|
|
* - Calls are best-effort: a missing model / network blip falls back to
|
|
* pure TF-IDF without breaking the query.
|
|
* - We never block retrieval on embedding work. Missing-file embeddings are
|
|
* populated by a separate fire-and-forget pass after the TF-IDF answer
|
|
* ships, so the *next* query benefits.
|
|
*
|
|
* Numerical format:
|
|
* - Vectors are `number[]` (not Float32Array) so they JSON-serialize for
|
|
* the brain-index cache without per-element conversion. The hot loop
|
|
* (cosine) is small enough that the extra precision is irrelevant to
|
|
* throughput on typical brain sizes.
|
|
* ============================================================
|
|
*/
|
|
|
|
import { resolveEngine, buildApiUrl, logError, logInfo } from '../utils';
|
|
|
|
/** Upper bound on the length (as measured by `String.length`) of one text sent to the embedding model. */
const EMBED_INPUT_CAP = 4_000;

/** Number of texts packed into a single embeddings request. */
const BATCH_SIZE = 16;

/** Timeout, in milliseconds, applied to one embedding batch request. */
const REQ_TIMEOUT_MS = 30 * 1_000;
|
|
|
|
/** Endpoint, model and per-call settings accepted by the embedding helpers in this module. */
export interface EmbeddingCallOptions {
  /** Base URL of the OpenAI-compatible server (for example http://127.0.0.1:1234 for LM Studio). */
  baseUrl: string;

  /** Embedding model name as registered in LM Studio / Ollama. An empty string disables embeddings. */
  model: string;

  /** Optional AbortSignal used to propagate cancellation into the HTTP request. */
  signal?: AbortSignal;

  /**
   * Task kind for asymmetric embedding models. nomic-embed v1.5 and the e5
   * family must be given different prefixes for queries and for documents to
   * reach their intended quality. embedTexts defaults to 'document';
   * embedQuery always uses 'query'.
   */
  kind?: 'query' | 'document';
}
|
|
|
|
/**
 * Look up the task prefix a model expects in front of its input text.
 *
 * Without the prefix, nomic and e5 models lose a large part of their semantic
 * matching quality (the original note cites the model cards as the source of
 * this requirement). Models that match neither family get no prefix.
 *
 * @param model Model name; matched case-insensitively.
 * @param kind  'query' for search queries; any other value selects the document prefix.
 * @returns The prefix including its trailing space, or '' for unrecognised models.
 */
export function taskPrefix(model: string, kind: 'query' | 'document'): string {
  // Order matters: the first matching family wins, so nomic is checked before e5.
  const families = [
    { pattern: /nomic/i, forQuery: 'search_query: ', forDocument: 'search_document: ' },
    // "e5" only counts when it is delimited by non-alphanumerics or the string ends.
    { pattern: /(^|[^a-z0-9])e5([^a-z0-9]|$)/i, forQuery: 'query: ', forDocument: 'passage: ' },
  ];
  const family = families.find((candidate) => candidate.pattern.test(model));
  if (!family) {
    return '';
  }
  return kind === 'query' ? family.forQuery : family.forDocument;
}
|
|
|
|
/**
 * Embed a list of texts and return one vector per input, in input order.
 *
 * Each text is clipped with clipForEmbedding, prefixed with the model's task
 * prefix (see taskPrefix) and sent to the embeddings endpoint in batches of
 * BATCH_SIZE. The engine is chosen by resolveEngine(opts.baseUrl) and the URL
 * by buildApiUrl; the same `{ model, input }` body is sent for every engine
 * (the original note states that Ollama 0.1.30+ also accepts array input).
 *
 * Every batch is bounded by REQ_TIMEOUT_MS, including when the caller supplies
 * its own AbortSignal: the caller's abort is forwarded to the request.
 *
 * @param texts Raw texts to embed. An empty list resolves to `[]` without any request.
 * @param opts  Endpoint, model, optional cancellation signal and task kind
 *              (defaults to 'document').
 * @returns One embedding per entry of `texts`.
 * @throws Error when no model is configured, when the endpoint answers with a
 *         non-OK status, when the request is aborted or times out, or when a
 *         response does not carry exactly one vector per text of its batch.
 *         Callers are expected to catch and fall back to TF-IDF.
 */
export async function embedTexts(texts: string[], opts: EmbeddingCallOptions): Promise<number[][]> {
  if (!opts.model.trim()) throw new Error('Embedding model not configured.');
  if (!texts || texts.length === 0) return [];

  const engine = resolveEngine(opts.baseUrl);
  const url = buildApiUrl(opts.baseUrl, engine, 'embeddings');
  const prefix = taskPrefix(opts.model, opts.kind ?? 'document');

  const out: number[][] = [];
  for (let i = 0; i < texts.length; i += BATCH_SIZE) {
    const batch = texts.slice(i, i + BATCH_SIZE).map((t) => prefix + clipForEmbedding(t));
    const vectors = await requestEmbeddingBatch(url, opts.model, batch, opts.signal);
    // A short (or long) answer would silently shift every later vector onto the
    // wrong text, so treat it as a failed call instead of returning misaligned data.
    if (vectors.length !== batch.length) {
      throw new Error(`Embedding endpoint returned ${vectors.length} vectors for ${batch.length} inputs.`);
    }
    out.push(...vectors);
  }
  return out;
}

/** POST one batch to the embeddings endpoint, bounded by REQ_TIMEOUT_MS and the caller's signal. */
async function requestEmbeddingBatch(
  url: Parameters<typeof fetch>[0],
  model: string,
  batch: string[],
  signal?: AbortSignal,
): Promise<number[][]> {
  // Always own a controller so the timeout applies even when the caller passes a signal.
  const controller = new AbortController();
  const forwardAbort = (): void => controller.abort();
  if (signal?.aborted) {
    controller.abort();
  } else {
    signal?.addEventListener('abort', forwardAbort, { once: true });
  }
  const timer = setTimeout(() => controller.abort(), REQ_TIMEOUT_MS);
  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model, input: batch }),
      signal: controller.signal,
    });
    if (!response.ok) {
      const errText = await response.text().catch(() => '');
      throw new Error(`Embedding endpoint returned ${response.status}: ${errText.slice(0, 200)}`);
    }
    const json: unknown = await response.json();
    return parseEmbeddingResponse(json);
  } finally {
    clearTimeout(timer);
    // Do not leave a listener behind on a long-lived caller signal.
    signal?.removeEventListener('abort', forwardAbort);
  }
}

/**
 * Extract vectors from the response shapes this module understands:
 * `{ data: [{ embedding }] }`, `{ embeddings: [[...]] }` or `{ embedding: [...] }`.
 * Entries that are not arrays are skipped; unknown shapes yield `[]`.
 */
function parseEmbeddingResponse(json: unknown): number[][] {
  const payload = (json ?? {}) as { data?: unknown; embeddings?: unknown; embedding?: unknown };
  const vectors: number[][] = [];
  if (Array.isArray(payload.data)) {
    for (const row of payload.data) {
      const embedding = (row as { embedding?: unknown } | null | undefined)?.embedding;
      if (Array.isArray(embedding)) vectors.push(embedding as number[]);
    }
  } else if (Array.isArray(payload.embeddings)) {
    for (const v of payload.embeddings) {
      if (Array.isArray(v)) vectors.push(v as number[]);
    }
  } else if (Array.isArray(payload.embedding)) {
    vectors.push(payload.embedding as number[]);
  }
  return vectors;
}
|
|
|
|
/**
 * Cosine similarity of two vectors.
 *
 * Intended for equal-length vectors; when the lengths differ only the first
 * `min(a.length, b.length)` components of each are used.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns The cosine of the angle between the vectors, or 0 when either
 *          vector is missing, empty or all-zero, or when the result is not a
 *          finite number (for example because a component is NaN or Infinity).
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (!a || !b || a.length === 0 || b.length === 0) return 0;

  const n = Math.min(a.length, b.length);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < n; i++) {
    const x = a[i];
    const y = b[i];
    dot += x * y;
    normA += x * x;
    normB += y * y;
  }
  if (normA === 0 || normB === 0) return 0;

  const similarity = dot / (Math.sqrt(normA) * Math.sqrt(normB));
  // A NaN score would make any sort or blend built on top of it unreliable.
  return Number.isFinite(similarity) ? similarity : 0;
}
|
|
|
|
/**
 * Clip a text so it is no longer than the embedding input cap.
 *
 * The cut never splits a UTF-16 surrogate pair: if the last kept code unit is
 * a high surrogate whose partner would be cut off, that unit is dropped too,
 * so the result never ends in a lone high surrogate created by the clip.
 *
 * @param text Text to clip; empty / missing text yields ''.
 * @param cap  Maximum length in UTF-16 code units. Defaults to EMBED_INPUT_CAP.
 * @returns `text` unchanged when it already fits, otherwise its clipped prefix.
 */
function clipForEmbedding(text: string, cap: number = EMBED_INPUT_CAP): string {
  if (!text) return '';
  const limit = Math.max(0, Math.floor(cap));
  if (text.length <= limit) return text;

  let end = limit;
  const lastKept = text.charCodeAt(end - 1); // NaN when end is 0, which fails the range test below
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
  return text.slice(0, end);
}
|
|
|
|
/**
 * Tiny LRU for query embeddings: typing the same query twice (or retrying)
 * should not re-hit the embedding endpoint. Keyed on `model + text`.
 *
 * Holds at most QUERY_CACHE_MAX entries; the least recently used entry is
 * evicted first. Recency is tracked through Map insertion order, so both reads
 * and writes re-insert the key. The cache lives in process memory only.
 */
const QUERY_CACHE_MAX = 32;

const _queryCache = new Map<string, number[]>();

/** Build the cache key for a model / query-text pair. */
function queryCacheKey(model: string, text: string): string {
  return `${model}|${text}`;
}

/**
 * Look up a cached query embedding and mark it as most recently used.
 *
 * @param model Embedding model name the vector was produced with.
 * @param text  Exact query text.
 * @returns The cached vector, or `undefined` on a miss.
 */
export function getCachedQueryEmbedding(model: string, text: string): number[] | undefined {
  const key = queryCacheKey(model, text);
  const hit = _queryCache.get(key);
  if (!hit) return undefined;
  // Re-insert so the entry moves to the "newest" end of the Map.
  _queryCache.delete(key);
  _queryCache.set(key, hit);
  return hit;
}

/**
 * Store a query embedding as the most recently used entry, evicting the least
 * recently used entries while the cache exceeds QUERY_CACHE_MAX.
 *
 * @param model Embedding model name the vector was produced with.
 * @param text  Exact query text.
 * @param vec   Embedding to cache (stored by reference).
 */
export function setCachedQueryEmbedding(model: string, text: string, vec: number[]): void {
  const key = queryCacheKey(model, text);
  // Map.set on an existing key keeps its old position; delete first so an
  // overwritten entry counts as fresh instead of being the next one evicted.
  _queryCache.delete(key);
  _queryCache.set(key, vec);
  while (_queryCache.size > QUERY_CACHE_MAX) {
    const oldest = _queryCache.keys().next().value;
    if (oldest === undefined) break;
    _queryCache.delete(oldest);
  }
}
|
|
|
|
/**
 * Embed one query string, consulting the in-process query cache first.
 *
 * The text is always embedded with kind 'query', whatever `opts.kind` says.
 * Failures are logged and swallowed so that callers can treat `undefined` as
 * "semantic scoring unavailable for this turn" and fall back to TF-IDF.
 *
 * @param text Query text; blank text resolves to `undefined` without a request.
 * @param opts Endpoint and model settings; a blank model resolves to `undefined`.
 * @returns The query vector, or `undefined` when embeddings are disabled, the
 *          call failed, or the endpoint returned no usable vector.
 */
export async function embedQuery(text: string, opts: EmbeddingCallOptions): Promise<number[] | undefined> {
  const modelMissing = !opts.model.trim();
  if (modelMissing || !text.trim()) {
    return undefined;
  }

  const hit = getCachedQueryEmbedding(opts.model, text);
  if (hit) {
    return hit;
  }

  try {
    const vectors = await embedTexts([text], { ...opts, kind: 'query' });
    const first = vectors[0];
    if (!first || first.length === 0) {
      return undefined;
    }
    setCachedQueryEmbedding(opts.model, text, first);
    logInfo('Query embedding computed.', { model: opts.model, dim: first.length });
    return first;
  } catch (e: any) {
    logError('Query embedding failed.', { model: opts.model, error: e?.message ?? String(e) });
    return undefined;
  }
}
|