/**
 * embedding-live-parity.bench.ts - LIVE benchmark vs qmd-embed-worker.
 *
 * NOT a vitest test (uses .bench.ts suffix to skip auto-discovery).
 * Run manually with `bun src/test-preload.ts test/embedding-live-parity.bench.ts`
 * or `npx tsx test/embedding-live-parity.bench.ts`.
 *
 * Pre-req: `QMD_EMBED_ENDPOINT=http://10.0.2.162:8082` (or any reachable
 * qmd-embed-worker / ai.mm.mk endpoint with the embeddinggemma model loaded).
 *
 * What it measures:
 *   1. Healthcheck — confirm endpoint is up and print the model it reports
 *   2. Single-text embed parity — same text via OpenAIEmbeddingsProvider
 *      vs LocalLlamaCppProvider, measure cosine similarity (target ≥0.999)
 *   3. Batch perf — embed QMD_BENCH_N texts (default 100) via HTTP and
 *      report throughput
 *
 * Environment variables read by this script:
 *   QMD_EMBED_ENDPOINT, QMD_EMBED_MODEL_ID, QMD_EMBED_UPSTREAM_MODEL,
 *   QMD_BENCH_SKIP_LOCAL, QMD_BENCH_N
 *
 * Local llama-cpp is OPTIONAL — set QMD_BENCH_SKIP_LOCAL=1 to skip parity
 * (only useful on machines without GPU/CPU model build support, like `code`
 * where Vulkan compilation fails).
 */
import { OpenAIEmbeddingsProvider } from "../src/embedding/openai.js";
import { LocalLlamaCppProvider } from "../src/embedding/local.js";

/**
 * Read an environment variable as a strictly positive integer.
 *
 * @param name - Environment variable to read.
 * @param fallback - Value used when the variable is unset or invalid.
 * @returns The parsed value, or `fallback` (with a warning) when the variable
 *   is set but does not parse to an integer greater than zero.
 */
function readPositiveIntEnv(name: string, fallback: number): number {
  const raw = process.env[name];
  if (raw === undefined) return fallback;
  const parsed = Number.parseInt(raw, 10);
  if (Number.isFinite(parsed) && parsed > 0) return parsed;
  console.warn(`${name}="${raw}" is not a positive integer; using ${fallback}`);
  return fallback;
}

// `||` (not `??`) is deliberate: an empty / whitespace-only value also falls
// back to the default.
const ENDPOINT = process.env.QMD_EMBED_ENDPOINT?.trim() || "http://10.0.2.162:8082";
const MODEL_ID = process.env.QMD_EMBED_MODEL_ID?.trim() || "embeddinggemma";
const UPSTREAM_MODEL = process.env.QMD_EMBED_UPSTREAM_MODEL?.trim() || "embeddinggemma:300m";
const SKIP_LOCAL = process.env.QMD_BENCH_SKIP_LOCAL === "1";
// Validated so a bad value cannot produce an empty batch or NaN in the report.
const N_PERF = readPositiveIntEnv("QMD_BENCH_N", 100);

/** Minimum cosine similarity for an HTTP/local pair to count as matching. */
const PARITY_THRESHOLD = 0.999;

/** Fixed inputs used for the HTTP-vs-local parity comparison. */
const SAMPLE_TEXTS: string[] = [
  "task: search result | query: hybrid search architecture",
  "title: README | text: QMD is a hybrid search engine combining BM25 and vector embeddings.",
  "title: Configuration | text: The retry schedule for 429 responses is 1s, 4s, 16s with up to 3 attempts.",
];

/**
 * Cosine similarity of two vectors.
 *
 * @returns 0 when the lengths differ; otherwise dot / (|a|·|b| + 1e-12).
 *   The epsilon keeps the result finite (0) for all-zero vectors.
 */
function cosine(a: number[], b: number[]): number {
  if (a.length !== b.length) return 0;
  let dot = 0;
  let na = 0;
  let nb = 0;
  for (let i = 0; i < a.length; i++) {
    const x = a[i]!;
    const y = b[i]!;
    dot += x * y;
    na += x * x;
    nb += y * y;
  }
  return dot / (Math.sqrt(na) * Math.sqrt(nb) + 1e-12);
}

/** Format a duration: whole milliseconds below one second, else seconds with 2 decimals. */
function fmtMs(ms: number): string {
  if (ms < 1000) return `${ms.toFixed(0)}ms`;
  return `${(ms / 1000).toFixed(2)}s`;
}

/**
 * Step 2: embed the sample texts over HTTP and, unless QMD_BENCH_SKIP_LOCAL=1,
 * compare each vector against the local llama-cpp provider.
 *
 * Any error from the local provider is reported and swallowed (best-effort);
 * errors from the HTTP provider propagate to the caller.
 */
async function runParityCheck(provider: OpenAIEmbeddingsProvider): Promise<void> {
  console.log("[2/3] HTTP embed parity check...");
  const httpStart = Date.now();
  const httpResults = await provider.embedBatch(SAMPLE_TEXTS);
  const httpMs = Date.now() - httpStart;
  console.log(` → HTTP embedded ${httpResults.length} texts in ${fmtMs(httpMs)}`);
  for (let i = 0; i < httpResults.length; i++) {
    const r = httpResults[i];
    console.log(` [${i}] dim=${r?.embedding.length ?? "null"}, model="${r?.model}"`);
  }

  if (SKIP_LOCAL) {
    console.log(" → Local comparison skipped (QMD_BENCH_SKIP_LOCAL=1)");
    return;
  }

  try {
    console.log("\n → Trying local llama-cpp comparison (may build models on first run)...");
    const local = new LocalLlamaCppProvider({ modelId: MODEL_ID });
    try {
      const localStart = Date.now();
      const localResults = await local.embedBatch(SAMPLE_TEXTS);
      const localMs = Date.now() - localStart;
      console.log(` → LOCAL embedded ${localResults.length} texts in ${fmtMs(localMs)}`);
      console.log("\n Cosine similarity (HTTP vs local):");

      let compared = 0;
      let failed = 0;
      for (let i = 0; i < SAMPLE_TEXTS.length; i++) {
        const a = httpResults[i]?.embedding;
        const b = localResults[i]?.embedding;
        if (!a || !b) {
          console.log(` [${i}] SKIP — null result`);
          continue;
        }
        compared++;
        if (a.length !== b.length) {
          // cosine() would silently return 0 here; say what actually went wrong.
          failed++;
          console.log(` [${i}] ✗ dimension mismatch (http=${a.length}, local=${b.length})`);
          continue;
        }
        const c = cosine(a, b);
        const ok = c >= PARITY_THRESHOLD;
        if (!ok) failed++;
        console.log(
          ` [${i}] cos=${c.toFixed(6)} ${ok ? "✓" : "✗ (target ≥0.999)"}`,
        );
      }

      // A run in which nothing was comparable must not be reported as a pass.
      if (compared === 0) {
        console.log(" ✗ Parity INCONCLUSIVE — no comparable results");
      } else {
        console.log(failed === 0 ? " ✓ Parity PASS" : " ✗ Parity FAIL");
      }
    } finally {
      // Dispose the local provider even when embedding or comparison throws.
      await local.dispose();
    }
  } catch (err) {
    console.log(` → Local comparison skipped: ${err instanceof Error ? err.message : err}`);
  }
}

/**
 * Step 3: embed N_PERF synthetic chunks in one batch over HTTP and print
 * wall-clock throughput.
 */
async function runPerfBenchmark(provider: OpenAIEmbeddingsProvider): Promise<void> {
  console.log(`\n[3/3] Performance: embedding ${N_PERF} chunks via HTTP...`);
  const texts = Array.from(
    { length: N_PERF },
    (_, i) =>
      `title: doc-${i} | text: This is sample text number ${i} containing words like search, embedding, vector, retrieval, similarity, ranking.`,
  );

  const perfStart = Date.now();
  const perfResults = await provider.embedBatch(texts);
  const perfMs = Date.now() - perfStart;
  const okCount = perfResults.filter((r) => r !== null).length;

  // Rate counts only chunks that actually came back; the 1ms floor avoids a
  // division by zero when the call completes within timer resolution.
  const chunksPerSec = okCount / (Math.max(perfMs, 1) / 1000);
  console.log(
    ` → ${okCount}/${N_PERF} embedded in ${fmtMs(perfMs)} (${chunksPerSec.toFixed(1)} chunks/s)`,
  );
  console.log(
    ` → average per chunk: ${(perfMs / N_PERF).toFixed(2)}ms`,
  );
}

/**
 * Entry point: print the configuration banner, run the healthcheck, then the
 * parity and throughput steps. Exits with code 1 when the healthcheck fails.
 * The HTTP provider is disposed on every path, including failures.
 */
async function main(): Promise<void> {
  console.log(`╭─ qmd embedding live benchmark ──────────────────╮`);
  console.log(`│ endpoint: ${ENDPOINT}`);
  console.log(`│ model: ${MODEL_ID} (upstream=${UPSTREAM_MODEL})`);
  console.log(`│ n_perf: ${N_PERF}`);
  console.log(`│ skip local: ${SKIP_LOCAL ? "YES" : "no"}`);
  console.log(`╰─────────────────────────────────────────────────╯\n`);

  const provider = new OpenAIEmbeddingsProvider({
    endpoint: ENDPOINT,
    modelId: MODEL_ID,
    upstreamModel: UPSTREAM_MODEL,
    timeoutMs: 30_000,
  });

  let healthy = false;
  try {
    // ─── Step 1: healthcheck ──────────────────────────────────────────────
    console.log("[1/3] Healthcheck...");
    const health = await provider.healthcheck();
    console.log(` → ok=${health.ok}, model=${health.model}, dims=${health.dimensions ?? "?"}`);
    console.log(` → detail: ${health.detail ?? "-"}\n`);
    healthy = Boolean(health.ok);

    if (!healthy) {
      console.error("✗ Healthcheck failed; aborting.");
    } else {
      // ─── Step 2: parity (HTTP vs local) ─────────────────────────────────
      await runParityCheck(provider);
      // ─── Step 3: throughput / perf benchmark ────────────────────────────
      await runPerfBenchmark(provider);
      console.log(`\nDone. ✓`);
    }
  } finally {
    await provider.dispose();
  }

  // Exit only after cleanup; process.exit() inside the try would skip `finally`.
  if (!healthy) process.exit(1);
}

main().catch((err) => {
  console.error("Benchmark failed:", err);
  process.exit(1);
});