embedding-live-parity.bench.ts 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  /**
   * embedding-live-parity.bench.ts — LIVE benchmark against qmd-embed-worker.
   *
   * NOT a vitest test (the .bench.ts suffix keeps it out of auto-discovery).
   * Run manually with `bun src/test-preload.ts test/embedding-live-parity.bench.ts`
   * or `npx tsx test/embedding-live-parity.bench.ts`.
   *
   * Prerequisite: `QMD_EMBED_ENDPOINT=http://10.0.2.162:8082` (or any reachable
   * qmd-embed-worker / ai.mm.mk endpoint with the embeddinggemma model loaded).
   *
   * What it measures:
   *   1. Healthcheck — confirm the endpoint is up and reports the expected model.
   *   2. Single-text embed parity — embed the same texts via OpenAIEmbeddingsProvider
   *      and LocalLlamaCppProvider, then measure cosine similarity (target ≥0.999).
   *   3. Batch perf — embed 100 texts (override with QMD_BENCH_N) via HTTP and
   *      report throughput.
   *
   * The local llama-cpp comparison is OPTIONAL — set QMD_BENCH_SKIP_LOCAL=1 to skip
   * the parity step. Skipping is only useful on machines without GPU/CPU model build
   * support, such as `code`, where Vulkan compilation fails.
   */
  21. import { OpenAIEmbeddingsProvider } from "../src/embedding/openai.js";
  22. import { LocalLlamaCppProvider } from "../src/embedding/local.js";
  /**
   * Reads an environment variable, trims surrounding whitespace, and falls back
   * to `fallback` when the variable is unset or blank.
   */
  const envOrDefault = (name: string, fallback: string): string =>
    process.env[name]?.trim() || fallback;

  /** Base URL of the embedding HTTP endpoint under test. */
  const ENDPOINT = envOrDefault("QMD_EMBED_ENDPOINT", "http://10.0.2.162:8082");
  /** Model id passed to both providers as `modelId`. */
  const MODEL_ID = envOrDefault("QMD_EMBED_MODEL_ID", "embeddinggemma");
  /** Model name passed to the HTTP provider as `upstreamModel`. */
  const UPSTREAM_MODEL = envOrDefault("QMD_EMBED_UPSTREAM_MODEL", "embeddinggemma:300m");
  /** When true (QMD_BENCH_SKIP_LOCAL=1), the local llama-cpp comparison is skipped. */
  const SKIP_LOCAL = process.env.QMD_BENCH_SKIP_LOCAL === "1";
  /**
   * Parses a benchmark item count from a raw environment value.
   *
   * Uses `Number.parseInt` (base 10), so a leading integer prefix such as
   * "12.7" or "100abc" is still accepted. Anything that does not yield a
   * positive safe integer (unset, non-numeric, zero, negative) falls back to
   * `fallback`, because a zero/NaN count would make the throughput report
   * print NaN/Infinity.
   *
   * @param raw - Raw environment variable value, possibly undefined.
   * @param fallback - Count to use when `raw` is not a positive integer.
   * @returns A positive integer item count.
   */
  function parseBenchCount(raw: string | undefined, fallback: number): number {
    const parsed = Number.parseInt(raw ?? "", 10);
    return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : fallback;
  }

  /** Number of texts embedded in the perf stage (QMD_BENCH_N, default 100). */
  const N_PERF = parseBenchCount(process.env.QMD_BENCH_N, 100);
  /**
   * Cosine similarity between two vectors.
   *
   * @param a - First vector.
   * @param b - Second vector.
   * @returns The cosine of the angle between `a` and `b`; 0 when the lengths
   *   differ. A 1e-12 term is added to the denominator so zero-norm (or
   *   empty) vectors produce 0 instead of dividing by zero.
   */
  function cosine(a: number[], b: number[]): number {
    if (a.length !== b.length) {
      return 0;
    }

    let dotProduct = 0;
    let sumSquaresA = 0;
    let sumSquaresB = 0;
    for (const [idx, x] of a.entries()) {
      const y = b[idx]!;
      dotProduct += x * y;
      sumSquaresA += x * x;
      sumSquaresB += y * y;
    }

    const denominator = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB) + 1e-12;
    return dotProduct / denominator;
  }
  /**
   * Formats a duration for display.
   *
   * @param ms - Duration in milliseconds.
   * @returns Whole milliseconds with an "ms" suffix when `ms` is below 1000,
   *   otherwise seconds with two decimals and an "s" suffix.
   */
  function fmtMs(ms: number): string {
    const isSubSecond = ms < 1000;
    return isSubSecond ? `${ms.toFixed(0)}ms` : `${(ms / 1000).toFixed(2)}s`;
  }
  /**
   * Runs the live benchmark in three stages against the configured endpoint:
   *
   *   1. Healthcheck — aborts with exit code 1 when the endpoint is not healthy.
   *   2. Parity — embeds a few sample texts over HTTP and, unless SKIP_LOCAL is
   *      set, embeds the same texts with the local llama-cpp provider and
   *      prints the per-text cosine similarity (target ≥0.999).
   *   3. Perf — embeds N_PERF generated texts over HTTP and prints throughput.
   *
   * Both providers are disposed in `finally` blocks so they are released even
   * when a stage throws. A failure in the local comparison is logged and the
   * benchmark continues; any other failure rejects the returned promise.
   *
   * @returns A promise that resolves once all stages finished and the HTTP
   *   provider has been disposed.
   */
  async function main(): Promise<void> {
    console.log(`╭─ qmd embedding live benchmark ──────────────────╮`);
    console.log(`│ endpoint: ${ENDPOINT}`);
    console.log(`│ model: ${MODEL_ID} (upstream=${UPSTREAM_MODEL})`);
    console.log(`│ n_perf: ${N_PERF}`);
    console.log(`│ skip local: ${SKIP_LOCAL ? "YES" : "no"}`);
    console.log(`╰─────────────────────────────────────────────────╯\n`);

    const provider = new OpenAIEmbeddingsProvider({
      endpoint: ENDPOINT,
      modelId: MODEL_ID,
      upstreamModel: UPSTREAM_MODEL,
      timeoutMs: 30_000,
    });

    try {
      // ─── Step 1: healthcheck ──────────────────────────────────────────────
      console.log("[1/3] Healthcheck...");
      const health = await provider.healthcheck();
      console.log(` → ok=${health.ok}, model=${health.model}, dims=${health.dimensions ?? "?"}`);
      console.log(` → detail: ${health.detail ?? "-"}\n`);
      if (!health.ok) {
        console.error("✗ Healthcheck failed; aborting.");
        // Set the exit code and return (instead of process.exit) so the
        // `finally` below still disposes the provider before the process ends.
        process.exitCode = 1;
        return;
      }

      // ─── Step 2: parity (HTTP vs local) ───────────────────────────────────
      const sampleTexts = [
        "task: search result | query: hybrid search architecture",
        "title: README | text: QMD is a hybrid search engine combining BM25 and vector embeddings.",
        "title: Configuration | text: The retry schedule for 429 responses is 1s, 4s, 16s with up to 3 attempts.",
      ];
      console.log("[2/3] HTTP embed parity check...");
      const httpStart = Date.now();
      const httpResults = await provider.embedBatch(sampleTexts);
      const httpMs = Date.now() - httpStart;
      console.log(` → HTTP embedded ${httpResults.length} texts in ${fmtMs(httpMs)}`);
      for (let i = 0; i < httpResults.length; i++) {
        const r = httpResults[i];
        console.log(` [${i}] dim=${r?.embedding.length ?? "null"}, model="${r?.model}"`);
      }

      if (SKIP_LOCAL) {
        console.log(" → Local comparison skipped (QMD_BENCH_SKIP_LOCAL=1)");
      } else {
        try {
          console.log("\n → Trying local llama-cpp comparison (may build models on first run)...");
          const local = new LocalLlamaCppProvider({ modelId: MODEL_ID });
          try {
            const localStart = Date.now();
            const localResults = await local.embedBatch(sampleTexts);
            const localMs = Date.now() - localStart;
            console.log(` → LOCAL embedded ${localResults.length} texts in ${fmtMs(localMs)}`);
            console.log("\n Cosine similarity (HTTP vs local):");

            let compared = 0;
            let failed = 0;
            for (let i = 0; i < sampleTexts.length; i++) {
              const a = httpResults[i]?.embedding;
              const b = localResults[i]?.embedding;
              if (!a || !b) {
                console.log(` [${i}] SKIP — null result`);
                continue;
              }
              compared++;
              const c = cosine(a, b);
              const ok = c >= 0.999;
              if (!ok) failed++;
              console.log(
                ` [${i}] cos=${c.toFixed(6)} ${ok ? "✓" : "✗ (target ≥0.999)"}`,
              );
            }

            // A run where every pair was skipped proves nothing about parity,
            // so it must not be reported as a pass.
            if (compared === 0) {
              console.log(" ✗ Parity INCONCLUSIVE — no embedding pairs could be compared");
            } else {
              console.log(failed === 0 ? " ✓ Parity PASS" : " ✗ Parity FAIL");
            }
          } finally {
            // Release the local model even when embedding throws, so it is not
            // still loaded while the HTTP perf stage is being timed.
            await local.dispose();
          }
        } catch (err) {
          // Best-effort stage: a local failure is reported and the run continues.
          console.log(` → Local comparison skipped: ${err instanceof Error ? err.message : String(err)}`);
        }
      }

      // ─── Step 3: throughput / perf benchmark ──────────────────────────────
      console.log(`\n[3/3] Performance: embedding ${N_PERF} chunks via HTTP...`);
      const texts = Array.from(
        { length: N_PERF },
        (_, i) =>
          `title: doc-${i} | text: This is sample text number ${i} containing words like search, embedding, vector, retrieval, similarity, ranking.`,
      );
      const perfStart = Date.now();
      const perfResults = await provider.embedBatch(texts);
      const perfMs = Date.now() - perfStart;
      const okCount = perfResults.filter((r) => r !== null).length;

      // Rates are based on chunks that actually came back embedded, and are
      // reported as "n/a" rather than NaN/Infinity when there is nothing to
      // divide (no successful chunks, or an elapsed time below 1ms).
      const chunksPerSecond =
        okCount > 0 && perfMs > 0 ? (okCount / (perfMs / 1000)).toFixed(1) : "n/a";
      const avgPerChunk = okCount > 0 ? `${(perfMs / okCount).toFixed(2)}ms` : "n/a";
      console.log(
        ` → ${okCount}/${texts.length} embedded in ${fmtMs(perfMs)} (${chunksPerSecond} chunks/s)`,
      );
      console.log(` → average per chunk: ${avgPerChunk}`);
      console.log(`\nDone. ✓`);
    } finally {
      await provider.dispose();
    }
  }
  /**
   * Last-resort handler for a rejected benchmark run: logs the failure reason
   * and terminates the process with exit code 1.
   */
  function reportFatal(reason: unknown): never {
    console.error("Benchmark failed:", reason);
    process.exit(1);
  }

  // Script entry point: start the benchmark and route any rejection to the handler.
  void main().catch(reportFatal);