/**
 * embedding-live-parity.bench.ts - LIVE benchmark vs qmd-embed-worker.
 *
 * NOT a vitest test (uses .bench.ts suffix to skip auto-discovery).
 * Run manually with `bun src/test-preload.ts test/embedding-live-parity.bench.ts`
 * or `npx tsx test/embedding-live-parity.bench.ts`.
 *
 * Pre-req: `QMD_EMBED_ENDPOINT=http://10.0.2.162:8082` (or any reachable
 * qmd-embed-worker / ai.mm.mk endpoint with the embeddinggemma model loaded).
 *
 * What it measures:
 *   1. Healthcheck — confirm endpoint is up + reports expected model
 *   2. Single-text embed parity — same text via OpenAIEmbeddingsProvider
 *      vs LocalLlamaCppProvider, measure cosine similarity (target ≥0.999)
 *   3. Batch perf — embed 100 texts via HTTP and report throughput
 *
 * Local llama-cpp is OPTIONAL — set QMD_BENCH_SKIP_LOCAL=1 to skip the parity
 * comparison (useful on machines that cannot build the local model backend,
 * such as `code`, where Vulkan compilation fails).
 */

import { OpenAIEmbeddingsProvider } from "../src/embedding/openai.js";
import { LocalLlamaCppProvider } from "../src/embedding/local.js";

/**
 * Parses a base-10 integer from an optional string and returns `fallback`
 * when the value is missing, empty, not numeric, or not strictly positive.
 *
 * @param raw - Raw text to parse (for example an environment variable).
 * @param fallback - Value returned when `raw` is not a positive integer.
 * @returns The parsed positive integer, or `fallback`.
 */
function parsePositiveInt(raw: string | undefined, fallback: number): number {
  const parsed = Number.parseInt(raw ?? "", 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
}

// Base URL of the embedding endpoint; the default is used when
// QMD_EMBED_ENDPOINT is unset or blank after trimming.
const ENDPOINT =
  process.env.QMD_EMBED_ENDPOINT?.trim() || "http://10.0.2.162:8082";
// Model id handed to both the HTTP and the local provider.
const MODEL_ID = process.env.QMD_EMBED_MODEL_ID?.trim() || "embeddinggemma";
// Upstream model name handed to the HTTP provider only.
const UPSTREAM_MODEL =
  process.env.QMD_EMBED_UPSTREAM_MODEL?.trim() || "embeddinggemma:300m";
// The local comparison is skipped only when the variable is exactly "1".
const SKIP_LOCAL = process.env.QMD_BENCH_SKIP_LOCAL === "1";
// Number of texts embedded in the perf stage. An empty, non-numeric, zero or
// negative QMD_BENCH_N falls back to 100 instead of producing NaN or an
// empty batch.
const N_PERF = parsePositiveInt(process.env.QMD_BENCH_N, 100);

/**
 * Cosine similarity between two vectors.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns The dot product divided by the product of the Euclidean norms
 *   (plus 1e-12 so a zero vector does not divide by zero), or 0 when the
 *   vectors differ in length.
 */
function cosine(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    return 0;
  }

  let dotProduct = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  for (const [idx, x] of a.entries()) {
    const y = b[idx]!;
    dotProduct += x * y;
    sumSqA += x * x;
    sumSqB += y * y;
  }

  const denominator = Math.sqrt(sumSqA) * Math.sqrt(sumSqB) + 1e-12;
  return dotProduct / denominator;
}

/**
 * Formats a duration for display.
 *
 * @param ms - Duration in milliseconds.
 * @returns Whole milliseconds with an `ms` suffix when below one second,
 *   otherwise seconds with two decimals and an `s` suffix.
 */
function fmtMs(ms: number): string {
  return ms < 1000 ? `${ms.toFixed(0)}ms` : `${(ms / 1000).toFixed(2)}s`;
}

/**
 * Runs the three benchmark stages against the configured HTTP endpoint and
 * reports progress on stdout:
 *
 *   1. Healthcheck — exits the process with status 1 when it reports not ok.
 *   2. Parity — embeds a few sample texts over HTTP and, unless SKIP_LOCAL is
 *      set, with the local provider, then prints the cosine similarity of
 *      each pair against the 0.999 target.
 *   3. Throughput — embeds N_PERF generated texts over HTTP and prints rate
 *      and per-chunk average.
 *
 * An error in the optional local comparison is logged and does not abort
 * the run. Any other error propagates to the caller.
 */
async function main() {
  console.log(`╭─ qmd embedding live benchmark ──────────────────╮`);
  console.log(`│ endpoint: ${ENDPOINT}`);
  console.log(`│ model:    ${MODEL_ID} (upstream=${UPSTREAM_MODEL})`);
  console.log(`│ n_perf:   ${N_PERF}`);
  console.log(`│ skip local: ${SKIP_LOCAL ? "YES" : "no"}`);
  console.log(`╰─────────────────────────────────────────────────╯\n`);

  // ─── Step 1: healthcheck ────────────────────────────────────────────────
  const provider = new OpenAIEmbeddingsProvider({
    endpoint: ENDPOINT,
    modelId: MODEL_ID,
    upstreamModel: UPSTREAM_MODEL,
    timeoutMs: 30_000,
  });

  console.log("[1/3] Healthcheck...");
  const health = await provider.healthcheck();
  console.log(`  → ok=${health.ok}, model=${health.model}, dims=${health.dimensions ?? "?"}`);
  console.log(`  → detail: ${health.detail ?? "-"}\n`);
  if (!health.ok) {
    console.error("✗ Healthcheck failed; aborting.");
    process.exit(1);
  }

  // ─── Step 2: parity (HTTP vs local) ─────────────────────────────────────
  const sampleTexts = [
    "task: search result | query: hybrid search architecture",
    "title: README | text: QMD is a hybrid search engine combining BM25 and vector embeddings.",
    "title: Configuration | text: The retry schedule for 429 responses is 1s, 4s, 16s with up to 3 attempts.",
  ];

  console.log("[2/3] HTTP embed parity check...");
  const httpStart = Date.now();
  const httpResults = await provider.embedBatch(sampleTexts);
  const httpMs = Date.now() - httpStart;
  console.log(`  → HTTP embedded ${httpResults.length} texts in ${fmtMs(httpMs)}`);
  for (let i = 0; i < httpResults.length; i++) {
    const r = httpResults[i];
    console.log(`    [${i}] dim=${r?.embedding.length ?? "null"}, model="${r?.model}"`);
  }

  if (!SKIP_LOCAL) {
    try {
      console.log("\n  → Trying local llama-cpp comparison (may build models on first run)...");
      const local = new LocalLlamaCppProvider({ modelId: MODEL_ID });
      try {
        const localStart = Date.now();
        const localResults = await local.embedBatch(sampleTexts);
        const localMs = Date.now() - localStart;
        console.log(`  → LOCAL embedded ${localResults.length} texts in ${fmtMs(localMs)}`);

        console.log("\n  Cosine similarity (HTTP vs local):");
        let allPass = true;
        // Counts pairs that were really compared, so a run where every pair
        // was skipped is not reported as a pass.
        let compared = 0;
        for (let i = 0; i < sampleTexts.length; i++) {
          const a = httpResults[i]?.embedding;
          const b = localResults[i]?.embedding;
          if (!a || !b) {
            console.log(`    [${i}] SKIP — null result`);
            continue;
          }
          compared++;
          const c = cosine(a, b);
          const ok = c >= 0.999;
          if (!ok) allPass = false;
          console.log(
            `    [${i}] cos=${c.toFixed(6)} ${ok ? "✓" : "✗ (target ≥0.999)"}`,
          );
        }
        if (compared === 0) {
          console.log("  ✗ Parity INCONCLUSIVE — no pairs could be compared");
        } else {
          console.log(allPass ? "  ✓ Parity PASS" : "  ✗ Parity FAIL");
        }
      } finally {
        // Dispose the local provider even when embedding or comparison
        // throws; the run continues to step 3 afterwards.
        await local.dispose();
      }
    } catch (err) {
      console.log(`  → Local comparison skipped: ${err instanceof Error ? err.message : err}`);
    }
  } else {
    console.log("  → Local comparison skipped (QMD_BENCH_SKIP_LOCAL=1)");
  }

  // ─── Step 3: throughput / perf benchmark ────────────────────────────────
  console.log(`\n[3/3] Performance: embedding ${N_PERF} chunks via HTTP...`);
  const texts: string[] = [];
  for (let i = 0; i < N_PERF; i++) {
    texts.push(
      `title: doc-${i} | text: This is sample text number ${i} containing words like search, embedding, vector, retrieval, similarity, ranking.`,
    );
  }
  const perfStart = Date.now();
  const perfResults = await provider.embedBatch(texts);
  const perfMs = Date.now() - perfStart;
  const okCount = perfResults.filter((r) => r !== null).length;

  // Rate and average use the successfully embedded count, so null results do
  // not inflate the figures. A zero divisor prints "n/a" rather than
  // Infinity or NaN.
  const perfSeconds = perfMs / 1000;
  const rate = perfSeconds > 0 ? (okCount / perfSeconds).toFixed(1) : "n/a";
  const avgPerChunk = okCount > 0 ? `${(perfMs / okCount).toFixed(2)}ms` : "n/a";
  console.log(
    `  → ${okCount}/${N_PERF} embedded in ${fmtMs(perfMs)} (${rate} chunks/s)`,
  );
  console.log(
    `  → average per chunk: ${avgPerChunk}`,
  );
  console.log(`\nDone. ✓`);

  await provider.dispose();
}

// Script entry point: run the benchmark; a rejection escaping main() is
// printed to stderr and the process exits with status 1.
main().catch((failure) => {
  console.error("Benchmark failed:", failure);
  process.exit(1);
});