/**
 * QMD Benchmark Harness
 *
 * Runs queries from a fixture file against multiple search backends
 * and measures precision@k, recall, MRR, F1, and latency.
 *
 * Usage:
 *   qmd bench <fixture.json> [--json] [--collection <name>]
 *
 * Backends tested:
 *   - bm25:   BM25 keyword search (searchLex)
 *   - vector: Vector similarity search (searchVector)
 *   - hybrid: BM25 + vector RRF fusion without reranking
 *   - full:   Full hybrid pipeline with LLM reranking
 */

import { readFileSync } from "node:fs";
import { resolve } from "node:path";
import {
  createStore,
  getDefaultDbPath,
  type QMDStore,
  type SearchResult,
  type HybridQueryResult,
} from "../index.js";
import { scoreResults } from "./score.js";
import type {
  BenchmarkFixture,
  BenchmarkQuery,
  BackendResult,
  QueryResult,
  BenchmarkResult,
} from "./types.js";

/** A named search strategy that resolves a query to a ranked list of file paths. */
type Backend = {
  name: string;
  run: (store: QMDStore, query: string, limit: number, collection?: string) => Promise<string[]>;
};

/** Every query requests at least this many results, even when `expected_in_top_k` is smaller. */
const MIN_RESULT_LIMIT = 10;

/** Number of returned file paths kept in each backend result as `top_files`. */
const TOP_FILES_REPORTED = 10;

/** Width of the horizontal rules printed under the table header and the summary title. */
const RULE_WIDTH = 70;

/** All backends the harness knows about, in the order they are executed. */
const BACKENDS: Backend[] = [
  {
    name: "bm25",
    run: async (store, query, limit, collection) => {
      const results = await store.searchLex(query, { limit, collection });
      return results.map((r: SearchResult) => r.filepath);
    },
  },
  {
    name: "vector",
    run: async (store, query, limit, collection) => {
      const results = await store.searchVector(query, { limit, collection });
      return results.map((r: SearchResult) => r.filepath);
    },
  },
  {
    name: "hybrid",
    run: async (store, query, limit, collection) => {
      const results = await store.search({ query, limit, collection, rerank: false });
      return results.map((r: HybridQueryResult) => r.file);
    },
  },
  {
    name: "full",
    run: async (store, query, limit, collection) => {
      const results = await store.search({ query, limit, collection, rerank: true });
      return results.map((r: HybridQueryResult) => r.file);
    },
  },
];

/** Truncates `s` to at most `n` characters, then right-pads it with spaces to exactly `n`. */
function padCell(s: string, n: number): string {
  return s.slice(0, n).padEnd(n);
}

/**
 * Runs one fixture query against one backend and scores the returned files.
 *
 * @param store - Open store the backend searches.
 * @param backend - Backend to execute.
 * @param query - Fixture query supplying the text, expected files and k.
 * @param collection - Optional collection passed through to the backend.
 * @returns Scores from `scoreResults` plus latency and the first returned files.
 *   If the backend throws, an all-zero result is returned (with the elapsed
 *   time and an empty `top_files`) instead of propagating the error.
 */
async function runQuery(
  store: QMDStore,
  backend: Backend,
  query: BenchmarkQuery,
  collection?: string,
): Promise<BackendResult> {
  const limit = Math.max(query.expected_in_top_k, MIN_RESULT_LIMIT);
  const start = Date.now();

  let resultFiles: string[];
  try {
    resultFiles = await backend.run(store, query.query, limit, collection);
  } catch {
    // Deliberate best-effort: a backend may not be available (e.g., no
    // embeddings for vector search), so the failure is recorded as zero scores
    // and the remaining backends still run.
    return {
      precision_at_k: 0,
      recall: 0,
      mrr: 0,
      f1: 0,
      hits_at_k: 0,
      total_expected: query.expected_files.length,
      latency_ms: Date.now() - start,
      top_files: [],
    };
  }

  // Latency covers the backend call only, not the scoring below.
  const latency_ms = Date.now() - start;
  const scores = scoreResults(resultFiles, query.expected_files, query.expected_in_top_k);

  return {
    ...scores,
    total_expected: query.expected_files.length,
    latency_ms,
    top_files: resultFiles.slice(0, TOP_FILES_REPORTED),
  };
}

/**
 * Renders per-query, per-backend metrics as a fixed-width text table.
 *
 * @param results - Query results to render; each query's rows are followed by a blank line.
 * @returns The table as a single newline-joined string.
 */
function formatTable(results: QueryResult[]): string {
  const lines: string[] = [];
  const num = (n: number) => n.toFixed(2).padStart(5);

  lines.push(
    `${padCell("Query", 25)} ${padCell("Backend", 8)} ${padCell("P@k", 6)} ${padCell("Recall", 7)} ${padCell("MRR", 6)} ${padCell("F1", 6)} ${padCell("ms", 8)}`
  );
  lines.push("-".repeat(RULE_WIDTH));

  for (const r of results) {
    for (const [backend, br] of Object.entries(r.backends)) {
      lines.push(
        `${padCell(r.id, 25)} ${padCell(backend, 8)} ${num(br.precision_at_k)} ${num(br.recall)} ${num(br.mrr)} ${num(br.f1)} ${String(Math.round(br.latency_ms)).padStart(7)}ms`
      );
    }
    lines.push("");
  }

  return lines.join("\n");
}

/**
 * Averages each metric per backend across all queries.
 *
 * A backend's averages are taken over the queries that actually contain a
 * result for it; backends with no results at all get no summary entry.
 *
 * @param results - Per-query results to aggregate.
 * @returns Map from backend name to its average precision, recall, MRR, F1 and latency.
 */
function computeSummary(results: QueryResult[]): BenchmarkResult["summary"] {
  const summary: BenchmarkResult["summary"] = {};

  // Collect all backend names
  const backendNames = new Set<string>();
  for (const r of results) {
    for (const name of Object.keys(r.backends)) {
      backendNames.add(name);
    }
  }

  for (const name of backendNames) {
    let totalP = 0;
    let totalR = 0;
    let totalMrr = 0;
    let totalF1 = 0;
    let totalLat = 0;
    let count = 0;

    for (const r of results) {
      const br = r.backends[name];
      if (!br) continue;
      totalP += br.precision_at_k;
      totalR += br.recall;
      totalMrr += br.mrr;
      totalF1 += br.f1;
      totalLat += br.latency_ms;
      count++;
    }

    if (count > 0) {
      summary[name] = {
        avg_precision: totalP / count,
        avg_recall: totalR / count,
        avg_mrr: totalMrr / count,
        avg_f1: totalF1 / count,
        avg_latency_ms: totalLat / count,
      };
    }
  }

  return summary;
}

/**
 * Loads a benchmark fixture, runs every query against the selected backends,
 * prints the results to stdout, and returns them.
 *
 * @param fixturePath - Path to the fixture JSON file (resolved against the current working directory).
 * @param options - `json`: print the result as JSON and suppress stderr progress output;
 *   `collection`: overrides the fixture's own `collection`;
 *   `backends`: restricts the run to the backends with these names (names that
 *   match no known backend are ignored).
 * @returns The full benchmark result (timestamp, fixture path, per-query results, summary).
 * @throws Error if the fixture has no `queries` array. Errors from reading or
 *   parsing the fixture file, or from opening the store, propagate unchanged.
 */
export async function runBenchmark(
  fixturePath: string,
  options: { json?: boolean; collection?: string; backends?: string[] } = {},
): Promise<BenchmarkResult> {
  // Load fixture
  const raw = readFileSync(resolve(fixturePath), "utf-8");
  const fixture: BenchmarkFixture | null = JSON.parse(raw);

  // `null` is valid JSON, so guard it explicitly rather than crashing on `.queries`.
  if (!fixture || !Array.isArray(fixture.queries)) {
    throw new Error("Invalid fixture: missing 'queries' array");
  }

  // Filter backends if requested
  const requested = options.backends;
  const activeBackends = requested
    ? BACKENDS.filter(b => requested.includes(b.name))
    : BACKENDS;

  const collection = options.collection ?? fixture.collection;

  // Open store
  const store = await createStore({ dbPath: getDefaultDbPath() });

  // Run queries
  const results: QueryResult[] = [];
  try {
    for (const query of fixture.queries) {
      const backends: Record<string, BackendResult> = {};

      for (const backend of activeBackends) {
        if (!options.json) {
          process.stderr.write(` ${query.id} / ${backend.name}...`);
        }

        const backendResult = await runQuery(store, backend, query, collection);
        backends[backend.name] = backendResult;

        if (!options.json) {
          process.stderr.write(` ${Math.round(backendResult.latency_ms)}ms\n`);
        }
      }

      results.push({
        id: query.id,
        query: query.query,
        type: query.type,
        backends,
      });
    }
  } finally {
    // Release the store even if a query or a progress write throws.
    await store.close();
  }

  const summary = computeSummary(results);
  // NOTE(review): this yields e.g. "2024-01-15T1030" — the dashes are kept and
  // the seconds are cut off by the 15-character slice. Confirm whether the
  // compact "YYYYMMDDTHHMMSS" form was intended before changing the format.
  const timestamp = new Date().toISOString().replace(/[:.]/g, "").slice(0, 15);

  const benchResult: BenchmarkResult = {
    timestamp,
    fixture: fixturePath,
    results,
    summary,
  };

  // Output
  if (options.json) {
    console.log(JSON.stringify(benchResult, null, 2));
  } else {
    console.log("\n" + formatTable(results));
    console.log("Summary:");
    console.log("-".repeat(RULE_WIDTH));
    const num = (n: number) => n.toFixed(3).padStart(6);
    for (const [name, s] of Object.entries(summary)) {
      console.log(
        ` ${padCell(name, 8)} P@k=${num(s.avg_precision)} Recall=${num(s.avg_recall)} MRR=${num(s.avg_mrr)} F1=${num(s.avg_f1)} Avg=${Math.round(s.avg_latency_ms)}ms`
      );
    }
  }

  return benchResult;
}