bench.ts 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. /**
  2. * QMD Benchmark Harness
  3. *
  4. * Runs queries from a fixture file against multiple search backends
  5. * and measures precision@k, recall, MRR, F1, and latency.
  6. *
  7. * Usage:
  8. * qmd bench <fixture.json> [--json] [--collection <name>]
  9. *
  10. * Backends tested:
  11. * - bm25: BM25 keyword search (searchLex)
  12. * - vector: Vector similarity search (searchVector)
  13. * - hybrid: BM25 + vector RRF fusion without reranking
  14. * - full: Full hybrid pipeline with LLM reranking
  15. */
  16. import { readFileSync } from "node:fs";
  17. import { resolve } from "node:path";
  18. import {
  19. createStore,
  20. getDefaultDbPath,
  21. type QMDStore,
  22. type SearchResult,
  23. type HybridQueryResult,
  24. } from "../index.js";
  25. import { scoreResults } from "./score.js";
  26. import type {
  27. BenchmarkFixture,
  28. BenchmarkQuery,
  29. BackendResult,
  30. QueryResult,
  31. BenchmarkResult,
  32. } from "./types.js";
  33. type Backend = {
  34. name: string;
  35. run: (store: QMDStore, query: string, limit: number, collection?: string) => Promise<string[]>;
  36. };
  37. const BACKENDS: Backend[] = [
  38. {
  39. name: "bm25",
  40. run: async (store, query, limit, collection) => {
  41. const results = await store.searchLex(query, { limit, collection });
  42. return results.map((r: SearchResult) => r.filepath);
  43. },
  44. },
  45. {
  46. name: "vector",
  47. run: async (store, query, limit, collection) => {
  48. const results = await store.searchVector(query, { limit, collection });
  49. return results.map((r: SearchResult) => r.filepath);
  50. },
  51. },
  52. {
  53. name: "hybrid",
  54. run: async (store, query, limit, collection) => {
  55. const results = await store.search({ query, limit, collection, rerank: false });
  56. return results.map((r: HybridQueryResult) => r.file);
  57. },
  58. },
  59. {
  60. name: "full",
  61. run: async (store, query, limit, collection) => {
  62. const results = await store.search({ query, limit, collection, rerank: true });
  63. return results.map((r: HybridQueryResult) => r.file);
  64. },
  65. },
  66. ];
  67. async function runQuery(
  68. store: QMDStore,
  69. backend: Backend,
  70. query: BenchmarkQuery,
  71. collection?: string,
  72. ): Promise<BackendResult> {
  73. const limit = Math.max(query.expected_in_top_k, 10);
  74. const start = Date.now();
  75. let resultFiles: string[];
  76. try {
  77. resultFiles = await backend.run(store, query.query, limit, collection);
  78. } catch (err: any) {
  79. // Backend may not be available (e.g., no embeddings for vector search)
  80. return {
  81. precision_at_k: 0,
  82. recall: 0,
  83. mrr: 0,
  84. f1: 0,
  85. hits_at_k: 0,
  86. total_expected: query.expected_files.length,
  87. latency_ms: Date.now() - start,
  88. top_files: [],
  89. };
  90. }
  91. const latency_ms = Date.now() - start;
  92. const scores = scoreResults(resultFiles, query.expected_files, query.expected_in_top_k);
  93. return {
  94. ...scores,
  95. total_expected: query.expected_files.length,
  96. latency_ms,
  97. top_files: resultFiles.slice(0, 10),
  98. };
  99. }
  100. function formatTable(results: QueryResult[]): string {
  101. const lines: string[] = [];
  102. const pad = (s: string, n: number) => s.slice(0, n).padEnd(n);
  103. const num = (n: number) => n.toFixed(2).padStart(5);
  104. lines.push(
  105. `${pad("Query", 25)} ${pad("Backend", 8)} ${pad("P@k", 6)} ${pad("Recall", 7)} ${pad("MRR", 6)} ${pad("F1", 6)} ${pad("ms", 8)}`
  106. );
  107. lines.push("-".repeat(70));
  108. for (const r of results) {
  109. for (const [backend, br] of Object.entries(r.backends)) {
  110. lines.push(
  111. `${pad(r.id, 25)} ${pad(backend, 8)} ${num(br.precision_at_k)} ${num(br.recall)} ${num(br.mrr)} ${num(br.f1)} ${String(Math.round(br.latency_ms)).padStart(7)}ms`
  112. );
  113. }
  114. lines.push("");
  115. }
  116. return lines.join("\n");
  117. }
  118. function computeSummary(results: QueryResult[]): BenchmarkResult["summary"] {
  119. const summary: BenchmarkResult["summary"] = {};
  120. // Collect all backend names
  121. const backendNames = new Set<string>();
  122. for (const r of results) {
  123. for (const name of Object.keys(r.backends)) {
  124. backendNames.add(name);
  125. }
  126. }
  127. for (const name of backendNames) {
  128. let totalP = 0, totalR = 0, totalMrr = 0, totalF1 = 0, totalLat = 0, count = 0;
  129. for (const r of results) {
  130. const br = r.backends[name];
  131. if (!br) continue;
  132. totalP += br.precision_at_k;
  133. totalR += br.recall;
  134. totalMrr += br.mrr;
  135. totalF1 += br.f1;
  136. totalLat += br.latency_ms;
  137. count++;
  138. }
  139. if (count > 0) {
  140. summary[name] = {
  141. avg_precision: totalP / count,
  142. avg_recall: totalR / count,
  143. avg_mrr: totalMrr / count,
  144. avg_f1: totalF1 / count,
  145. avg_latency_ms: totalLat / count,
  146. };
  147. }
  148. }
  149. return summary;
  150. }
  151. export async function runBenchmark(
  152. fixturePath: string,
  153. options: { json?: boolean; collection?: string; backends?: string[] } = {},
  154. ): Promise<BenchmarkResult> {
  155. // Load fixture
  156. const raw = readFileSync(resolve(fixturePath), "utf-8");
  157. const fixture: BenchmarkFixture = JSON.parse(raw);
  158. if (!fixture.queries || !Array.isArray(fixture.queries)) {
  159. throw new Error("Invalid fixture: missing 'queries' array");
  160. }
  161. // Open store
  162. const store = await createStore({ dbPath: getDefaultDbPath() });
  163. // Filter backends if requested
  164. const activeBackends = options.backends
  165. ? BACKENDS.filter(b => options.backends!.includes(b.name))
  166. : BACKENDS;
  167. const collection = options.collection ?? fixture.collection;
  168. // Run queries
  169. const results: QueryResult[] = [];
  170. for (const query of fixture.queries) {
  171. const backends: Record<string, BackendResult> = {};
  172. for (const backend of activeBackends) {
  173. if (!options.json) {
  174. process.stderr.write(` ${query.id} / ${backend.name}...`);
  175. }
  176. backends[backend.name] = await runQuery(store, backend, query, collection);
  177. if (!options.json) {
  178. process.stderr.write(` ${Math.round(backends[backend.name]!.latency_ms)}ms\n`);
  179. }
  180. }
  181. results.push({
  182. id: query.id,
  183. query: query.query,
  184. type: query.type,
  185. backends,
  186. });
  187. }
  188. await store.close();
  189. const summary = computeSummary(results);
  190. const timestamp = new Date().toISOString().replace(/[:.]/g, "").slice(0, 15);
  191. const benchResult: BenchmarkResult = {
  192. timestamp,
  193. fixture: fixturePath,
  194. results,
  195. summary,
  196. };
  197. // Output
  198. if (options.json) {
  199. console.log(JSON.stringify(benchResult, null, 2));
  200. } else {
  201. console.log("\n" + formatTable(results));
  202. console.log("Summary:");
  203. console.log("-".repeat(70));
  204. const pad = (s: string, n: number) => s.slice(0, n).padEnd(n);
  205. const num = (n: number) => n.toFixed(3).padStart(6);
  206. for (const [name, s] of Object.entries(summary)) {
  207. console.log(
  208. ` ${pad(name, 8)} P@k=${num(s.avg_precision)} Recall=${num(s.avg_recall)} MRR=${num(s.avg_mrr)} F1=${num(s.avg_f1)} Avg=${Math.round(s.avg_latency_ms)}ms`
  209. );
  210. }
  211. }
  212. return benchResult;
  213. }