فهرست منبع

feat(cli): add `qmd bench` command for search quality benchmarks

Adds a benchmark harness that measures search quality across backends.
Given a fixture file with queries and expected results, it runs each
query through BM25, vector, hybrid (no rerank), and full pipeline,
then reports precision@k, recall, MRR, F1, and latency.

This is primarily a regression testing tool — users create fixtures
for their own vaults to catch quality regressions after config or
index changes. Ships with an example fixture against the eval-docs
test collection to demonstrate the format.

New files:
  src/bench/bench.ts       — main runner
  src/bench/score.ts       — precision, recall, MRR, F1, path matching
  src/bench/types.ts       — fixture and result types
  src/bench/fixtures/      — example fixture
  test/bench-score.test.ts — unit tests for scoring (16 tests)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
John R Milinovich 1 ماه پیش
والد
کامیت
b7a5a86a9b
7 فایل‌های تغییر یافته به همراه 612 افزوده شده و 0 حذف شده
  1. 4 0
      CHANGELOG.md
  2. 241 0
      src/bench/bench.ts
  3. 87 0
      src/bench/fixtures/example.json
  4. 76 0
      src/bench/score.ts
  5. 72 0
      src/bench/types.ts
  6. 18 0
      src/cli/qmd.ts
  7. 114 0
      test/bench-score.test.ts

+ 4 - 0
CHANGELOG.md

@@ -14,6 +14,10 @@
 - `qmd status` now shows AST grammar availability.
 - SDK: `chunkStrategy` option on `embed()` and `search()` methods.
 - GitHub Actions workflow to build the Nix flake on Linux and macOS.
+- `qmd bench <fixture.json>` command for search quality benchmarks.
+  Measures precision@k, recall, MRR, and F1 across BM25, vector, hybrid,
+  and full pipeline backends. Ships with an example fixture against
+  the eval-docs test collection.
 
 ### Fixes
 

+ 241 - 0
src/bench/bench.ts

@@ -0,0 +1,241 @@
+/**
+ * QMD Benchmark Harness
+ *
+ * Runs queries from a fixture file against multiple search backends
+ * and measures precision@k, recall, MRR, F1, and latency.
+ *
+ * Usage:
+ *   qmd bench <fixture.json> [--json] [--collection <name>]
+ *
+ * Backends tested:
+ *   - bm25: BM25 keyword search (searchLex)
+ *   - vector: Vector similarity search (searchVector)
+ *   - hybrid: BM25 + vector RRF fusion without reranking
+ *   - full: Full hybrid pipeline with LLM reranking
+ */
+
+import { readFileSync } from "node:fs";
+import { resolve } from "node:path";
+import {
+  createStore,
+  getDefaultDbPath,
+  type QMDStore,
+  type SearchResult,
+  type HybridQueryResult,
+} from "../index.js";
+import { scoreResults } from "./score.js";
+import type {
+  BenchmarkFixture,
+  BenchmarkQuery,
+  BackendResult,
+  QueryResult,
+  BenchmarkResult,
+} from "./types.js";
+
+/** A named search strategy whose `run` resolves to a list of strings, one per result. */
+interface Backend {
+  name: string;
+  run: (store: QMDStore, query: string, limit: number, collection?: string) => Promise<string[]>;
+}
+
+/** Builds a backend around `store.search` with reranking switched on or off. */
+function searchBackend(name: string, rerank: boolean): Backend {
+  return {
+    name,
+    run: async (store, query, limit, collection) => {
+      const hits = await store.search({ query, limit, collection, rerank });
+      return hits.map((hit: HybridQueryResult) => hit.file);
+    },
+  };
+}
+
+/**
+ * Backends exercised by the benchmark, in the order they are run.
+ * Each one reduces its results to the file path of every hit.
+ */
+const BACKENDS: Backend[] = [
+  {
+    name: "bm25",
+    run: async (store, query, limit, collection) => {
+      const hits = await store.searchLex(query, { limit, collection });
+      return hits.map((hit: SearchResult) => hit.filepath);
+    },
+  },
+  {
+    name: "vector",
+    run: async (store, query, limit, collection) => {
+      const hits = await store.searchVector(query, { limit, collection });
+      return hits.map((hit: SearchResult) => hit.filepath);
+    },
+  },
+  searchBackend("hybrid", false),
+  searchBackend("full", true),
+];
+
+/**
+ * Runs one fixture query against one backend and scores the returned paths.
+ *
+ * At least 10 results are requested (more when the query's cutoff is larger),
+ * so recall and MRR can see hits that fall below the precision cutoff.
+ *
+ * A backend that throws is scored as all zeros with an empty `top_files`; the
+ * reason is written to stderr so a broken backend is not mistaken for a
+ * backend that simply found nothing.
+ *
+ * @param store - Open store handed to the backend.
+ * @param backend - Backend to execute.
+ * @param query - Fixture query supplying the text, expected files and cutoff.
+ * @param collection - Optional collection name forwarded to the backend.
+ * @returns Scores, wall-clock latency in milliseconds, and up to 10 returned paths.
+ */
+async function runQuery(
+  store: QMDStore,
+  backend: Backend,
+  query: BenchmarkQuery,
+  collection?: string,
+): Promise<BackendResult> {
+  const limit = Math.max(query.expected_in_top_k, 10);
+  const start = Date.now();
+
+  let resultFiles: string[];
+  try {
+    resultFiles = await backend.run(store, query.query, limit, collection);
+  } catch (err: unknown) {
+    // Backend may not be available (e.g., no embeddings for vector search).
+    // Keep the benchmark going, but say why this cell is all zeros.
+    const reason = err instanceof Error ? err.message : String(err);
+    process.stderr.write(`\n  warning: backend "${backend.name}" failed for query "${query.id}": ${reason}\n`);
+    return {
+      precision_at_k: 0,
+      recall: 0,
+      mrr: 0,
+      f1: 0,
+      hits_at_k: 0,
+      total_expected: query.expected_files.length,
+      latency_ms: Date.now() - start,
+      top_files: [],
+    };
+  }
+
+  // Measure before scoring so scoring time is not counted as search latency.
+  const latency_ms = Date.now() - start;
+  const scores = scoreResults(resultFiles, query.expected_files, query.expected_in_top_k);
+
+  return {
+    ...scores,
+    total_expected: query.expected_files.length,
+    latency_ms,
+    top_files: resultFiles.slice(0, 10),
+  };
+}
+
+/**
+ * Renders per-query, per-backend scores as a fixed-width text table.
+ *
+ * Header and data rows share one column layout (70 characters wide): the query
+ * id and backend name are left-aligned and truncated to their column, the four
+ * scores are right-aligned with two decimals, and latency is right-aligned as
+ * whole milliseconds. A blank line follows each query's group of rows.
+ *
+ * @param results - One entry per fixture query.
+ * @returns The table as a single newline-joined string.
+ */
+function formatTable(results: QueryResult[]): string {
+  const left = (s: string, n: number) => s.slice(0, n).padEnd(n);
+  const right = (s: string, n: number) => s.padStart(n);
+  const score = (n: number) => right(n.toFixed(2), 6);
+
+  // Numeric headings are right-aligned so they sit directly above their values.
+  const header = [
+    left("Query", 25),
+    left("Backend", 8),
+    right("P@k", 6),
+    right("Recall", 6),
+    right("MRR", 6),
+    right("F1", 6),
+    right("ms", 7),
+  ].join(" ");
+
+  const lines: string[] = [header, "-".repeat(header.length)];
+
+  for (const r of results) {
+    for (const [backend, br] of Object.entries(r.backends)) {
+      lines.push(
+        [
+          left(r.id, 25),
+          left(backend, 8),
+          score(br.precision_at_k),
+          score(br.recall),
+          score(br.mrr),
+          score(br.f1),
+          right(`${Math.round(br.latency_ms)}ms`, 7),
+        ].join(" ")
+      );
+    }
+    lines.push("");
+  }
+
+  return lines.join("\n");
+}
+
+/**
+ * Averages every metric per backend across all queries.
+ *
+ * Backends appear in the summary in the order they are first seen; a query
+ * that has no entry for a backend is left out of that backend's averages.
+ */
+function computeSummary(results: QueryResult[]): BenchmarkResult["summary"] {
+  const names = new Set(results.flatMap((entry) => Object.keys(entry.backends)));
+  const summary: BenchmarkResult["summary"] = {};
+
+  for (const name of names) {
+    // Rows for this backend, in query order.
+    const rows = results.flatMap((entry) => {
+      const row = entry.backends[name];
+      return row ? [row] : [];
+    });
+    if (rows.length === 0) continue;
+
+    const mean = (pick: (row: (typeof rows)[number]) => number) =>
+      rows.reduce((sum, row) => sum + pick(row), 0) / rows.length;
+
+    summary[name] = {
+      avg_precision: mean((row) => row.precision_at_k),
+      avg_recall: mean((row) => row.recall),
+      avg_mrr: mean((row) => row.mrr),
+      avg_f1: mean((row) => row.f1),
+      avg_latency_ms: mean((row) => row.latency_ms),
+    };
+  }
+
+  return summary;
+}
+
+/**
+ * Loads a fixture, runs every query through the selected backends, prints the
+ * report, and returns it.
+ *
+ * Progress is written to stderr unless `options.json` is set. The report goes
+ * to stdout, either as JSON (`options.json`) or as a table followed by
+ * per-backend averages.
+ *
+ * @param fixturePath - Path to the fixture JSON file.
+ * @param options - `json` switches to JSON output and silences progress;
+ *   `collection` takes precedence over the fixture's own `collection`;
+ *   `backends` restricts the run to the named backends.
+ * @returns The per-query results and per-backend averages that were printed.
+ * @throws Error if the fixture has no `queries` array, or if
+ *   `options.backends` names a backend that does not exist.
+ */
+export async function runBenchmark(
+  fixturePath: string,
+  options: { json?: boolean; collection?: string; backends?: string[] } = {},
+): Promise<BenchmarkResult> {
+  // Load fixture
+  const raw = readFileSync(resolve(fixturePath), "utf-8");
+  const fixture: BenchmarkFixture = JSON.parse(raw);
+
+  // The parsed value may be null or a primitive, so check it before `.queries`.
+  if (!fixture || !Array.isArray(fixture.queries)) {
+    throw new Error("Invalid fixture: missing 'queries' array");
+  }
+
+  // Filter backends if requested; reject typos instead of benchmarking nothing.
+  const requested = options.backends;
+  if (requested) {
+    const known = BACKENDS.map(b => b.name);
+    const unknown = requested.filter(name => !known.includes(name));
+    if (unknown.length > 0) {
+      throw new Error(
+        `Unknown backend(s): ${unknown.join(", ")} (available: ${known.join(", ")})`
+      );
+    }
+  }
+  const activeBackends = requested
+    ? BACKENDS.filter(b => requested.includes(b.name))
+    : BACKENDS;
+
+  const collection = options.collection ?? fixture.collection;
+
+  // Open store
+  const store = await createStore({ dbPath: getDefaultDbPath() });
+
+  // Run queries
+  const results: QueryResult[] = [];
+  try {
+    for (const query of fixture.queries) {
+      const backends: Record<string, BackendResult> = {};
+
+      for (const backend of activeBackends) {
+        if (!options.json) {
+          process.stderr.write(`  ${query.id} / ${backend.name}...`);
+        }
+        const outcome = await runQuery(store, backend, query, collection);
+        backends[backend.name] = outcome;
+        if (!options.json) {
+          process.stderr.write(` ${Math.round(outcome.latency_ms)}ms\n`);
+        }
+      }
+
+      results.push({
+        id: query.id,
+        query: query.query,
+        type: query.type,
+        backends,
+      });
+    }
+  } finally {
+    // Release the store even when a query aborts the run.
+    await store.close();
+  }
+
+  const summary = computeSummary(results);
+  // Compact UTC stamp, e.g. "20240102T030405". The dashes must be stripped too,
+  // otherwise the 15-character cut lands in the middle of the time and drops seconds.
+  const timestamp = new Date().toISOString().replace(/[-:.]/g, "").slice(0, 15);
+
+  const benchResult: BenchmarkResult = {
+    timestamp,
+    fixture: fixturePath,
+    results,
+    summary,
+  };
+
+  // Output
+  if (options.json) {
+    console.log(JSON.stringify(benchResult, null, 2));
+  } else {
+    console.log("\n" + formatTable(results));
+    console.log("Summary:");
+    console.log("-".repeat(70));
+    const pad = (s: string, n: number) => s.slice(0, n).padEnd(n);
+    const num = (n: number) => n.toFixed(3).padStart(6);
+    for (const [name, s] of Object.entries(summary)) {
+      console.log(
+        `  ${pad(name, 8)} P@k=${num(s.avg_precision)} Recall=${num(s.avg_recall)} MRR=${num(s.avg_mrr)} F1=${num(s.avg_f1)} Avg=${Math.round(s.avg_latency_ms)}ms`
+      );
+    }
+  }
+
+  return benchResult;
+}

+ 87 - 0
src/bench/fixtures/example.json

@@ -0,0 +1,87 @@
+{
+  "description": "Example benchmark fixture for QMD eval-docs. Tests exact keyword, semantic, and cross-domain retrieval across 6 documents.",
+  "version": 1,
+  "collection": "eval-docs",
+  "queries": [
+    {
+      "id": "exact-api",
+      "query": "API versioning",
+      "type": "exact",
+      "description": "Direct keyword match in API design document",
+      "expected_files": ["api-design-principles.md"],
+      "expected_in_top_k": 1
+    },
+    {
+      "id": "exact-fundraising",
+      "query": "Series A fundraising",
+      "type": "exact",
+      "description": "Direct keyword match in fundraising memo",
+      "expected_files": ["startup-fundraising-memo.md"],
+      "expected_in_top_k": 1
+    },
+    {
+      "id": "exact-cap",
+      "query": "CAP theorem",
+      "type": "exact",
+      "description": "Direct keyword match in distributed systems doc",
+      "expected_files": ["distributed-systems-overview.md"],
+      "expected_in_top_k": 1
+    },
+    {
+      "id": "semantic-rest",
+      "query": "how to structure REST endpoints",
+      "type": "semantic",
+      "description": "Conceptual match — no exact keyword overlap with 'API design'",
+      "expected_files": ["api-design-principles.md"],
+      "expected_in_top_k": 3
+    },
+    {
+      "id": "semantic-fundraising",
+      "query": "raising money for startup",
+      "type": "semantic",
+      "description": "Synonym match — 'raising money' should find 'fundraising'",
+      "expected_files": ["startup-fundraising-memo.md"],
+      "expected_in_top_k": 3
+    },
+    {
+      "id": "semantic-overfitting",
+      "query": "how to prevent models from memorizing data",
+      "type": "semantic",
+      "description": "Conceptual match for overfitting in ML primer",
+      "expected_files": ["machine-learning-primer.md"],
+      "expected_in_top_k": 3
+    },
+    {
+      "id": "topical-launch",
+      "query": "what went wrong with the product launch",
+      "type": "topical",
+      "description": "Should find the retrospective document",
+      "expected_files": ["product-launch-retrospective.md"],
+      "expected_in_top_k": 3
+    },
+    {
+      "id": "cross-domain-consistency",
+      "query": "consistency vs availability tradeoffs",
+      "type": "cross-domain",
+      "description": "CAP theorem concept — specific detail in longer document",
+      "expected_files": ["distributed-systems-overview.md"],
+      "expected_in_top_k": 3
+    },
+    {
+      "id": "alias-remote",
+      "query": "working from home guidelines",
+      "type": "alias",
+      "description": "Synonym match — 'working from home' should find 'remote work policy'",
+      "expected_files": ["remote-work-policy.md"],
+      "expected_in_top_k": 3
+    },
+    {
+      "id": "hard-partial",
+      "query": "nouns not verbs",
+      "type": "semantic",
+      "description": "Partial phrase recall — API design principle about resource naming",
+      "expected_files": ["api-design-principles.md"],
+      "expected_in_top_k": 5
+    }
+  ]
+}

+ 76 - 0
src/bench/score.ts

@@ -0,0 +1,76 @@
+/**
+ * Scoring functions for the QMD benchmark harness.
+ *
+ * Computes precision@k, recall, MRR, and F1 for search results
+ * against ground-truth expected files.
+ */
+
+/**
+ * Canonicalizes a file path so paths from different sources can be compared.
+ *
+ * A `qmd://collection/path` URI is reduced to `path` (a URI with no slash
+ * after the scheme keeps everything after the scheme). The result is then
+ * lowercased and stripped of leading and trailing slashes.
+ */
+export function normalizePath(p: string): string {
+  const scheme = "qmd://";
+  let relative = p;
+
+  if (relative.startsWith(scheme)) {
+    const afterScheme = relative.slice(scheme.length);
+    const segments = afterScheme.split("/");
+    // Drop the first segment (the collection) only when something follows it.
+    relative = segments.length > 1 ? segments.slice(1).join("/") : afterScheme;
+  }
+
+  const lowered = relative.toLowerCase();
+  return lowered.replace(/^\/+|\/+$/g, "");
+}
+
+/**
+ * Check if two paths refer to the same file.
+ *
+ * Both paths are normalized, then compared for equality or for one being a
+ * trailing run of whole path segments of the other. The match must start at a
+ * `/` boundary, so `docs/readme.md` matches `readme.md` but `my-readme.md`
+ * does not. A path that normalizes to the empty string never matches.
+ *
+ * @param result - Path as returned by a search backend.
+ * @param expected - Path as written in the fixture.
+ * @returns `true` when both refer to the same file under the rules above.
+ */
+export function pathsMatch(result: string, expected: string): boolean {
+  const nr = normalizePath(result);
+  const ne = normalizePath(expected);
+  // An empty string is a suffix of every path; it must not count as a hit.
+  if (nr === "" || ne === "") return false;
+  if (nr === ne) return true;
+  return nr.endsWith(`/${ne}`) || ne.endsWith(`/${nr}`);
+}
+
+/**
+ * Score a set of search results against expected files.
+ *
+ * - `hits_at_k`: expected files that appear within the first `topK` results.
+ * - `precision_at_k`: `hits_at_k / min(topK, expectedFiles.length)`, i.e. the
+ *   share of the best achievable hit count, not the share of returned results.
+ * - `recall`: expected files found anywhere in the results, over all expected files.
+ * - `mrr`: `1 / rank` of the first result matching any expected file, else 0.
+ * - `f1`: harmonic mean of `precision_at_k` and `recall`.
+ *
+ * Every ratio is 0 when its denominator would be 0.
+ *
+ * @param resultFiles - Result paths in ranked order, best first.
+ * @param expectedFiles - Ground-truth paths.
+ * @param topK - Cutoff for the top-k metrics; floored to an integer, and
+ *   negative or NaN values are treated as 0.
+ */
+export function scoreResults(
+  resultFiles: string[],
+  expectedFiles: string[],
+  topK: number,
+): { precision_at_k: number; recall: number; mrr: number; f1: number; hits_at_k: number } {
+  // A negative cutoff would make slice-style logic count from the end and turn
+  // the precision denominator negative, so clamp to a non-negative integer.
+  const k = Number.isNaN(topK) ? 0 : Math.max(0, Math.floor(topK));
+
+  // 0-based rank of the first result matching each expected file, when found.
+  const foundRanks = expectedFiles
+    .map(expected => resultFiles.findIndex(r => pathsMatch(r, expected)))
+    .filter(rank => rank >= 0);
+
+  const hitsAtK = foundRanks.filter(rank => rank < k).length;
+  const totalHits = foundRanks.length;
+
+  // MRR: reciprocal rank of first relevant result
+  const mrr = foundRanks.length > 0 ? 1 / (Math.min(...foundRanks) + 1) : 0;
+
+  const denominator = Math.min(k, expectedFiles.length);
+  const precision_at_k = denominator > 0 ? hitsAtK / denominator : 0;
+  const recall = expectedFiles.length > 0 ? totalHits / expectedFiles.length : 0;
+  const f1 = precision_at_k + recall > 0
+    ? 2 * (precision_at_k * recall) / (precision_at_k + recall)
+    : 0;
+
+  return { precision_at_k, recall, mrr, f1, hits_at_k: hitsAtK };
+}

+ 72 - 0
src/bench/types.ts

@@ -0,0 +1,72 @@
+/**
+ * Types for the QMD benchmark harness.
+ *
+ * A benchmark fixture defines queries with expected results.
+ * The harness runs each query through multiple search backends
+ * and measures precision@k, recall, MRR, F1, and latency.
+ */
+
+export interface BenchmarkQuery {
+  /** Unique identifier for the query */
+  id: string;
+  /** The search query text */
+  query: string;
+  /** Query difficulty/type label; copied unchanged into each query's result */
+  type: "exact" | "semantic" | "topical" | "cross-domain" | "alias";
+  /** Human-readable description of what this tests */
+  description: string;
+  /** File paths (relative to collection) that should appear in results */
+  expected_files: string[];
+  /** Cutoff k: an expected file counts as a top-k hit only if it is within the first k results */
+  expected_in_top_k: number;
+}
+
+export interface BenchmarkFixture {
+  /** Description of the benchmark */
+  description: string;
+  /** Fixture format version */
+  version: number;
+  /** Optional collection to search within; a collection passed to the runner takes precedence */
+  collection?: string;
+  /** The test queries */
+  queries: BenchmarkQuery[];
+}
+
+export interface BackendResult {
+  /** Expected files found in the top k, divided by min(k, number of expected files) */
+  precision_at_k: number;
+  /** Fraction of expected files found anywhere in results */
+  recall: number;
+  /** Reciprocal rank of first relevant result (1/rank, 0 if not found) */
+  mrr: number;
+  /** Harmonic mean of precision_at_k and recall */
+  f1: number;
+  /** Number of expected files found in top-k */
+  hits_at_k: number;
+  /** Total expected files */
+  total_expected: number;
+  /** Wall-clock latency in milliseconds */
+  latency_ms: number;
+  /** Up to the first 10 result file paths (for inspection) */
+  top_files: string[];
+}
+
+/** Outcome of one fixture query across every backend it was run against. */
+export interface QueryResult {
+  /** Identifier copied from the fixture query */
+  id: string;
+  /** Query text copied from the fixture query */
+  query: string;
+  /** Type label copied from the fixture query */
+  type: string;
+  /** Scores keyed by backend name */
+  backends: Record<string, BackendResult>;
+}
+
+/** Complete report for one benchmark run. */
+export interface BenchmarkResult {
+  /** Compact string derived from the ISO-8601 time at which the run finished */
+  timestamp: string;
+  /** Fixture path exactly as it was given to the runner */
+  fixture: string;
+  /** One entry per fixture query, in fixture order */
+  results: QueryResult[];
+  /** Per-backend averages over all queries, keyed by backend name */
+  summary: Record<string, {
+    avg_precision: number;
+    avg_recall: number;
+    avg_mrr: number;
+    avg_f1: number;
+    avg_latency_ms: number;
+  }>;
+}

+ 18 - 0
src/cli/qmd.ts

@@ -2606,6 +2606,7 @@ function showHelp(): void {
   console.log("  qmd multi-get <pattern>       - Batch fetch via glob or comma-separated list");
   console.log("  qmd skill show/install        - Show or install the packaged QMD skill");
   console.log("  qmd mcp                       - Start the MCP server (stdio transport for AI agents)");
+  console.log("  qmd bench <fixture.json>      - Run search quality benchmarks against a fixture file");
   console.log("");
   console.log("Collections & context:");
   console.log("  qmd collection add/list/remove/rename/show   - Manage indexed folders");
@@ -3063,6 +3064,23 @@ if (isMain) {
       await querySearch(cli.query, cli.opts);
       break;
 
+    case "bench": {
+      const fixturePath = cli.args[0];
+      if (!fixturePath) {
+        console.error("Usage: qmd bench <fixture.json> [--json] [-c collection]");
+        console.error("");
+        console.error("Run search quality benchmarks against a fixture file.");
+        console.error("See src/bench/fixtures/example.json for the fixture format.");
+        process.exit(1);
+      }
+      const { runBenchmark } = await import("../bench/bench.js");
+      await runBenchmark(fixturePath, {
+        json: !!cli.opts.json,
+        collection: cli.opts.collection,
+      });
+      break;
+    }
+
     case "mcp": {
       const sub = cli.args[0]; // stop | status | undefined
 

+ 114 - 0
test/bench-score.test.ts

@@ -0,0 +1,114 @@
+/**
+ * Tests for the benchmark scoring functions.
+ */
+
+import { describe, test, expect } from "vitest";
+import { normalizePath, pathsMatch, scoreResults } from "../src/bench/score.js";
+
+// normalizePath: lowercasing, qmd:// prefix removal (collection segment included),
+// and trimming of leading/trailing slashes.
+describe("normalizePath", () => {
+  test("lowercases path", () => {
+    expect(normalizePath("Resources/Concepts/Context Engineering.md"))
+      .toBe("resources/concepts/context engineering.md");
+  });
+
+  test("strips qmd:// prefix", () => {
+    expect(normalizePath("qmd://collection/docs/readme.md"))
+      .toBe("docs/readme.md");
+  });
+
+  test("strips leading/trailing slashes", () => {
+    expect(normalizePath("/docs/readme.md/")).toBe("docs/readme.md");
+  });
+
+  test("handles plain filename", () => {
+    expect(normalizePath("readme.md")).toBe("readme.md");
+  });
+});
+
+// pathsMatch: equality after normalization, suffix matches in either direction,
+// qmd:// URIs, and a negative case for two different files in the same folder.
+describe("pathsMatch", () => {
+  test("exact match", () => {
+    expect(pathsMatch("docs/readme.md", "docs/readme.md")).toBe(true);
+  });
+
+  test("case-insensitive match", () => {
+    expect(pathsMatch("Docs/README.md", "docs/readme.md")).toBe(true);
+  });
+
+  test("suffix match (result is longer)", () => {
+    expect(pathsMatch("/full/path/docs/readme.md", "docs/readme.md")).toBe(true);
+  });
+
+  test("suffix match (expected is longer)", () => {
+    expect(pathsMatch("readme.md", "docs/readme.md")).toBe(true);
+  });
+
+  test("qmd:// prefix handled", () => {
+    expect(pathsMatch("qmd://col/docs/readme.md", "docs/readme.md")).toBe(true);
+  });
+
+  test("different files don't match", () => {
+    expect(pathsMatch("docs/readme.md", "docs/other.md")).toBe(false);
+  });
+});
+
+// scoreResults: arguments are (ranked result paths, expected paths, top-k cutoff).
+// Covers full hits, no hits, hits below the cutoff, MRR rank, and empty inputs.
+describe("scoreResults", () => {
+  test("perfect score: all expected in top-k", () => {
+    const result = scoreResults(
+      ["a.md", "b.md", "c.md"],
+      ["a.md", "b.md"],
+      2,
+    );
+    expect(result.precision_at_k).toBe(1);
+    expect(result.recall).toBe(1);
+    expect(result.mrr).toBe(1);
+    expect(result.f1).toBe(1);
+    expect(result.hits_at_k).toBe(2);
+  });
+
+  test("zero score: none found", () => {
+    const result = scoreResults(
+      ["x.md", "y.md", "z.md"],
+      ["a.md", "b.md"],
+      2,
+    );
+    expect(result.precision_at_k).toBe(0);
+    expect(result.recall).toBe(0);
+    expect(result.mrr).toBe(0);
+    expect(result.f1).toBe(0);
+    expect(result.hits_at_k).toBe(0);
+  });
+
+  test("partial: found outside top-k", () => {
+    const result = scoreResults(
+      ["x.md", "y.md", "a.md"],
+      ["a.md"],
+      1,
+    );
+    expect(result.precision_at_k).toBe(0); // not in top-1
+    expect(result.recall).toBe(1); // found somewhere
+    expect(result.mrr).toBeCloseTo(1 / 3); // rank 3
+    expect(result.hits_at_k).toBe(0);
+  });
+
+  test("MRR: first relevant at rank 2", () => {
+    const result = scoreResults(
+      ["x.md", "a.md", "b.md"],
+      ["a.md", "b.md"],
+      3,
+    );
+    expect(result.mrr).toBeCloseTo(0.5); // 1/2
+  });
+
+  test("empty results", () => {
+    const result = scoreResults([], ["a.md"], 1);
+    expect(result.precision_at_k).toBe(0);
+    expect(result.recall).toBe(0);
+    expect(result.mrr).toBe(0);
+  });
+
+  // With no expected files every ratio has a zero denominator and is reported as 0.
+  test("empty expected", () => {
+    const result = scoreResults(["a.md"], [], 1);
+    expect(result.precision_at_k).toBe(0);
+    expect(result.recall).toBe(0);
+  });
+});