// eval-bm25.test.ts (5.7 KB)
  1. /**
  2. * BM25-only evaluation tests (unit layer).
  3. *
  4. * This is a fast suite copied from the BM25 block in `models/eval.test.ts`.
  5. */
  6. import { describe, test, expect, beforeAll, afterAll } from "vitest";
  7. import { mkdtempSync, rmSync, readFileSync, readdirSync } from "fs";
  8. import { join, dirname } from "path";
  9. import { tmpdir } from "os";
  10. import type { Database } from "../src/db.js";
  11. import { createHash } from "crypto";
  12. import { fileURLToPath } from "url";
  13. import {
  14. createStore,
  15. searchFTS,
  16. insertDocument,
  17. insertContent,
  18. } from "../src/store";
  // Redirect the index to a throwaway SQLite file so this suite does not use the global index.
  // NOTE(review): the static `../src/store` import above is evaluated before this assignment,
  // so this presumably relies on the store reading INDEX_PATH when createStore() is called
  // rather than at import time — confirm against src/store.
  const tempDir = mkdtempSync(join(tmpdir(), "qmd-eval-unit-"));
  const tempIndexPath = join(tempDir, "eval-unit.sqlite");
  process.env.INDEX_PATH = tempIndexPath;

  // Delete the scratch directory, and anything created inside it, during file-level teardown.
  afterAll(() => rmSync(tempDir, { recursive: true, force: true }));
  /** Difficulty tier assigned to an eval query. */
  type EvalDifficulty = "easy" | "medium" | "hard" | "fusion";

  /** A search query paired with the slug of the document it is expected to retrieve. */
  interface EvalQuery {
    query: string;
    expectedDoc: string;
    difficulty: EvalDifficulty;
  }

  /** Expands `[query, expectedDoc]` pairs into eval entries that all share one difficulty tier. */
  const withDifficulty = (
    difficulty: EvalDifficulty,
    pairs: readonly (readonly [string, string])[]
  ): EvalQuery[] => pairs.map(([query, expectedDoc]) => ({ query, expectedDoc, difficulty }));

  /**
   * Evaluation set: six queries per difficulty tier, listed tier by tier
   * (easy, medium, hard, fusion), each naming the document it should surface.
   */
  const evalQueries: EvalQuery[] = [
    // EASY: Exact keyword matches
    ...withDifficulty("easy", [
      ["API versioning", "api-design"],
      ["Series A fundraising", "fundraising"],
      ["CAP theorem", "distributed-systems"],
      ["overfitting machine learning", "machine-learning"],
      ["remote work VPN", "remote-work"],
      ["Project Phoenix retrospective", "product-launch"],
    ]),

    // MEDIUM: Semantic/conceptual queries
    ...withDifficulty("medium", [
      ["how to structure REST endpoints", "api-design"],
      ["raising money for startup", "fundraising"],
      ["consistency vs availability tradeoffs", "distributed-systems"],
      ["how to prevent models from memorizing data", "machine-learning"],
      ["working from home guidelines", "remote-work"],
      ["what went wrong with the launch", "product-launch"],
    ]),

    // HARD: Vague, partial memory, indirect
    ...withDifficulty("hard", [
      ["nouns not verbs", "api-design"],
      ["Sequoia investor pitch", "fundraising"],
      ["Raft algorithm leader election", "distributed-systems"],
      ["F1 score precision recall", "machine-learning"],
      ["quarterly team gathering travel", "remote-work"],
      ["beta program 47 bugs", "product-launch"],
    ]),

    // FUSION: Multi-signal queries that need both lexical AND semantic matching
    // These should have weak individual scores but strong combined RRF scores
    ...withDifficulty("fusion", [
      ["how much runway before running out of money", "fundraising"],
      ["datacenter replication sync strategy", "distributed-systems"],
      ["splitting data for training and testing", "machine-learning"],
      ["JSON response codes error messages", "api-design"],
      ["video calls camera async messaging", "remote-work"],
      ["CI/CD pipeline testing coverage", "product-launch"],
    ]),
  ];
  /**
   * Reports whether a search result's file path refers to the expected eval document.
   *
   * The check is a case-insensitive substring match, so `expectedDoc` can be a
   * bare slug such as "api-design" while `filepath` carries a directory prefix
   * and a file extension.
   *
   * @param filepath - Path of a search result.
   * @param expectedDoc - Identifying fragment of the document that should have been found.
   * @returns `true` when `filepath` contains `expectedDoc`, ignoring letter case.
   */
  function matchesExpected(filepath: string, expectedDoc: string): boolean {
    // Normalise both sides: lower-casing only the path would make any
    // expectation that contains an upper-case letter impossible to satisfy.
    return filepath.toLowerCase().includes(expectedDoc.toLowerCase());
  }
  /**
   * Computes Hit@K: the fraction of queries whose expected document appears
   * among the first `topK` results returned by `searchFn`.
   *
   * @param queries - Query / expected-document pairs to score; any extra fields are ignored.
   * @param searchFn - Runs one search; only the first `topK` entries of its result are inspected.
   * @param topK - How many leading results count as a hit window.
   * @returns A ratio between 0 and 1; `0` when `queries` is empty.
   */
  function calcHitRate(
    queries: readonly { query: string; expectedDoc: string }[],
    searchFn: (query: string) => { filepath: string }[],
    topK: number
  ): number {
    // Without this guard an empty set evaluates 0 / 0 and yields NaN, which
    // fails every threshold comparison with an unhelpful message.
    if (queries.length === 0) return 0;

    // Each query is searched exactly once, in order; a query counts as a hit
    // when any result inside the top-K window matches its expected document.
    const hits = queries.filter(({ query, expectedDoc }) =>
      searchFn(query)
        .slice(0, topK)
        .some((result) => matchesExpected(result.filepath, expectedDoc))
    ).length;

    return hits / queries.length;
  }
  describe("BM25 Search (FTS)", () => {
    // Third argument passed to every searchFTS call below — presumably the maximum
    // number of results to return (confirm against src/store). The largest top-K
    // window asserted on in this suite is 5.
    const FTS_RESULT_LIMIT = 5;

    // Stays undefined until beforeAll has created the store, so teardown can
    // tell whether there is anything to close.
    let store: ReturnType<typeof createStore> | undefined;
    let db: Database;

    /** Runs a single query through FTS against the indexed eval documents. */
    const search = (query: string) => searchFTS(db, query, FTS_RESULT_LIMIT);

    /** Hit@K over only the eval queries of one difficulty tier. */
    const hitRateFor = (
      difficulty: (typeof evalQueries)[number]["difficulty"],
      topK: number
    ): number =>
      calcHitRate(
        evalQueries.filter((q) => q.difficulty === difficulty),
        search,
        topK
      );

    beforeAll(() => {
      const created = createStore();
      store = created;
      db = created.db;

      // Load and index eval documents: every Markdown file in the sibling
      // "eval-docs" directory is inserted as content plus a document row.
      const evalDocsDir = join(dirname(fileURLToPath(import.meta.url)), "eval-docs");
      const files = readdirSync(evalDocsDir).filter((f) => f.endsWith(".md"));

      for (const file of files) {
        const content = readFileSync(join(evalDocsDir, file), "utf-8");
        // Title is the first line with a leading "#" marker stripped; fall back
        // to the file name when that leaves nothing.
        const title = content.split("\n")[0]?.replace(/^#\s*/, "") || file;
        // First 12 hex characters of the SHA-256 digest of the content.
        const hash = createHash("sha256").update(content).digest("hex").slice(0, 12);
        const now = new Date().toISOString();

        insertContent(db, hash, content, now);
        insertDocument(db, "eval-docs", file, title, hash, now, now);
      }
    });

    afterAll(() => {
      // If setup failed before the store existed, an unconditional close() would
      // throw a TypeError here and obscure the original setup error.
      store?.close();
    });

    test("easy queries: ≥80% Hit@3", () => {
      expect(hitRateFor("easy", 3)).toBeGreaterThanOrEqual(0.8);
    });

    test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
      expect(hitRateFor("medium", 3)).toBeGreaterThanOrEqual(0.15);
    });

    test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
      expect(hitRateFor("hard", 5)).toBeGreaterThanOrEqual(0.15);
    });

    test("overall Hit@3 ≥40% (BM25 baseline)", () => {
      expect(calcHitRate(evalQueries, search, 3)).toBeGreaterThanOrEqual(0.4);
    });
  });