eval.test.ts 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. /**
  2. * Evaluation Tests for QMD Search Quality
  3. *
  4. * Tests search quality against synthetic documents with known-answer queries.
  5. * Validates that search improvements don't regress quality.
  6. *
  7. * Three test suites:
  8. * 1. BM25 (FTS) - lexical search baseline
  9. * 2. Vector Search - semantic search with embeddings
  10. * 3. Hybrid (RRF) - combined lexical + vector with rank fusion
  11. */
  12. import { describe, test, expect, beforeAll, afterAll } from "vitest";
  13. import { mkdtempSync, rmSync, readFileSync, readdirSync } from "fs";
  14. import { join } from "path";
  15. import { tmpdir } from "os";
  16. import { openDatabase } from "../src/db.js";
  17. import type { Database } from "../src/db.js";
  18. import { createHash } from "crypto";
  19. import { fileURLToPath } from "url";
  20. import { dirname } from "path";
  // Create a throwaway directory for this run and point INDEX_PATH at a database
  // file inside it, so the eval is aimed at its own index rather than the global one.
  // NOTE(review): static `import` declarations are hoisted, so "../src/store" is
  // evaluated before this assignment executes. The override therefore only takes
  // effect if the store reads INDEX_PATH lazily (at call time) — confirm.
  const tempDir = mkdtempSync(join(tmpdir(), "qmd-eval-"));
  const evalIndexFile = join(tempDir, "eval.sqlite");
  process.env.INDEX_PATH = evalIndexFile;
  24. import {
  25. createStore,
  26. searchFTS,
  27. searchVec,
  28. insertDocument,
  29. insertContent,
  30. insertEmbedding,
  31. chunkDocumentByTokens,
  32. reciprocalRankFusion,
  33. DEFAULT_EMBED_MODEL,
  34. type RankedResult,
  35. } from "../src/store";
  36. import { getDefaultLlamaCpp, formatDocForEmbedding, disposeDefaultLlamaCpp } from "../src/llm";
  /** Difficulty tier of a known-answer query. */
  type EvalDifficulty = "easy" | "medium" | "hard" | "fusion";

  /** A query paired with the document fragment expected to appear in its results. */
  interface EvalQuery {
    query: string;
    expectedDoc: string;
    difficulty: EvalDifficulty;
  }

  /** Expands `[query, expectedDoc]` pairs into eval entries tagged with one tier. */
  const tagged = (difficulty: EvalDifficulty, pairs: [string, string][]): EvalQuery[] =>
    pairs.map(([query, expectedDoc]) => ({ query, expectedDoc, difficulty }));

  /**
   * Known-answer eval set: six queries per tier, listed tier by tier
   * (easy, medium, hard, fusion).
   */
  const evalQueries: EvalQuery[] = [
    // EASY: Exact keyword matches
    ...tagged("easy", [
      ["API versioning", "api-design"],
      ["Series A fundraising", "fundraising"],
      ["CAP theorem", "distributed-systems"],
      ["overfitting machine learning", "machine-learning"],
      ["remote work VPN", "remote-work"],
      ["Project Phoenix retrospective", "product-launch"],
    ]),
    // MEDIUM: Semantic/conceptual queries
    ...tagged("medium", [
      ["how to structure REST endpoints", "api-design"],
      ["raising money for startup", "fundraising"],
      ["consistency vs availability tradeoffs", "distributed-systems"],
      ["how to prevent models from memorizing data", "machine-learning"],
      ["working from home guidelines", "remote-work"],
      ["what went wrong with the launch", "product-launch"],
    ]),
    // HARD: Vague, partial memory, indirect
    ...tagged("hard", [
      ["nouns not verbs", "api-design"],
      ["Sequoia investor pitch", "fundraising"],
      ["Raft algorithm leader election", "distributed-systems"],
      ["F1 score precision recall", "machine-learning"],
      ["quarterly team gathering travel", "remote-work"],
      ["beta program 47 bugs", "product-launch"],
    ]),
    // FUSION: Multi-signal queries that need both lexical AND semantic matching.
    // These should have weak individual scores but strong combined RRF scores.
    ...tagged("fusion", [
      ["how much runway before running out of money", "fundraising"],
      ["datacenter replication sync strategy", "distributed-systems"],
      ["splitting data for training and testing", "machine-learning"],
      ["JSON response codes error messages", "api-design"],
      ["video calls camera async messaging", "remote-work"],
      ["CI/CD pipeline testing coverage", "product-launch"],
    ]),
  ];
  /**
   * Reports whether a search result's path refers to the expected eval document.
   *
   * The check is a case-insensitive substring match, so `expectedDoc` may be any
   * fragment of the path (e.g. "api-design" matches "eval-docs/API-Design.md").
   *
   * @param filepath - Path of a search result.
   * @param expectedDoc - Fragment identifying the expected document.
   * @returns `true` when `filepath` contains `expectedDoc`, ignoring case.
   */
  function matchesExpected(filepath: string, expectedDoc: string): boolean {
    // Fold both sides so an expectation written with uppercase letters still matches.
    return filepath.toLowerCase().includes(expectedDoc.toLowerCase());
  }
  /**
   * Computes the fraction of queries whose expected document appears among the
   * first `topK` results returned by `searchFn`.
   *
   * @param queries - Known-answer queries to evaluate.
   * @param searchFn - Synchronous search; its results are assumed to be ordered best-first.
   * @param topK - Number of leading results inspected per query.
   * @returns Hit rate between 0 and 1; 0 when `queries` is empty.
   */
  function calcHitRate(
    queries: readonly { query: string; expectedDoc: string }[],
    searchFn: (query: string) => { filepath: string }[],
    topK: number
  ): number {
    // An empty set would otherwise evaluate 0 / 0 = NaN; report "no hits" explicitly.
    if (queries.length === 0) return 0;

    const hits = queries.filter(({ query, expectedDoc }) =>
      searchFn(query)
        .slice(0, topK)
        .some(r => matchesExpected(r.filepath, expectedDoc))
    ).length;

    return hits / queries.length;
  }
  90. // =============================================================================
  91. // BM25 (Lexical) Tests - Fast, no model loading needed
  92. // =============================================================================
  describe("BM25 Search (FTS)", () => {
    let store: ReturnType<typeof createStore>;
    let db: Database;

    // Lexical search capped at 5 results, in the shape calcHitRate expects.
    const ftsTop5 = (q: string) => searchFTS(db, q, 5);

    // Subset of the eval set belonging to one difficulty tier.
    const tier = (difficulty: (typeof evalQueries)[number]["difficulty"]) =>
      evalQueries.filter(q => q.difficulty === difficulty);

    beforeAll(() => {
      store = createStore();
      db = store.db;

      // Index every markdown file found in the eval-docs folder next to this test file.
      const here = dirname(fileURLToPath(import.meta.url));
      const evalDocsDir = join(here, "eval-docs");

      readdirSync(evalDocsDir)
        .filter(name => name.endsWith(".md"))
        .forEach(name => {
          const body = readFileSync(join(evalDocsDir, name), "utf-8");
          // Title is the first line minus a leading "#"; fall back to the file name when it is empty.
          const title = body.split("\n")[0]?.replace(/^#\s*/, "") || name;
          // The first 12 hex characters of the SHA-256 digest are used as the content hash.
          const hash = createHash("sha256").update(body).digest("hex").slice(0, 12);
          const timestamp = new Date().toISOString();

          insertContent(db, hash, body, timestamp);
          insertDocument(db, "eval-docs", name, title, hash, timestamp, timestamp);
        });
    });

    afterAll(() => {
      store.close();
    });

    test("easy queries: ≥80% Hit@3", () => {
      expect(calcHitRate(tier("easy"), ftsTop5, 3)).toBeGreaterThanOrEqual(0.8);
    });

    test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
      expect(calcHitRate(tier("medium"), ftsTop5, 3)).toBeGreaterThanOrEqual(0.15);
    });

    test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
      expect(calcHitRate(tier("hard"), ftsTop5, 5)).toBeGreaterThanOrEqual(0.15);
    });

    test("overall Hit@3 ≥40% (BM25 baseline)", () => {
      expect(calcHitRate(evalQueries, ftsTop5, 3)).toBeGreaterThanOrEqual(0.4);
    });
  });
  134. // =============================================================================
  135. // Vector Search Tests - Requires embedding model
  136. // =============================================================================
  describe.skipIf(!!process.env.CI)("Vector Search", () => {
    let store: ReturnType<typeof createStore>;
    let db: Database;
    // True only once at least one embedding row is known to exist in vectors_vec.
    let hasEmbeddings = false;

    /**
     * Hit rate of vector-only search: the fraction of `queries` whose expected
     * document appears in the first `topK` of the 5 results requested per query.
     * Queries are issued one at a time.
     */
    async function vecHitRate(queries: typeof evalQueries, topK: number): Promise<number> {
      let hits = 0;
      for (const { query, expectedDoc } of queries) {
        const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
        if (results.slice(0, topK).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
      }
      return hits / queries.length;
    }

    beforeAll(async () => {
      store = createStore();
      db = store.db;

      // Reuse embeddings if the index already contains any.
      const vecTable = db.prepare(
        `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
      ).get();
      if (vecTable) {
        const count = db.prepare(`SELECT COUNT(*) as cnt FROM vectors_vec`).get() as { cnt: number };
        if (count.cnt > 0) {
          hasEmbeddings = true;
          return;
        }
      }

      // Generate embeddings for test documents.
      // NOTE(review): no document rows are inserted here; presumably the BM25
      // suite's beforeAll has already indexed them into the same database — confirm.
      const llm = getDefaultLlamaCpp();
      store.ensureVecTable(768); // embeddinggemma uses 768 dimensions

      const evalDocsDir = join(dirname(fileURLToPath(import.meta.url)), "eval-docs");
      const files = readdirSync(evalDocsDir).filter(f => f.endsWith(".md"));

      let stored = 0;
      for (const file of files) {
        const content = readFileSync(join(evalDocsDir, file), "utf-8");
        const hash = createHash("sha256").update(content).digest("hex").slice(0, 12);
        const title = content.split("\n")[0]?.replace(/^#\s*/, "") || file;

        // Chunk and embed
        const chunks = await chunkDocumentByTokens(content);
        for (let seq = 0; seq < chunks.length; seq++) {
          const chunk = chunks[seq];
          if (!chunk) continue;

          const formatted = formatDocForEmbedding(chunk.text, title);
          const result = await llm.embed(formatted, { model: DEFAULT_EMBED_MODEL, isQuery: false });
          if (result?.embedding) {
            // Convert to Float32Array for sqlite-vec
            const embedding = new Float32Array(result.embedding);
            const now = new Date().toISOString();
            insertEmbedding(db, hash, seq, chunk.pos, embedding, DEFAULT_EMBED_MODEL, now);
            stored++;
          }
        }
      }

      // Only report embeddings as available when at least one was actually stored;
      // otherwise the tests below are skipped instead of running against an empty table.
      hasEmbeddings = stored > 0;
    }, 120000); // 2 minute timeout for embedding generation

    afterAll(() => {
      store.close();
    });

    // Note: Don't dispose here - Hybrid tests also use llama.
    // Dispose happens in the global afterAll.

    test("easy queries: ≥60% Hit@3 (vector should match keywords too)", async ({ skip }) => {
      if (!hasEmbeddings) skip(); // report as skipped rather than silently passing
      const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
      expect(await vecHitRate(easyQueries, 3)).toBeGreaterThanOrEqual(0.6);
    }, 60000);

    test("medium queries: ≥40% Hit@3 (vector excels at semantic)", async ({ skip }) => {
      if (!hasEmbeddings) skip();
      const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
      // Vector search should do better on semantic queries than BM25
      expect(await vecHitRate(mediumQueries, 3)).toBeGreaterThanOrEqual(0.4);
    }, 60000);

    test("hard queries: ≥30% Hit@5 (vector helps with vague queries)", async ({ skip }) => {
      if (!hasEmbeddings) skip();
      const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
      expect(await vecHitRate(hardQueries, 5)).toBeGreaterThanOrEqual(0.3);
    }, 60000);

    test("overall Hit@3 ≥50% (vector baseline)", async ({ skip }) => {
      if (!hasEmbeddings) skip();
      expect(await vecHitRate(evalQueries, 3)).toBeGreaterThanOrEqual(0.5);
    }, 60000);
  });
  227. // =============================================================================
  228. // Hybrid Search (RRF) Tests - Combines BM25 + Vector
  229. // =============================================================================
  describe.skipIf(!!process.env.CI)("Hybrid Search (RRF)", () => {
    let store: ReturnType<typeof createStore>;
    let db: Database;
    // Whether vectors_vec exists and holds at least one row; selects thresholds below.
    let hasVectors = false;

    // Minimal shape shared by the FTS and vector results consumed by the fusion step.
    type Rankable = Pick<RankedResult, "displayPath" | "title" | "score"> & {
      filepath: string;
      body?: string | null;
    };

    // Converts search results into the entries handed to reciprocalRankFusion.
    const toRanked = (rows: Rankable[]): RankedResult[] =>
      rows.map(r => ({
        file: r.filepath,
        displayPath: r.displayPath,
        title: r.title,
        body: r.body || "",
        score: r.score
      }));

    beforeAll(() => {
      store = createStore();
      db = store.db;

      // Check if vectors exist
      const vecTable = db.prepare(
        `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
      ).get();
      if (vecTable) {
        const count = db.prepare(`SELECT COUNT(*) as cnt FROM vectors_vec`).get() as { cnt: number };
        hasVectors = count.cnt > 0;
      }
    });

    afterAll(() => {
      store.close();
    });

    /**
     * Runs FTS and (when vectors are available) vector search, fuses the ranked
     * lists with RRF, and returns the first `limit` fused results.
     *
     * @param query - Search text.
     * @param limit - Maximum number of fused results to return (default 10).
     * @returns Fused results, or an empty array when neither search returned anything.
     */
    async function hybridSearch(query: string, limit: number = 10): Promise<RankedResult[]> {
      const rankedLists: RankedResult[][] = [];

      // FTS results
      const ftsResults = searchFTS(db, query, 20);
      if (ftsResults.length > 0) rankedLists.push(toRanked(ftsResults));

      // Vector results. With no stored vectors there is nothing to retrieve, so
      // searchVec is not called at all and the search degrades to BM25 only.
      if (hasVectors) {
        const vecResults = await searchVec(db, query, DEFAULT_EMBED_MODEL, 20);
        if (vecResults.length > 0) rankedLists.push(toRanked(vecResults));
      }

      if (rankedLists.length === 0) return [];

      // Apply RRF fusion
      return reciprocalRankFusion(rankedLists).slice(0, limit);
    }

    /** Fraction of `queries` whose expected document is among the top `topK` fused results. */
    async function hybridHitRate(queries: typeof evalQueries, topK: number): Promise<number> {
      let hits = 0;
      for (const { query, expectedDoc } of queries) {
        const results = await hybridSearch(query, topK);
        if (results.some(r => matchesExpected(r.file, expectedDoc))) hits++;
      }
      return hits / queries.length;
    }

    test("easy queries: ≥80% Hit@3 (hybrid should match BM25)", async () => {
      const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
      expect(await hybridHitRate(easyQueries, 3)).toBeGreaterThanOrEqual(0.8);
    }, 60000);

    test("medium queries: ≥50% Hit@3 with vectors, ≥15% without", async () => {
      const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
      // With vectors: hybrid should outperform both BM25 (15%) and vector (40%)
      // Without vectors: hybrid is just BM25, so use BM25 threshold
      const threshold = hasVectors ? 0.5 : 0.15;
      expect(await hybridHitRate(mediumQueries, 3)).toBeGreaterThanOrEqual(threshold);
    }, 60000);

    test("hard queries: ≥35% Hit@5 with vectors, ≥15% without", async () => {
      const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
      // Inspect exactly 5 fused results so the measurement matches the Hit@5 in the title
      // (hybridSearch returns up to 10 by default).
      const threshold = hasVectors ? 0.35 : 0.15;
      expect(await hybridHitRate(hardQueries, 5)).toBeGreaterThanOrEqual(threshold);
    }, 60000);

    test("fusion queries: ≥50% Hit@3 (RRF combines weak signals)", async ({ skip }) => {
      // Fusion requires both methods; report as skipped rather than silently passing.
      if (!hasVectors) skip();

      const fusionQueries = evalQueries.filter(q => q.difficulty === "fusion");
      let hybridHits = 0;
      let bm25Hits = 0;
      let vecHits = 0;

      for (const { query, expectedDoc } of fusionQueries) {
        // Hybrid results
        const hybridResults = await hybridSearch(query);
        if (hybridResults.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hybridHits++;

        // BM25 results for comparison
        const bm25Results = searchFTS(db, query, 5);
        if (bm25Results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) bm25Hits++;

        // Vector results for comparison
        const vecResults = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
        if (vecResults.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) vecHits++;
      }

      const hybridRate = hybridHits / fusionQueries.length;
      const bm25Rate = bm25Hits / fusionQueries.length;
      const vecRate = vecHits / fusionQueries.length;

      // Fusion should achieve at least 50% on these multi-signal queries
      expect(hybridRate).toBeGreaterThanOrEqual(0.5);
      // Fusion should outperform or match the best individual method
      expect(hybridRate).toBeGreaterThanOrEqual(Math.max(bm25Rate, vecRate));
    }, 60000);

    test("overall Hit@3 ≥60% with vectors, ≥40% without", async () => {
      // Filter out fusion queries for overall score (they're tested separately)
      const standardQueries = evalQueries.filter(q => q.difficulty !== "fusion");
      const threshold = hasVectors ? 0.6 : 0.4;
      expect(await hybridHitRate(standardQueries, 3)).toBeGreaterThanOrEqual(threshold);
    }, 60000);
  });
  347. // =============================================================================
  348. // Cleanup
  349. // =============================================================================
  // File-level teardown: release the llama.cpp instance, then delete the temp index directory.
  afterAll(async () => {
    try {
      // Ensure native resources are released to avoid ggml-metal asserts on process exit.
      await disposeDefaultLlamaCpp();
    } finally {
      // Remove the temp directory even if disposal rejects, so a failed teardown
      // does not leave the directory behind.
      rmSync(tempDir, { recursive: true, force: true });
    }
  });