eval-harness.ts 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. /**
  2. * Evaluation Harness for QMD Search
  3. *
  4. * Tests search quality with synthetic queries against known documents.
  5. * Run: bun test/eval-harness.ts
  6. */
import { execFileSync, execSync } from "child_process";
  8. // Test queries with expected documents and difficulty
  9. const evalQueries: {
  10. query: string;
  11. expectedDoc: string; // Partial match on filename
  12. difficulty: "easy" | "medium" | "hard";
  13. description: string;
  14. }[] = [
  15. // EASY: Exact keyword matches
  16. {
  17. query: "API versioning",
  18. expectedDoc: "api-design",
  19. difficulty: "easy",
  20. description: "Direct keyword match"
  21. },
  22. {
  23. query: "Series A fundraising",
  24. expectedDoc: "fundraising",
  25. difficulty: "easy",
  26. description: "Direct keyword match"
  27. },
  28. {
  29. query: "CAP theorem",
  30. expectedDoc: "distributed-systems",
  31. difficulty: "easy",
  32. description: "Direct keyword match"
  33. },
  34. {
  35. query: "overfitting machine learning",
  36. expectedDoc: "machine-learning",
  37. difficulty: "easy",
  38. description: "Direct keyword match"
  39. },
  40. {
  41. query: "remote work VPN",
  42. expectedDoc: "remote-work",
  43. difficulty: "easy",
  44. description: "Direct keyword match"
  45. },
  46. {
  47. query: "Project Phoenix retrospective",
  48. expectedDoc: "product-launch",
  49. difficulty: "easy",
  50. description: "Direct keyword match"
  51. },
  52. // MEDIUM: Semantic/conceptual queries
  53. {
  54. query: "how to structure REST endpoints",
  55. expectedDoc: "api-design",
  56. difficulty: "medium",
  57. description: "Conceptual - no exact match"
  58. },
  59. {
  60. query: "raising money for startup",
  61. expectedDoc: "fundraising",
  62. difficulty: "medium",
  63. description: "Conceptual - synonyms"
  64. },
  65. {
  66. query: "consistency vs availability tradeoffs",
  67. expectedDoc: "distributed-systems",
  68. difficulty: "medium",
  69. description: "Conceptual understanding"
  70. },
  71. {
  72. query: "how to prevent models from memorizing data",
  73. expectedDoc: "machine-learning",
  74. difficulty: "medium",
  75. description: "Conceptual - overfitting"
  76. },
  77. {
  78. query: "working from home guidelines",
  79. expectedDoc: "remote-work",
  80. difficulty: "medium",
  81. description: "Synonym match"
  82. },
  83. {
  84. query: "what went wrong with the launch",
  85. expectedDoc: "product-launch",
  86. difficulty: "medium",
  87. description: "Conceptual query"
  88. },
  89. // HARD: Vague, partial memory, indirect
  90. {
  91. query: "nouns not verbs",
  92. expectedDoc: "api-design",
  93. difficulty: "hard",
  94. description: "Partial phrase recall"
  95. },
  96. {
  97. query: "Sequoia investor pitch",
  98. expectedDoc: "fundraising",
  99. difficulty: "hard",
  100. description: "Indirect reference"
  101. },
  102. {
  103. query: "Raft algorithm leader election",
  104. expectedDoc: "distributed-systems",
  105. difficulty: "hard",
  106. description: "Specific detail in long doc"
  107. },
  108. {
  109. query: "F1 score precision recall",
  110. expectedDoc: "machine-learning",
  111. difficulty: "hard",
  112. description: "Technical detail"
  113. },
  114. {
  115. query: "quarterly team gathering travel",
  116. expectedDoc: "remote-work",
  117. difficulty: "hard",
  118. description: "Specific policy detail"
  119. },
  120. {
  121. query: "beta program 47 bugs",
  122. expectedDoc: "product-launch",
  123. difficulty: "hard",
  124. description: "Specific number recall"
  125. },
  126. ];
  127. interface SearchResult {
  128. file: string;
  129. score: number;
  130. title: string;
  131. }
  132. function runSearch(query: string): SearchResult[] {
  133. try {
  134. const output = execSync(
  135. `bun src/qmd.ts search "${query.replace(/"/g, '\\"')}" --json -n 5 2>/dev/null`,
  136. { encoding: "utf-8", timeout: 30000 }
  137. );
  138. return JSON.parse(output);
  139. } catch (e) {
  140. return [];
  141. }
  142. }
  143. function runQuery(query: string): SearchResult[] {
  144. try {
  145. const output = execSync(
  146. `bun src/qmd.ts query "${query.replace(/"/g, '\\"')}" --json -n 5 2>/dev/null`,
  147. { encoding: "utf-8", timeout: 60000 }
  148. );
  149. return JSON.parse(output);
  150. } catch (e) {
  151. return [];
  152. }
  153. }
  154. function evaluate(mode: "search" | "query") {
  155. const runFn = mode === "search" ? runSearch : runQuery;
  156. const results = {
  157. easy: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
  158. medium: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
  159. hard: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
  160. };
  161. console.log(`\n=== Evaluating ${mode.toUpperCase()} mode ===\n`);
  162. for (const { query, expectedDoc, difficulty, description } of evalQueries) {
  163. const searchResults = runFn(query);
  164. const ranks = searchResults
  165. .map((r, i) => ({ rank: i + 1, matches: r.file.toLowerCase().includes(expectedDoc) }))
  166. .filter(r => r.matches);
  167. const firstHit = ranks.length > 0 ? ranks[0].rank : -1;
  168. results[difficulty].total++;
  169. if (firstHit === 1) results[difficulty].hit1++;
  170. if (firstHit >= 1 && firstHit <= 3) results[difficulty].hit3++;
  171. if (firstHit >= 1 && firstHit <= 5) results[difficulty].hit5++;
  172. const status = firstHit === 1 ? "✓" : firstHit > 0 ? `@${firstHit}` : "✗";
  173. console.log(`[${difficulty.padEnd(6)}] ${status.padEnd(3)} "${query}" → ${description}`);
  174. }
  175. console.log("\n--- Summary ---");
  176. for (const [diff, r] of Object.entries(results)) {
  177. const hit1Pct = ((r.hit1 / r.total) * 100).toFixed(0);
  178. const hit3Pct = ((r.hit3 / r.total) * 100).toFixed(0);
  179. const hit5Pct = ((r.hit5 / r.total) * 100).toFixed(0);
  180. console.log(`${diff.padEnd(8)}: Hit@1=${hit1Pct}% Hit@3=${hit3Pct}% Hit@5=${hit5Pct}% (n=${r.total})`);
  181. }
  182. const total = evalQueries.length;
  183. const totalHit1 = Object.values(results).reduce((a, r) => a + r.hit1, 0);
  184. const totalHit3 = Object.values(results).reduce((a, r) => a + r.hit3, 0);
  185. console.log(`\nOverall: Hit@1=${((totalHit1/total)*100).toFixed(0)}% Hit@3=${((totalHit3/total)*100).toFixed(0)}%`);
  186. }
  187. // Main
  188. console.log("QMD Evaluation Harness");
  189. console.log("=".repeat(50));
  190. console.log(`Testing ${evalQueries.length} queries across 6 documents`);
  191. // Check if eval-docs collection exists
  192. try {
  193. const status = execSync("bun src/qmd.ts status --json 2>/dev/null", { encoding: "utf-8" });
  194. if (!status.includes("eval-docs")) {
  195. console.log("\n⚠️ eval-docs collection not found. Run:");
  196. console.log(" qmd collection add test/eval-docs --name eval-docs");
  197. console.log(" qmd embed");
  198. process.exit(1);
  199. }
  200. } catch {
  201. console.log("\n⚠️ Could not check status. Make sure qmd is working.");
  202. }
  203. // Run evaluations
  204. evaluate("search");
  205. evaluate("query");