// eval-deep-research.ts
  /**
   * Deep Research Evaluation for QMD
   *
   * Tests end-to-end retrieval quality: query → expansion → reranking → results
   *
   * These are HARD queries with NO exact keyword matches — they require
   * semantic understanding via query expansion and reranking to succeed.
   *
   * Run: bun test/eval-deep-research.ts
   */
  import { execFileSync, execSync } from "child_process";
  import { readFileSync, existsSync } from "fs";
  import { join, dirname } from "path";
  import { fileURLToPath } from "url";
  // ES modules do not provide `__dirname`; derive this script's directory from
  // `import.meta.url` so files next to it can be addressed by path.
  const __dirname = dirname(fileURLToPath(import.meta.url));
  /** One evaluation case, as parsed from a line of `eval-deep-research.jsonl`. */
  interface EvalQuery {
    /** Search text handed to the search function under evaluation. */
    query: string;
    /** Fragment matched case-insensitively against each result's file path. */
    expected_doc: string;
    /** Difficulty label carried by the dataset; not read by this script. */
    difficulty: string;
    /** Domain context hint for future intent-aware retrieval. */
    intent: string;
    /** Free-form notes, printed next to `intent` when the expected document is not found. */
    notes: string;
  }
  /** One entry of the JSON array printed by `qmd search --json` / `qmd query --json`. */
  interface SearchResult {
    /** Path or identifier of the matched document; the only field this script reads. */
    file: string;
    /** Score reported by qmd — presumably relevance; not read by this script. */
    score: number;
    /** Document title, when qmd reports one. */
    title?: string;
  }
  /**
   * Reads `eval-deep-research.jsonl` from this script's directory and parses
   * every non-blank line as one JSON-encoded {@link EvalQuery}.
   *
   * @returns The parsed queries, in file order.
   * @throws If the file cannot be read, if a line is not valid JSON, or if a
   *   record lacks a string `query` / `expected_doc`. Parse and shape errors
   *   are prefixed with `path:lineNumber`.
   */
  function loadQueries(): EvalQuery[] {
    const path = join(__dirname, "eval-deep-research.jsonl");
    const queries: EvalQuery[] = [];

    // Accept both LF and CRLF line endings.
    const lines = readFileSync(path, "utf-8").split(/\r?\n/);

    lines.forEach((line, index) => {
      if (line.trim() === "") return;
      const where = `${path}:${index + 1}`;

      let parsed: unknown;
      try {
        parsed = JSON.parse(line);
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`${where}: invalid JSON (${reason})`);
      }

      // Only the two fields the harness dereferences are checked; a record
      // without them would otherwise fail later with no hint of its origin.
      const record = parsed as Partial<EvalQuery> | null;
      if (
        record === null ||
        typeof record !== "object" ||
        typeof record.query !== "string" ||
        typeof record.expected_doc !== "string"
      ) {
        throw new Error(`${where}: expected an object with string "query" and "expected_doc"`);
      }

      queries.push(record as EvalQuery);
    });

    return queries;
  }
  /**
   * Runs the BM25 keyword baseline: `bun src/qmd.ts search <query>` against
   * the `eval-docs` collection, asking for the top 5 results as JSON.
   *
   * The query is passed as a single argv entry (no shell), so characters such
   * as `$`, backticks, quotes and backslashes reach qmd unchanged instead of
   * being expanded or executed by a shell.
   *
   * @param query Search text, forwarded verbatim.
   * @returns The parsed result array, or `[]` when the command fails, times
   *   out (30 s), or prints something that is not a JSON array.
   */
  function runBM25Search(query: string): SearchResult[] {
    try {
      const output = execFileSync(
        "bun",
        ["src/qmd.ts", "search", query, "-c", "eval-docs", "--json", "-n", "5"],
        // stderr is discarded, replacing the former `2>/dev/null` redirect.
        { encoding: "utf-8", timeout: 30000, stdio: ["pipe", "pipe", "ignore"] }
      );
      const parsed: unknown = JSON.parse(output);
      return Array.isArray(parsed) ? (parsed as SearchResult[]) : [];
    } catch {
      // Best effort: a failed search is scored as "expected document not found".
      return [];
    }
  }
  /**
   * Runs the full pipeline under evaluation: `bun src/qmd.ts query <query>`
   * against the `eval-docs` collection, asking for the top 5 results as JSON.
   *
   * The query is passed as a single argv entry (no shell), so characters such
   * as `$`, backticks, quotes and backslashes reach qmd unchanged instead of
   * being expanded or executed by a shell.
   *
   * @param query Search text, forwarded verbatim.
   * @returns The parsed result array, or `[]` when the command fails, times
   *   out (120 s), or prints something that is not a JSON array.
   */
  function runDeepResearch(query: string): SearchResult[] {
    try {
      const output = execFileSync(
        "bun",
        ["src/qmd.ts", "query", query, "-c", "eval-docs", "--json", "-n", "5"],
        // stderr is discarded, replacing the former `2>/dev/null` redirect.
        { encoding: "utf-8", timeout: 120000, stdio: ["pipe", "pipe", "ignore"] }
      );
      const parsed: unknown = JSON.parse(output);
      return Array.isArray(parsed) ? (parsed as SearchResult[]) : [];
    } catch {
      // Best effort: a failed query is scored as "expected document not found".
      return [];
    }
  }
  /**
   * Tells whether a result path refers to the expected document.
   *
   * The comparison is a case-insensitive substring test, so `expectedDoc` may
   * be any distinctive fragment of the path. Surrounding whitespace in
   * `expectedDoc` is ignored.
   *
   * @param filepath Path reported by the search result.
   * @param expectedDoc Fragment that identifies the expected document.
   * @returns `true` when `filepath` contains `expectedDoc`; always `false` for
   *   an empty or blank `expectedDoc`, which would otherwise match every path.
   */
  function matchesExpected(filepath: string, expectedDoc: string): boolean {
    const needle = expectedDoc.trim().toLowerCase();
    if (needle === "") return false;
    return filepath.toLowerCase().includes(needle);
  }
  /**
   * Finds the 1-based position of the first result whose file matches the
   * expected document.
   *
   * @param results Search results in ranked order (best first).
   * @param expectedDoc Fragment identifying the expected document.
   * @returns The 1-based rank of the first match, or `-1` when none matches.
   *   Entries without a string `file` are skipped rather than dereferenced.
   */
  function findRank(results: SearchResult[], expectedDoc: string): number {
    const index = results.findIndex(
      (result) => typeof result?.file === "string" && matchesExpected(result.file, expectedDoc)
    );
    return index === -1 ? -1 : index + 1;
  }
  /** Aggregate scores of one retrieval method over the whole query set. */
  interface MethodResults {
    /** Queries whose expected document was ranked first. */
    hit1: number;
    /** Queries whose expected document was ranked in the top 3. */
    hit3: number;
    /** Queries whose expected document was ranked in the top 5. */
    hit5: number;
    /** Number of queries evaluated. */
    total: number;
    /** Per-query outcome; `rank` is 1-based, or -1 when the document was not found. */
    details: { query: string; rank: number; expected: string; intent?: string }[];
  }
  /**
   * Runs every query through `searchFn`, prints one status line per query and
   * a Hit@1 / Hit@3 / Hit@5 summary, and returns the tallies.
   *
   * @param queries Evaluation cases to run, in order.
   * @param searchFn Retrieval method under test; returns ranked results for a query.
   * @param label Heading printed above this method's report.
   * @returns Hit counts, the query total, and the per-query ranks.
   */
  function evaluate(
    queries: EvalQuery[],
    searchFn: (q: string) => SearchResult[],
    label: string
  ): MethodResults {
    const results: MethodResults = {
      hit1: 0,
      hit3: 0,
      hit5: 0,
      total: queries.length,
      details: [],
    };

    // Whole-number percentage of the query total; "0" (not "NaN") for an empty set.
    const pct = (hits: number): string =>
      (results.total > 0 ? (hits / results.total) * 100 : 0).toFixed(0);

    console.log(`\n${"=".repeat(60)}`);
    console.log(` ${label}`);
    console.log(`${"=".repeat(60)}\n`);

    for (const { query, expected_doc, intent, notes } of queries) {
      const rank = findRank(searchFn(query), expected_doc);
      results.details.push({ query, rank, expected: expected_doc, intent });

      if (rank === 1) results.hit1++;
      if (rank >= 1 && rank <= 3) results.hit3++;
      if (rank >= 1 && rank <= 5) results.hit5++;

      // "✓" for a top hit, "@N" for any lower rank, "✗" when not found.
      const status = rank === 1 ? "✓" : rank > 0 ? `@${rank}` : "✗";
      console.log(` ${status.padEnd(4)} "${query.slice(0, 45).padEnd(45)}" → ${expected_doc}`);

      // Show the dataset's context for misses to help diagnose them.
      if (rank === -1) {
        console.log(` intent: ${intent} | ${notes}`);
      }
    }

    console.log(`\n ${"─".repeat(50)}`);
    console.log(` Hit@1: ${pct(results.hit1)}% (${results.hit1}/${results.total})`);
    console.log(` Hit@3: ${pct(results.hit3)}% (${results.hit3}/${results.total})`);
    console.log(` Hit@5: ${pct(results.hit5)}% (${results.hit5}/${results.total})`);

    return results;
  }
  /**
   * Entry point: checks that the `eval-docs` collection is known to qmd, runs
   * the query set through the BM25 baseline and the deep-research pipeline,
   * prints a side-by-side comparison, and lists queries that only the
   * deep-research pipeline placed in the top 3.
   *
   * Exits the process with code 1 when the collection is missing or when the
   * deep-research Hit@3 is below 60% (an empty query set counts as 0%).
   */
  async function main(): Promise<void> {
    // Whole-number percentage; "0" (not "NaN") when there is nothing to divide by.
    const pct = (hits: number, total: number): string =>
      (total > 0 ? (hits / total) * 100 : 0).toFixed(0);

    // One row of the comparison table.
    const row = (name: string, r: MethodResults): string =>
      ` ${name} ${pct(r.hit1, r.total).padStart(3)}% ${pct(r.hit3, r.total).padStart(3)}% ${pct(r.hit5, r.total).padStart(3)}%`;

    console.log("QMD Deep Research Evaluation");
    console.log("=".repeat(60));
    console.log("Testing hard queries that require semantic understanding.");
    console.log("These have NO exact keyword matches in documents.");

    // Check if eval-docs collection exists
    try {
      const status = execSync("bun src/qmd.ts status --json 2>/dev/null", {
        encoding: "utf-8",
      });
      if (!status.includes("eval-docs")) {
        console.log("\n⚠️ eval-docs collection not found. Run:");
        console.log(" qmd collection add test/eval-docs --name eval-docs");
        console.log(" qmd embed");
        process.exit(1);
      }
    } catch {
      // A failing status command is only a warning; the evaluation still runs.
      console.log("\n⚠️ Could not check status. Make sure qmd is working.");
    }

    const queries = loadQueries();
    console.log(`\nLoaded ${queries.length} hard queries.`);

    // Run BM25 baseline (expected to fail on most)
    const bm25Results = evaluate(queries, runBM25Search, "BM25 BASELINE (keyword search)");

    // Run deep research (expected to succeed via expansion + reranking)
    const deepResults = evaluate(queries, runDeepResearch, "DEEP RESEARCH (expansion + reranking)");

    // Comparison
    console.log(`\n${"=".repeat(60)}`);
    console.log(" COMPARISON");
    console.log(`${"=".repeat(60)}`);
    console.log(`\n Method Hit@1 Hit@3 Hit@5`);
    console.log(` ${"─".repeat(45)}`);
    console.log(row("BM25 (baseline)", bm25Results));
    console.log(row("Deep Research", deepResults));

    // Negative numbers already carry "-"; only non-negative ones need the "+".
    const improvement = deepResults.hit3 - bm25Results.hit3;
    const sign = improvement >= 0 ? "+" : "";
    console.log(
      `\n Improvement (Hit@3): ${sign}${improvement} queries (${pct(improvement, bm25Results.total)}%)`
    );

    // Show queries where deep research recovered failures.
    // Index the baseline by query (first occurrence wins) for O(1) lookups.
    const bm25RankByQuery = new Map<string, number>();
    for (const { query, rank } of bm25Results.details) {
      if (!bm25RankByQuery.has(query)) bm25RankByQuery.set(query, rank);
    }
    const recovered = deepResults.details.filter(
      (d) => d.rank >= 1 && d.rank <= 3 && bm25RankByQuery.get(d.query) === -1
    );

    if (recovered.length > 0) {
      console.log(`\n Recovered by expansion + reranking (${recovered.length}):`);
      for (const { query, rank, expected } of recovered.slice(0, 5)) {
        console.log(` @${rank} "${query.slice(0, 40)}..." → ${expected}`);
      }
      if (recovered.length > 5) {
        console.log(` ... and ${recovered.length - 5} more`);
      }
    }

    // Exit with error if deep research performs poorly
    const deepHit3Pct = deepResults.total > 0 ? (deepResults.hit3 / deepResults.total) * 100 : 0;
    if (deepHit3Pct < 60) {
      console.log(`\n❌ Deep research Hit@3 < 60% (${deepHit3Pct.toFixed(0)}%)`);
      process.exit(1);
    } else {
      console.log(`\n✓ Deep research Hit@3 >= 60% (${deepHit3Pct.toFixed(0)}%)`);
    }
  }
  // Run the evaluation; report any unexpected failure and exit non-zero
  // instead of leaving the promise rejection unhandled.
  main().catch((err: unknown) => {
    console.error(err);
    process.exit(1);
  });