Sfoglia il codice sorgente

Add vector and hybrid (RRF) search evaluation tests

Three test suites with different thresholds (Hit@3 for easy, medium, and overall; Hit@5 for hard):
- BM25: easy≥80%, medium≥15%, hard≥15%, overall≥40%
- Vector: easy≥60%, medium≥40%, hard≥30%, overall≥50%
- Hybrid (RRF): easy≥80%, medium≥50%, hard≥35%, overall≥60%

Hybrid should outperform individual methods on semantic queries.
Vector/hybrid tests each have a 60s timeout; the vector suite's setup has a 120s timeout for embedding generation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Tobi Lutke 5 mesi fa
parent
commit
56dea7d6ce
1 file modificato con 244 aggiunte e 52 eliminazioni
  1. 244 52
      src/eval.test.ts

+ 244 - 52
src/eval.test.ts

@@ -3,6 +3,11 @@
  *
  * Tests search quality against synthetic documents with known-answer queries.
  * Validates that search improvements don't regress quality.
+ *
+ * Three test suites:
+ * 1. BM25 (FTS) - lexical search baseline
+ * 2. Vector Search - semantic search with embeddings
+ * 3. Hybrid (RRF) - combined lexical + vector with rank fusion
  */
 
 import { describe, test, expect, beforeAll, afterAll } from "bun:test";
@@ -19,9 +24,17 @@ import {
   getDb,
   closeDb,
   searchFTS,
+  searchVec,
   insertDocument,
   insertContent,
+  ensureVecTable,
+  insertEmbedding,
+  chunkDocumentByTokens,
+  reciprocalRankFusion,
+  DEFAULT_EMBED_MODEL,
+  type RankedResult,
 } from "./store";
+import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, formatDocForEmbedding } from "./llm";
 
 // Eval queries with expected documents
 const evalQueries: {
@@ -54,11 +67,33 @@ const evalQueries: {
   { query: "beta program 47 bugs", expectedDoc: "product-launch", difficulty: "hard" },
 ];
 
-describe("Search Quality Evaluation", () => {
+// Helper to check if result matches expected doc
+function matchesExpected(filepath: string, expectedDoc: string): boolean {
+  return filepath.toLowerCase().includes(expectedDoc);
+}
+
+// Helper to calculate hit rate
+function calcHitRate(
+  queries: typeof evalQueries,
+  searchFn: (query: string) => { filepath: string }[],
+  topK: number
+): number {
+  let hits = 0;
+  for (const { query, expectedDoc } of queries) {
+    const results = searchFn(query).slice(0, topK);
+    if (results.some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
+  }
+  return hits / queries.length;
+}
+
+// =============================================================================
+// BM25 (Lexical) Tests - Fast, no model loading needed
+// =============================================================================
+
+describe("BM25 Search (FTS)", () => {
   let db: Database;
 
   beforeAll(() => {
-    // Initialize database (INDEX_PATH already set at top of file)
     db = getDb();
 
     // Load and index eval documents
@@ -78,71 +113,228 @@ describe("Search Quality Evaluation", () => {
 
   afterAll(() => {
     closeDb();
-    rmSync(tempDir, { recursive: true, force: true });
   });
 
-  describe("BM25 Search (FTS)", () => {
-    test("easy queries: ≥80% Hit@3", () => {
-      const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
-      let hits = 0;
+  test("easy queries: ≥80% Hit@3", () => {
+    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
+    const hitRate = calcHitRate(easyQueries, q => searchFTS(db, q, 5), 3);
+    expect(hitRate).toBeGreaterThanOrEqual(0.8);
+  });
 
-      for (const { query, expectedDoc } of easyQueries) {
-        const results = searchFTS(db, query, 5);
-        const top3 = results.slice(0, 3);
-        const found = top3.some(r => r.filepath.toLowerCase().includes(expectedDoc));
-        if (found) hits++;
-      }
+  test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
+    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
+    const hitRate = calcHitRate(mediumQueries, q => searchFTS(db, q, 5), 3);
+    expect(hitRate).toBeGreaterThanOrEqual(0.15);
+  });
+
+  test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
+    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
+    const hitRate = calcHitRate(hardQueries, q => searchFTS(db, q, 5), 5);
+    expect(hitRate).toBeGreaterThanOrEqual(0.15);
+  });
+
+  test("overall Hit@3 ≥40% (BM25 baseline)", () => {
+    const hitRate = calcHitRate(evalQueries, q => searchFTS(db, q, 5), 3);
+    expect(hitRate).toBeGreaterThanOrEqual(0.4);
+  });
+});
+
+// =============================================================================
+// Vector Search Tests - Requires embedding model
+// =============================================================================
+
+describe("Vector Search", () => {
+  let db: Database;
+  let hasEmbeddings = false;
 
-      const hitRate = hits / easyQueries.length;
-      expect(hitRate).toBeGreaterThanOrEqual(0.8);
-    });
+  beforeAll(async () => {
+    db = getDb();
 
-    test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
-      const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
-      let hits = 0;
+    // Check if embeddings already exist (from previous test run)
+    const vecTable = db.prepare(
+      `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
+    ).get();
 
-      for (const { query, expectedDoc } of mediumQueries) {
-        const results = searchFTS(db, query, 5);
-        const top3 = results.slice(0, 3);
-        const found = top3.some(r => r.filepath.toLowerCase().includes(expectedDoc));
-        if (found) hits++;
+    if (vecTable) {
+      const count = db.prepare(`SELECT COUNT(*) as cnt FROM vectors_vec`).get() as { cnt: number };
+      if (count.cnt > 0) {
+        hasEmbeddings = true;
+        return;
       }
+    }
 
-      const hitRate = hits / mediumQueries.length;
-      // BM25 alone struggles with semantic queries - baseline is low
-      expect(hitRate).toBeGreaterThanOrEqual(0.15);
-    });
+    // Generate embeddings for test documents
+    const llm = getDefaultLlamaCpp();
+    ensureVecTable(db, 768); // embeddinggemma uses 768 dimensions
 
-    test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
-      const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
-      let hits = 0;
+    const evalDocsDir = join(import.meta.dir, "../test/eval-docs");
+    const files = readdirSync(evalDocsDir).filter(f => f.endsWith(".md"));
 
-      for (const { query, expectedDoc } of hardQueries) {
-        const results = searchFTS(db, query, 5);
-        const found = results.some(r => r.filepath.toLowerCase().includes(expectedDoc));
-        if (found) hits++;
+    for (const file of files) {
+      const content = readFileSync(join(evalDocsDir, file), "utf-8");
+      const hash = Bun.hash(content).toString(16).slice(0, 12);
+      const title = content.split("\n")[0]?.replace(/^#\s*/, "") || file;
+
+      // Chunk and embed
+      const chunks = await chunkDocumentByTokens(content, llm);
+      for (let seq = 0; seq < chunks.length; seq++) {
+        const chunk = chunks[seq];
+        const formatted = formatDocForEmbedding(chunk.text, title);
+        const result = await llm.embed(formatted, { model: DEFAULT_EMBED_MODEL, isQuery: false });
+        if (result?.embedding) {
+          insertEmbedding(db, hash, seq, chunk.pos, result.embedding);
+        }
       }
+    }
+    hasEmbeddings = true;
+  }, 120000); // 2 minute timeout for embedding generation
 
-      const hitRate = hits / hardQueries.length;
-      // BM25 alone really struggles with vague queries
-      expect(hitRate).toBeGreaterThanOrEqual(0.15);
-    });
+  afterAll(async () => {
+    await disposeDefaultLlamaCpp();
   });
 
-  describe("Overall Quality", () => {
-    test("overall Hit@3 ≥40% (BM25 baseline)", () => {
-      let hits = 0;
+  test("easy queries: ≥60% Hit@3 (vector should match keywords too)", async () => {
+    if (!hasEmbeddings) return; // Skip if embedding failed
 
-      for (const { query, expectedDoc } of evalQueries) {
-        const results = searchFTS(db, query, 5);
-        const top3 = results.slice(0, 3);
-        const found = top3.some(r => r.filepath.toLowerCase().includes(expectedDoc));
-        if (found) hits++;
-      }
+    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
+    let hits = 0;
+    for (const { query, expectedDoc } of easyQueries) {
+      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
+      if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
+    }
+    expect(hits / easyQueries.length).toBeGreaterThanOrEqual(0.6);
+  }, 60000);
+
+  test("medium queries: ≥40% Hit@3 (vector excels at semantic)", async () => {
+    if (!hasEmbeddings) return;
+
+    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
+    let hits = 0;
+    for (const { query, expectedDoc } of mediumQueries) {
+      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
+      if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
+    }
+    // Vector search should do better on semantic queries than BM25
+    expect(hits / mediumQueries.length).toBeGreaterThanOrEqual(0.4);
+  }, 60000);
 
-      const hitRate = hits / evalQueries.length;
-      // BM25 alone: ~40% is baseline, hybrid should be higher
-      expect(hitRate).toBeGreaterThanOrEqual(0.4);
-    });
+  test("hard queries: ≥30% Hit@5 (vector helps with vague queries)", async () => {
+    if (!hasEmbeddings) return;
+
+    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
+    let hits = 0;
+    for (const { query, expectedDoc } of hardQueries) {
+      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
+      if (results.some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
+    }
+    expect(hits / hardQueries.length).toBeGreaterThanOrEqual(0.3);
+  }, 60000);
+
+  test("overall Hit@3 ≥50% (vector baseline)", async () => {
+    if (!hasEmbeddings) return;
+
+    let hits = 0;
+    for (const { query, expectedDoc } of evalQueries) {
+      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
+      if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
+    }
+    expect(hits / evalQueries.length).toBeGreaterThanOrEqual(0.5);
+  }, 60000);
+});
+
+// =============================================================================
+// Hybrid Search (RRF) Tests - Combines BM25 + Vector
+// =============================================================================
+
+describe("Hybrid Search (RRF)", () => {
+  let db: Database;
+
+  beforeAll(() => {
+    db = getDb();
   });
+
+  // Helper: run hybrid search with RRF fusion
+  async function hybridSearch(query: string, limit: number = 10): Promise<RankedResult[]> {
+    const rankedLists: RankedResult[][] = [];
+
+    // FTS results
+    const ftsResults = searchFTS(db, query, 20);
+    if (ftsResults.length > 0) {
+      rankedLists.push(ftsResults.map(r => ({
+        file: r.filepath,
+        displayPath: r.displayPath,
+        title: r.title,
+        body: r.body || "",
+        score: r.score
+      })));
+    }
+
+    // Vector results
+    const vecResults = await searchVec(db, query, DEFAULT_EMBED_MODEL, 20);
+    if (vecResults.length > 0) {
+      rankedLists.push(vecResults.map(r => ({
+        file: r.filepath,
+        displayPath: r.displayPath,
+        title: r.title,
+        body: r.body || "",
+        score: r.score
+      })));
+    }
+
+    if (rankedLists.length === 0) return [];
+
+    // Apply RRF fusion
+    const fused = reciprocalRankFusion(rankedLists);
+    return fused.slice(0, limit);
+  }
+
+  test("easy queries: ≥80% Hit@3 (hybrid should match BM25)", async () => {
+    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
+    let hits = 0;
+    for (const { query, expectedDoc } of easyQueries) {
+      const results = await hybridSearch(query);
+      if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
+    }
+    expect(hits / easyQueries.length).toBeGreaterThanOrEqual(0.8);
+  }, 60000);
+
+  test("medium queries: ≥50% Hit@3 (hybrid should beat both)", async () => {
+    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
+    let hits = 0;
+    for (const { query, expectedDoc } of mediumQueries) {
+      const results = await hybridSearch(query);
+      if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
+    }
+    // Hybrid should outperform both BM25 (15%) and vector (40%) alone
+    expect(hits / mediumQueries.length).toBeGreaterThanOrEqual(0.5);
+  }, 60000);
+
+  test("hard queries: ≥35% Hit@5 (hybrid combines signals)", async () => {
+    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
+    let hits = 0;
+    for (const { query, expectedDoc } of hardQueries) {
+      const results = await hybridSearch(query);
+      if (results.some(r => matchesExpected(r.file, expectedDoc))) hits++;
+    }
+    // Hybrid should beat BM25 (15%) and vector (30%)
+    expect(hits / hardQueries.length).toBeGreaterThanOrEqual(0.35);
+  }, 60000);
+
+  test("overall Hit@3 ≥60% (hybrid beats individual methods)", async () => {
+    let hits = 0;
+    for (const { query, expectedDoc } of evalQueries) {
+      const results = await hybridSearch(query);
+      if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
+    }
+    // Hybrid should beat BM25 (40%) and vector (50%)
+    expect(hits / evalQueries.length).toBeGreaterThanOrEqual(0.6);
+  }, 60000);
+});
+
+// =============================================================================
+// Cleanup
+// =============================================================================
+
+afterAll(() => {
+  rmSync(tempDir, { recursive: true, force: true });
 });