6 mesi fa · 56dea7d6ce
--- a/src/eval.test.ts
+++ b/src/eval.test.ts
@@ -3,6 +3,11 @@
 
				  *
			
 
				  * Tests search quality against synthetic documents with known-answer queries.
			
 
				  * Validates that search improvements don't regress quality.
			
 
				+ *
			
 
				+ * Three test suites:
			
 
				+ * 1. BM25 (FTS) - lexical search baseline
			
 
				+ * 2. Vector Search - semantic search with embeddings
			
 
				+ * 3. Hybrid (RRF) - combined lexical + vector with rank fusion
			
 
				  */
			
 
				 
			
 
				 import { describe, test, expect, beforeAll, afterAll } from "bun:test";
			
@@ -19,9 +24,17 @@ import {
 
				   getDb,
			
 
				   closeDb,
			
 
				   searchFTS,
			
 
				+  searchVec,
			
 
				   insertDocument,
			
 
				   insertContent,
			
 
				+  ensureVecTable,
			
 
				+  insertEmbedding,
			
 
				+  chunkDocumentByTokens,
			
 
				+  reciprocalRankFusion,
			
 
				+  DEFAULT_EMBED_MODEL,
			
 
				+  type RankedResult,
			
 
				 } from "./store";
			
 
				+import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, formatDocForEmbedding } from "./llm";
			
 
				 
			
 
				 // Eval queries with expected documents
			
 
				 const evalQueries: {
			
@@ -54,11 +67,33 @@ const evalQueries: {
 
				   { query: "beta program 47 bugs", expectedDoc: "product-launch", difficulty: "hard" },
			
 
				 ];
			
 
				 
			
 
				-describe("Search Quality Evaluation", () => {
			
 
				+// Helper to check if result matches expected doc
			
 
				+function matchesExpected(filepath: string, expectedDoc: string): boolean {
			
 
				+  return filepath.toLowerCase().includes(expectedDoc);
			
 
				+}
			
 
				+
			
 
				+// Helper to calculate hit rate
			
 
				+function calcHitRate(
			
 
				+  queries: typeof evalQueries,
			
 
				+  searchFn: (query: string) => { filepath: string }[],
			
 
				+  topK: number
			
 
				+): number {
			
 
				+  let hits = 0;
			
 
				+  for (const { query, expectedDoc } of queries) {
			
 
				+    const results = searchFn(query).slice(0, topK);
			
 
				+    if (results.some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
			
 
				+  }
			
 
				+  return hits / queries.length;
			
 
				+}
			
 
				+
			
 
				+// =============================================================================
			
 
				+// BM25 (Lexical) Tests - Fast, no model loading needed
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("BM25 Search (FTS)", () => {
			
 
				   let db: Database;
			
 
				 
			
 
				   beforeAll(() => {
			
 
				-    // Initialize database (INDEX_PATH already set at top of file)
			
 
				     db = getDb();
			
 
				 
			
 
				     // Load and index eval documents
			
@@ -78,71 +113,228 @@ describe("Search Quality Evaluation", () => {
 
				 
			
 
				   afterAll(() => {
			
 
				     closeDb();
			
 
				-    rmSync(tempDir, { recursive: true, force: true });
			
 
				   });
			
 
				 
			
 
				-  describe("BM25 Search (FTS)", () => {
			
 
				-    test("easy queries: ≥80% Hit@3", () => {
			
 
				-      const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
			
 
				-      let hits = 0;
			
 
				+  test("easy queries: ≥80% Hit@3", () => {
			
 
				+    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
			
 
				+    const hitRate = calcHitRate(easyQueries, q => searchFTS(db, q, 5), 3);
			
 
				+    expect(hitRate).toBeGreaterThanOrEqual(0.8);
			
 
				+  });
			
 
				 
			
 
				-      for (const { query, expectedDoc } of easyQueries) {
			
 
				-        const results = searchFTS(db, query, 5);
			
 
				-        const top3 = results.slice(0, 3);
			
 
				-        const found = top3.some(r => r.filepath.toLowerCase().includes(expectedDoc));
			
 
				-        if (found) hits++;
			
 
				-      }
			
 
				+  test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
			
 
				+    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
			
 
				+    const hitRate = calcHitRate(mediumQueries, q => searchFTS(db, q, 5), 3);
			
 
				+    expect(hitRate).toBeGreaterThanOrEqual(0.15);
			
 
				+  });
			
 
				+
			
 
				+  test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
			
 
				+    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
			
 
				+    const hitRate = calcHitRate(hardQueries, q => searchFTS(db, q, 5), 5);
			
 
				+    expect(hitRate).toBeGreaterThanOrEqual(0.15);
			
 
				+  });
			
 
				+
			
 
				+  test("overall Hit@3 ≥40% (BM25 baseline)", () => {
			
 
				+    const hitRate = calcHitRate(evalQueries, q => searchFTS(db, q, 5), 3);
			
 
				+    expect(hitRate).toBeGreaterThanOrEqual(0.4);
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Vector Search Tests - Requires embedding model
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("Vector Search", () => {
			
 
				+  let db: Database;
			
 
				+  let hasEmbeddings = false;
			
 
				 
			
 
				-      const hitRate = hits / easyQueries.length;
			
 
				-      expect(hitRate).toBeGreaterThanOrEqual(0.8);
			
 
				-    });
			
 
				+  beforeAll(async () => {
			
 
				+    db = getDb();
			
 
				 
			
 
				-    test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
			
 
				-      const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
			
 
				-      let hits = 0;
			
 
				+    // Check if embeddings already exist (from previous test run)
			
 
				+    const vecTable = db.prepare(
			
 
				+      `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
			
 
				+    ).get();
			
 
				 
			
 
				-      for (const { query, expectedDoc } of mediumQueries) {
			
 
				-        const results = searchFTS(db, query, 5);
			
 
				-        const top3 = results.slice(0, 3);
			
 
				-        const found = top3.some(r => r.filepath.toLowerCase().includes(expectedDoc));
			
 
				-        if (found) hits++;
			
 
				+    if (vecTable) {
			
 
				+      const count = db.prepare(`SELECT COUNT(*) as cnt FROM vectors_vec`).get() as { cnt: number };
			
 
				+      if (count.cnt > 0) {
			
 
				+        hasEmbeddings = true;
			
 
				+        return;
			
 
				       }
			
 
				+    }
			
 
				 
			
 
				-      const hitRate = hits / mediumQueries.length;
			
 
				-      // BM25 alone struggles with semantic queries - baseline is low
			
 
				-      expect(hitRate).toBeGreaterThanOrEqual(0.15);
			
 
				-    });
			
 
				+    // Generate embeddings for test documents
			
 
				+    const llm = getDefaultLlamaCpp();
			
 
				+    ensureVecTable(db, 768); // embeddinggemma uses 768 dimensions
			
 
				 
			
 
				-    test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
			
 
				-      const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
			
 
				-      let hits = 0;
			
 
				+    const evalDocsDir = join(import.meta.dir, "../test/eval-docs");
			
 
				+    const files = readdirSync(evalDocsDir).filter(f => f.endsWith(".md"));
			
 
				 
			
 
				-      for (const { query, expectedDoc } of hardQueries) {
			
 
				-        const results = searchFTS(db, query, 5);
			
 
				-        const found = results.some(r => r.filepath.toLowerCase().includes(expectedDoc));
			
 
				-        if (found) hits++;
			
 
				+    for (const file of files) {
			
 
				+      const content = readFileSync(join(evalDocsDir, file), "utf-8");
			
 
				+      const hash = Bun.hash(content).toString(16).slice(0, 12);
			
 
				+      const title = content.split("\n")[0]?.replace(/^#\s*/, "") || file;
			
 
				+
			
 
				+      // Chunk and embed
			
 
				+      const chunks = await chunkDocumentByTokens(content, llm);
			
 
				+      for (let seq = 0; seq < chunks.length; seq++) {
			
 
				+        const chunk = chunks[seq];
			
 
				+        const formatted = formatDocForEmbedding(chunk.text, title);
			
 
				+        const result = await llm.embed(formatted, { model: DEFAULT_EMBED_MODEL, isQuery: false });
			
 
				+        if (result?.embedding) {
			
 
				+          insertEmbedding(db, hash, seq, chunk.pos, result.embedding);
			
 
				+        }
			
 
				       }
			
 
				+    }
			
 
				+    hasEmbeddings = true;
			
 
				+  }, 120000); // 2 minute timeout for embedding generation
			
 
				 
			
 
				-      const hitRate = hits / hardQueries.length;
			
 
				-      // BM25 alone really struggles with vague queries
			
 
				-      expect(hitRate).toBeGreaterThanOrEqual(0.15);
			
 
				-    });
			
 
				+  afterAll(async () => {
			
 
				+    await disposeDefaultLlamaCpp();
			
 
				   });
			
 
				 
			
 
				-  describe("Overall Quality", () => {
			
 
				-    test("overall Hit@3 ≥40% (BM25 baseline)", () => {
			
 
				-      let hits = 0;
			
 
				+  test("easy queries: ≥60% Hit@3 (vector should match keywords too)", async () => {
			
 
				+    if (!hasEmbeddings) return; // Skip if embedding failed
			
 
				 
			
 
				-      for (const { query, expectedDoc } of evalQueries) {
			
 
				-        const results = searchFTS(db, query, 5);
			
 
				-        const top3 = results.slice(0, 3);
			
 
				-        const found = top3.some(r => r.filepath.toLowerCase().includes(expectedDoc));
			
 
				-        if (found) hits++;
			
 
				-      }
			
 
				+    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
			
 
				+    let hits = 0;
			
 
				+    for (const { query, expectedDoc } of easyQueries) {
			
 
				+      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
			
 
				+      if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
			
 
				+    }
			
 
				+    expect(hits / easyQueries.length).toBeGreaterThanOrEqual(0.6);
			
 
				+  }, 60000);
			
 
				+
			
 
				+  test("medium queries: ≥40% Hit@3 (vector excels at semantic)", async () => {
			
 
				+    if (!hasEmbeddings) return;
			
 
				+
			
 
				+    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
			
 
				+    let hits = 0;
			
 
				+    for (const { query, expectedDoc } of mediumQueries) {
			
 
				+      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
			
 
				+      if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
			
 
				+    }
			
 
				+    // Vector search should do better on semantic queries than BM25
			
 
				+    expect(hits / mediumQueries.length).toBeGreaterThanOrEqual(0.4);
			
 
				+  }, 60000);
			
 
				 
			
 
				-      const hitRate = hits / evalQueries.length;
			
 
				-      // BM25 alone: ~40% is baseline, hybrid should be higher
			
 
				-      expect(hitRate).toBeGreaterThanOrEqual(0.4);
			
 
				-    });
			
 
				+  test("hard queries: ≥30% Hit@5 (vector helps with vague queries)", async () => {
			
 
				+    if (!hasEmbeddings) return;
			
 
				+
			
 
				+    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
			
 
				+    let hits = 0;
			
 
				+    for (const { query, expectedDoc } of hardQueries) {
			
 
				+      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
			
 
				+      if (results.some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
			
 
				+    }
			
 
				+    expect(hits / hardQueries.length).toBeGreaterThanOrEqual(0.3);
			
 
				+  }, 60000);
			
 
				+
			
 
				+  test("overall Hit@3 ≥50% (vector baseline)", async () => {
			
 
				+    if (!hasEmbeddings) return;
			
 
				+
			
 
				+    let hits = 0;
			
 
				+    for (const { query, expectedDoc } of evalQueries) {
			
 
				+      const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
			
 
				+      if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
			
 
				+    }
			
 
				+    expect(hits / evalQueries.length).toBeGreaterThanOrEqual(0.5);
			
 
				+  }, 60000);
			
 
				+});
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Hybrid Search (RRF) Tests - Combines BM25 + Vector
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("Hybrid Search (RRF)", () => {
			
 
				+  let db: Database;
			
 
				+
			
 
				+  beforeAll(() => {
			
 
				+    db = getDb();
			
 
				   });
			
 
				+
			
 
				+  // Helper: run hybrid search with RRF fusion
			
 
				+  async function hybridSearch(query: string, limit: number = 10): Promise<RankedResult[]> {
			
 
				+    const rankedLists: RankedResult[][] = [];
			
 
				+
			
 
				+    // FTS results
			
 
				+    const ftsResults = searchFTS(db, query, 20);
			
 
				+    if (ftsResults.length > 0) {
			
 
				+      rankedLists.push(ftsResults.map(r => ({
			
 
				+        file: r.filepath,
			
 
				+        displayPath: r.displayPath,
			
 
				+        title: r.title,
			
 
				+        body: r.body || "",
			
 
				+        score: r.score
			
 
				+      })));
			
 
				+    }
			
 
				+
			
 
				+    // Vector results
			
 
				+    const vecResults = await searchVec(db, query, DEFAULT_EMBED_MODEL, 20);
			
 
				+    if (vecResults.length > 0) {
			
 
				+      rankedLists.push(vecResults.map(r => ({
			
 
				+        file: r.filepath,
			
 
				+        displayPath: r.displayPath,
			
 
				+        title: r.title,
			
 
				+        body: r.body || "",
			
 
				+        score: r.score
			
 
				+      })));
			
 
				+    }
			
 
				+
			
 
				+    if (rankedLists.length === 0) return [];
			
 
				+
			
 
				+    // Apply RRF fusion
			
 
				+    const fused = reciprocalRankFusion(rankedLists);
			
 
				+    return fused.slice(0, limit);
			
 
				+  }
			
 
				+
			
 
				+  test("easy queries: ≥80% Hit@3 (hybrid should match BM25)", async () => {
			
 
				+    const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
			
 
				+    let hits = 0;
			
 
				+    for (const { query, expectedDoc } of easyQueries) {
			
 
				+      const results = await hybridSearch(query);
			
 
				+      if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
			
 
				+    }
			
 
				+    expect(hits / easyQueries.length).toBeGreaterThanOrEqual(0.8);
			
 
				+  }, 60000);
			
 
				+
			
 
				+  test("medium queries: ≥50% Hit@3 (hybrid should beat both)", async () => {
			
 
				+    const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
			
 
				+    let hits = 0;
			
 
				+    for (const { query, expectedDoc } of mediumQueries) {
			
 
				+      const results = await hybridSearch(query);
			
 
				+      if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
			
 
				+    }
			
 
				+    // Hybrid should outperform both BM25 (15%) and vector (40%) alone
			
 
				+    expect(hits / mediumQueries.length).toBeGreaterThanOrEqual(0.5);
			
 
				+  }, 60000);
			
 
				+
			
 
				+  test("hard queries: ≥35% Hit@5 (hybrid combines signals)", async () => {
			
 
				+    const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
			
 
				+    let hits = 0;
			
 
				+    for (const { query, expectedDoc } of hardQueries) {
			
 
				+      const results = await hybridSearch(query);
			
 
				+      if (results.some(r => matchesExpected(r.file, expectedDoc))) hits++;
			
 
				+    }
			
 
				+    // Hybrid should beat BM25 (15%) and vector (30%)
			
 
				+    expect(hits / hardQueries.length).toBeGreaterThanOrEqual(0.35);
			
 
				+  }, 60000);
			
 
				+
			
 
				+  test("overall Hit@3 ≥60% (hybrid beats individual methods)", async () => {
			
 
				+    let hits = 0;
			
 
				+    for (const { query, expectedDoc } of evalQueries) {
			
 
				+      const results = await hybridSearch(query);
			
 
				+      if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
			
 
				+    }
			
 
				+    // Hybrid should beat BM25 (40%) and vector (50%)
			
 
				+    expect(hits / evalQueries.length).toBeGreaterThanOrEqual(0.6);
			
 
				+  }, 60000);
			
 
				+});
			
 
				+
			
 
				+// =============================================================================
			
 
				+// Cleanup
			
 
				+// =============================================================================
			
 
				+
			
 
				+afterAll(() => {
			
 
				+  rmSync(tempDir, { recursive: true, force: true });
			
 
				 });