|
@@ -3,6 +3,11 @@
|
|
|
*
|
|
*
|
|
|
* Tests search quality against synthetic documents with known-answer queries.
|
|
* Tests search quality against synthetic documents with known-answer queries.
|
|
|
* Validates that search improvements don't regress quality.
|
|
* Validates that search improvements don't regress quality.
|
|
|
|
|
+ *
|
|
|
|
|
+ * Three test suites:
|
|
|
|
|
+ * 1. BM25 (FTS) - lexical search baseline
|
|
|
|
|
+ * 2. Vector Search - semantic search with embeddings
|
|
|
|
|
+ * 3. Hybrid (RRF) - combined lexical + vector with rank fusion
|
|
|
*/
|
|
*/
|
|
|
|
|
|
|
|
import { describe, test, expect, beforeAll, afterAll } from "bun:test";
|
|
import { describe, test, expect, beforeAll, afterAll } from "bun:test";
|
|
@@ -19,9 +24,17 @@ import {
|
|
|
getDb,
|
|
getDb,
|
|
|
closeDb,
|
|
closeDb,
|
|
|
searchFTS,
|
|
searchFTS,
|
|
|
|
|
+ searchVec,
|
|
|
insertDocument,
|
|
insertDocument,
|
|
|
insertContent,
|
|
insertContent,
|
|
|
|
|
+ ensureVecTable,
|
|
|
|
|
+ insertEmbedding,
|
|
|
|
|
+ chunkDocumentByTokens,
|
|
|
|
|
+ reciprocalRankFusion,
|
|
|
|
|
+ DEFAULT_EMBED_MODEL,
|
|
|
|
|
+ type RankedResult,
|
|
|
} from "./store";
|
|
} from "./store";
|
|
|
|
|
+import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, formatDocForEmbedding } from "./llm";
|
|
|
|
|
|
|
|
// Eval queries with expected documents
|
|
// Eval queries with expected documents
|
|
|
const evalQueries: {
|
|
const evalQueries: {
|
|
@@ -54,11 +67,33 @@ const evalQueries: {
|
|
|
{ query: "beta program 47 bugs", expectedDoc: "product-launch", difficulty: "hard" },
|
|
{ query: "beta program 47 bugs", expectedDoc: "product-launch", difficulty: "hard" },
|
|
|
];
|
|
];
|
|
|
|
|
|
|
|
-describe("Search Quality Evaluation", () => {
|
|
|
|
|
|
|
+// Helper to check if result matches expected doc
|
|
|
|
|
+function matchesExpected(filepath: string, expectedDoc: string): boolean {
|
|
|
|
|
+ return filepath.toLowerCase().includes(expectedDoc);
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+// Helper to calculate hit rate
|
|
|
|
|
+function calcHitRate(
|
|
|
|
|
+ queries: typeof evalQueries,
|
|
|
|
|
+ searchFn: (query: string) => { filepath: string }[],
|
|
|
|
|
+ topK: number
|
|
|
|
|
+): number {
|
|
|
|
|
+ let hits = 0;
|
|
|
|
|
+ for (const { query, expectedDoc } of queries) {
|
|
|
|
|
+ const results = searchFn(query).slice(0, topK);
|
|
|
|
|
+ if (results.some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
|
|
|
|
|
+ }
|
|
|
|
|
+ return hits / queries.length;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+// BM25 (Lexical) Tests - Fast, no model loading needed
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+
|
|
|
|
|
+describe("BM25 Search (FTS)", () => {
|
|
|
let db: Database;
|
|
let db: Database;
|
|
|
|
|
|
|
|
beforeAll(() => {
|
|
beforeAll(() => {
|
|
|
- // Initialize database (INDEX_PATH already set at top of file)
|
|
|
|
|
db = getDb();
|
|
db = getDb();
|
|
|
|
|
|
|
|
// Load and index eval documents
|
|
// Load and index eval documents
|
|
@@ -78,71 +113,228 @@ describe("Search Quality Evaluation", () => {
|
|
|
|
|
|
|
|
afterAll(() => {
|
|
afterAll(() => {
|
|
|
closeDb();
|
|
closeDb();
|
|
|
- rmSync(tempDir, { recursive: true, force: true });
|
|
|
|
|
});
|
|
});
|
|
|
|
|
|
|
|
- describe("BM25 Search (FTS)", () => {
|
|
|
|
|
- test("easy queries: ≥80% Hit@3", () => {
|
|
|
|
|
- const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
|
|
|
|
|
- let hits = 0;
|
|
|
|
|
|
|
+ test("easy queries: ≥80% Hit@3", () => {
|
|
|
|
|
+ const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
|
|
|
|
|
+ const hitRate = calcHitRate(easyQueries, q => searchFTS(db, q, 5), 3);
|
|
|
|
|
+ expect(hitRate).toBeGreaterThanOrEqual(0.8);
|
|
|
|
|
+ });
|
|
|
|
|
|
|
|
- for (const { query, expectedDoc } of easyQueries) {
|
|
|
|
|
- const results = searchFTS(db, query, 5);
|
|
|
|
|
- const top3 = results.slice(0, 3);
|
|
|
|
|
- const found = top3.some(r => r.filepath.toLowerCase().includes(expectedDoc));
|
|
|
|
|
- if (found) hits++;
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
|
|
|
|
|
+ const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
|
|
|
|
|
+ const hitRate = calcHitRate(mediumQueries, q => searchFTS(db, q, 5), 3);
|
|
|
|
|
+ expect(hitRate).toBeGreaterThanOrEqual(0.15);
|
|
|
|
|
+ });
|
|
|
|
|
+
|
|
|
|
|
+ test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
|
|
|
|
|
+ const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
|
|
|
|
|
+ const hitRate = calcHitRate(hardQueries, q => searchFTS(db, q, 5), 5);
|
|
|
|
|
+ expect(hitRate).toBeGreaterThanOrEqual(0.15);
|
|
|
|
|
+ });
|
|
|
|
|
+
|
|
|
|
|
+ test("overall Hit@3 ≥40% (BM25 baseline)", () => {
|
|
|
|
|
+ const hitRate = calcHitRate(evalQueries, q => searchFTS(db, q, 5), 3);
|
|
|
|
|
+ expect(hitRate).toBeGreaterThanOrEqual(0.4);
|
|
|
|
|
+ });
|
|
|
|
|
+});
|
|
|
|
|
+
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+// Vector Search Tests - Requires embedding model
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+
|
|
|
|
|
+describe("Vector Search", () => {
|
|
|
|
|
+ let db: Database;
|
|
|
|
|
+ let hasEmbeddings = false;
|
|
|
|
|
|
|
|
- const hitRate = hits / easyQueries.length;
|
|
|
|
|
- expect(hitRate).toBeGreaterThanOrEqual(0.8);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ beforeAll(async () => {
|
|
|
|
|
+ db = getDb();
|
|
|
|
|
|
|
|
- test("medium queries: ≥15% Hit@3 (BM25 struggles with semantic)", () => {
|
|
|
|
|
- const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
|
|
|
|
|
- let hits = 0;
|
|
|
|
|
|
|
+ // Check if embeddings already exist (from previous test run)
|
|
|
|
|
+ const vecTable = db.prepare(
|
|
|
|
|
+ `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
|
|
|
|
|
+ ).get();
|
|
|
|
|
|
|
|
- for (const { query, expectedDoc } of mediumQueries) {
|
|
|
|
|
- const results = searchFTS(db, query, 5);
|
|
|
|
|
- const top3 = results.slice(0, 3);
|
|
|
|
|
- const found = top3.some(r => r.filepath.toLowerCase().includes(expectedDoc));
|
|
|
|
|
- if (found) hits++;
|
|
|
|
|
|
|
+ if (vecTable) {
|
|
|
|
|
+ const count = db.prepare(`SELECT COUNT(*) as cnt FROM vectors_vec`).get() as { cnt: number };
|
|
|
|
|
+ if (count.cnt > 0) {
|
|
|
|
|
+ hasEmbeddings = true;
|
|
|
|
|
+ return;
|
|
|
}
|
|
}
|
|
|
|
|
+ }
|
|
|
|
|
|
|
|
- const hitRate = hits / mediumQueries.length;
|
|
|
|
|
- // BM25 alone struggles with semantic queries - baseline is low
|
|
|
|
|
- expect(hitRate).toBeGreaterThanOrEqual(0.15);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ // Generate embeddings for test documents
|
|
|
|
|
+ const llm = getDefaultLlamaCpp();
|
|
|
|
|
+ ensureVecTable(db, 768); // embeddinggemma uses 768 dimensions
|
|
|
|
|
|
|
|
- test("hard queries: ≥15% Hit@5 (BM25 baseline)", () => {
|
|
|
|
|
- const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
|
|
|
|
|
- let hits = 0;
|
|
|
|
|
|
|
+ const evalDocsDir = join(import.meta.dir, "../test/eval-docs");
|
|
|
|
|
+ const files = readdirSync(evalDocsDir).filter(f => f.endsWith(".md"));
|
|
|
|
|
|
|
|
- for (const { query, expectedDoc } of hardQueries) {
|
|
|
|
|
- const results = searchFTS(db, query, 5);
|
|
|
|
|
- const found = results.some(r => r.filepath.toLowerCase().includes(expectedDoc));
|
|
|
|
|
- if (found) hits++;
|
|
|
|
|
|
|
+ for (const file of files) {
|
|
|
|
|
+ const content = readFileSync(join(evalDocsDir, file), "utf-8");
|
|
|
|
|
+ const hash = Bun.hash(content).toString(16).slice(0, 12);
|
|
|
|
|
+ const title = content.split("\n")[0]?.replace(/^#\s*/, "") || file;
|
|
|
|
|
+
|
|
|
|
|
+ // Chunk and embed
|
|
|
|
|
+ const chunks = await chunkDocumentByTokens(content, llm);
|
|
|
|
|
+ for (let seq = 0; seq < chunks.length; seq++) {
|
|
|
|
|
+ const chunk = chunks[seq];
|
|
|
|
|
+ const formatted = formatDocForEmbedding(chunk.text, title);
|
|
|
|
|
+ const result = await llm.embed(formatted, { model: DEFAULT_EMBED_MODEL, isQuery: false });
|
|
|
|
|
+ if (result?.embedding) {
|
|
|
|
|
+ insertEmbedding(db, hash, seq, chunk.pos, result.embedding);
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
|
|
+ }
|
|
|
|
|
+ hasEmbeddings = true;
|
|
|
|
|
+ }, 120000); // 2 minute timeout for embedding generation
|
|
|
|
|
|
|
|
- const hitRate = hits / hardQueries.length;
|
|
|
|
|
- // BM25 alone really struggles with vague queries
|
|
|
|
|
- expect(hitRate).toBeGreaterThanOrEqual(0.15);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ afterAll(async () => {
|
|
|
|
|
+ await disposeDefaultLlamaCpp();
|
|
|
});
|
|
});
|
|
|
|
|
|
|
|
- describe("Overall Quality", () => {
|
|
|
|
|
- test("overall Hit@3 ≥40% (BM25 baseline)", () => {
|
|
|
|
|
- let hits = 0;
|
|
|
|
|
|
|
+ test("easy queries: ≥60% Hit@3 (vector should match keywords too)", async () => {
|
|
|
|
|
+ if (!hasEmbeddings) return; // Skip if embedding failed
|
|
|
|
|
|
|
|
- for (const { query, expectedDoc } of evalQueries) {
|
|
|
|
|
- const results = searchFTS(db, query, 5);
|
|
|
|
|
- const top3 = results.slice(0, 3);
|
|
|
|
|
- const found = top3.some(r => r.filepath.toLowerCase().includes(expectedDoc));
|
|
|
|
|
- if (found) hits++;
|
|
|
|
|
- }
|
|
|
|
|
|
|
+ const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
|
|
|
|
|
+ let hits = 0;
|
|
|
|
|
+ for (const { query, expectedDoc } of easyQueries) {
|
|
|
|
|
+ const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
|
|
|
|
|
+ if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
|
|
|
|
|
+ }
|
|
|
|
|
+ expect(hits / easyQueries.length).toBeGreaterThanOrEqual(0.6);
|
|
|
|
|
+ }, 60000);
|
|
|
|
|
+
|
|
|
|
|
+ test("medium queries: ≥40% Hit@3 (vector excels at semantic)", async () => {
|
|
|
|
|
+ if (!hasEmbeddings) return;
|
|
|
|
|
+
|
|
|
|
|
+ const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
|
|
|
|
|
+ let hits = 0;
|
|
|
|
|
+ for (const { query, expectedDoc } of mediumQueries) {
|
|
|
|
|
+ const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
|
|
|
|
|
+ if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
|
|
|
|
|
+ }
|
|
|
|
|
+ // Vector search should do better on semantic queries than BM25
|
|
|
|
|
+ expect(hits / mediumQueries.length).toBeGreaterThanOrEqual(0.4);
|
|
|
|
|
+ }, 60000);
|
|
|
|
|
|
|
|
- const hitRate = hits / evalQueries.length;
|
|
|
|
|
- // BM25 alone: ~40% is baseline, hybrid should be higher
|
|
|
|
|
- expect(hitRate).toBeGreaterThanOrEqual(0.4);
|
|
|
|
|
- });
|
|
|
|
|
|
|
+ test("hard queries: ≥30% Hit@5 (vector helps with vague queries)", async () => {
|
|
|
|
|
+ if (!hasEmbeddings) return;
|
|
|
|
|
+
|
|
|
|
|
+ const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
|
|
|
|
|
+ let hits = 0;
|
|
|
|
|
+ for (const { query, expectedDoc } of hardQueries) {
|
|
|
|
|
+ const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
|
|
|
|
|
+ if (results.some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
|
|
|
|
|
+ }
|
|
|
|
|
+ expect(hits / hardQueries.length).toBeGreaterThanOrEqual(0.3);
|
|
|
|
|
+ }, 60000);
|
|
|
|
|
+
|
|
|
|
|
+ test("overall Hit@3 ≥50% (vector baseline)", async () => {
|
|
|
|
|
+ if (!hasEmbeddings) return;
|
|
|
|
|
+
|
|
|
|
|
+ let hits = 0;
|
|
|
|
|
+ for (const { query, expectedDoc } of evalQueries) {
|
|
|
|
|
+ const results = await searchVec(db, query, DEFAULT_EMBED_MODEL, 5);
|
|
|
|
|
+ if (results.slice(0, 3).some(r => matchesExpected(r.filepath, expectedDoc))) hits++;
|
|
|
|
|
+ }
|
|
|
|
|
+ expect(hits / evalQueries.length).toBeGreaterThanOrEqual(0.5);
|
|
|
|
|
+ }, 60000);
|
|
|
|
|
+});
|
|
|
|
|
+
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+// Hybrid Search (RRF) Tests - Combines BM25 + Vector
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+
|
|
|
|
|
+describe("Hybrid Search (RRF)", () => {
|
|
|
|
|
+ let db: Database;
|
|
|
|
|
+
|
|
|
|
|
+ beforeAll(() => {
|
|
|
|
|
+ db = getDb();
|
|
|
});
|
|
});
|
|
|
|
|
+
|
|
|
|
|
+ // Helper: run hybrid search with RRF fusion
|
|
|
|
|
+ async function hybridSearch(query: string, limit: number = 10): Promise<RankedResult[]> {
|
|
|
|
|
+ const rankedLists: RankedResult[][] = [];
|
|
|
|
|
+
|
|
|
|
|
+ // FTS results
|
|
|
|
|
+ const ftsResults = searchFTS(db, query, 20);
|
|
|
|
|
+ if (ftsResults.length > 0) {
|
|
|
|
|
+ rankedLists.push(ftsResults.map(r => ({
|
|
|
|
|
+ file: r.filepath,
|
|
|
|
|
+ displayPath: r.displayPath,
|
|
|
|
|
+ title: r.title,
|
|
|
|
|
+ body: r.body || "",
|
|
|
|
|
+ score: r.score
|
|
|
|
|
+ })));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // Vector results
|
|
|
|
|
+ const vecResults = await searchVec(db, query, DEFAULT_EMBED_MODEL, 20);
|
|
|
|
|
+ if (vecResults.length > 0) {
|
|
|
|
|
+ rankedLists.push(vecResults.map(r => ({
|
|
|
|
|
+ file: r.filepath,
|
|
|
|
|
+ displayPath: r.displayPath,
|
|
|
|
|
+ title: r.title,
|
|
|
|
|
+ body: r.body || "",
|
|
|
|
|
+ score: r.score
|
|
|
|
|
+ })));
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if (rankedLists.length === 0) return [];
|
|
|
|
|
+
|
|
|
|
|
+ // Apply RRF fusion
|
|
|
|
|
+ const fused = reciprocalRankFusion(rankedLists);
|
|
|
|
|
+ return fused.slice(0, limit);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ test("easy queries: ≥80% Hit@3 (hybrid should match BM25)", async () => {
|
|
|
|
|
+ const easyQueries = evalQueries.filter(q => q.difficulty === "easy");
|
|
|
|
|
+ let hits = 0;
|
|
|
|
|
+ for (const { query, expectedDoc } of easyQueries) {
|
|
|
|
|
+ const results = await hybridSearch(query);
|
|
|
|
|
+ if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
|
|
|
|
|
+ }
|
|
|
|
|
+ expect(hits / easyQueries.length).toBeGreaterThanOrEqual(0.8);
|
|
|
|
|
+ }, 60000);
|
|
|
|
|
+
|
|
|
|
|
+ test("medium queries: ≥50% Hit@3 (hybrid should beat both)", async () => {
|
|
|
|
|
+ const mediumQueries = evalQueries.filter(q => q.difficulty === "medium");
|
|
|
|
|
+ let hits = 0;
|
|
|
|
|
+ for (const { query, expectedDoc } of mediumQueries) {
|
|
|
|
|
+ const results = await hybridSearch(query);
|
|
|
|
|
+ if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
|
|
|
|
|
+ }
|
|
|
|
|
+ // Hybrid should outperform both BM25 (15%) and vector (40%) alone
|
|
|
|
|
+ expect(hits / mediumQueries.length).toBeGreaterThanOrEqual(0.5);
|
|
|
|
|
+ }, 60000);
|
|
|
|
|
+
|
|
|
|
|
+ test("hard queries: ≥35% Hit@5 (hybrid combines signals)", async () => {
|
|
|
|
|
+ const hardQueries = evalQueries.filter(q => q.difficulty === "hard");
|
|
|
|
|
+ let hits = 0;
|
|
|
|
|
+ for (const { query, expectedDoc } of hardQueries) {
|
|
|
|
|
+ const results = await hybridSearch(query);
|
|
|
|
|
+ if (results.some(r => matchesExpected(r.file, expectedDoc))) hits++;
|
|
|
|
|
+ }
|
|
|
|
|
+ // Hybrid should beat BM25 (15%) and vector (30%)
|
|
|
|
|
+ expect(hits / hardQueries.length).toBeGreaterThanOrEqual(0.35);
|
|
|
|
|
+ }, 60000);
|
|
|
|
|
+
|
|
|
|
|
+ test("overall Hit@3 ≥60% (hybrid beats individual methods)", async () => {
|
|
|
|
|
+ let hits = 0;
|
|
|
|
|
+ for (const { query, expectedDoc } of evalQueries) {
|
|
|
|
|
+ const results = await hybridSearch(query);
|
|
|
|
|
+ if (results.slice(0, 3).some(r => matchesExpected(r.file, expectedDoc))) hits++;
|
|
|
|
|
+ }
|
|
|
|
|
+ // Hybrid should beat BM25 (40%) and vector (50%)
|
|
|
|
|
+ expect(hits / evalQueries.length).toBeGreaterThanOrEqual(0.6);
|
|
|
|
|
+ }, 60000);
|
|
|
|
|
+});
|
|
|
|
|
+
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+// Cleanup
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+
|
|
|
|
|
+afterAll(() => {
|
|
|
|
|
+ rmSync(tempDir, { recursive: true, force: true });
|
|
|
});
|
|
});
|