Browse Source

Add regression test and explanatory comments

- Add detailed comments explaining why two-step query is necessary
- Add regression test for sqlite-vec JOIN hang bug
- Link to PR in comments for future reference

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Brendan McCord 4 months ago
parent
commit
01d74727f7
3 changed files with 46 additions and 2 deletions
  1. 4 1
      src/qmd.ts
  2. 36 0
      src/store.test.ts
  3. 6 1
      src/store.ts

+ 4 - 1
src/qmd.ts

@@ -1972,7 +1972,10 @@ async function vectorSearch(query: string, opts: OutputOptions, model: string =
   const perQueryLimit = opts.all ? 500 : 20;
   const allResults = new Map<string, { file: string; displayPath: string; title: string; body: string; score: number; hash: string }>();
 
-  // Run vector searches sequentially (node-llama-cpp embedding context doesn't handle concurrent calls)
+  // IMPORTANT: Run vector searches sequentially, not with Promise.all.
+  // node-llama-cpp's embedding context hangs when multiple concurrent embed() calls
+  // are made. This is a known limitation of the LlamaEmbeddingContext.
+  // See: https://github.com/tobi/qmd/pull/23
   for (const q of vectorQueries) {
     const vecResults = await searchVec(db, q, model, perQueryLimit, collectionName as any);
     for (const r of vecResults) {

+ 36 - 0
src/store.test.ts

@@ -1855,6 +1855,42 @@ describe("LlamaCpp Integration", () => {
     await cleanupTestDb(store);
   });
 
+  // Regression test for https://github.com/tobi/qmd/pull/23
+  // sqlite-vec virtual tables hang when combined with JOINs in the same query.
+  // The fix uses a two-step approach: vector query first, then separate JOINs.
+  test("searchVec uses two-step query to avoid sqlite-vec JOIN hang", async () => {
+    const store = await createTestStore();
+
+    // Add test document with embedding
+    await store.addDocument("collection", "test.md", "Test content for vector search");
+    await store.updateIndex();
+
+    // Generate embedding for the test doc
+    const llm = (await import("./llm.js")).getDefaultLlamaCpp();
+    const embedding = await llm.embed("Test content for vector search");
+    if (embedding) {
+      // Manually insert vector to test the query path
+      const hash = hashContent("Test content for vector search");
+      store.db.prepare(`
+        INSERT OR REPLACE INTO content_vectors (hash, seq, pos) VALUES (?, 0, 0)
+      `).run(hash);
+      store.db.prepare(`
+        INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)
+      `).run(`${hash}_0`, new Float32Array(embedding.embedding));
+    }
+
+    // This should complete quickly (not hang) due to the two-step fix
+    const startTime = Date.now();
+    const results = await store.searchVec("test content", "embeddinggemma", 5);
+    const elapsed = Date.now() - startTime;
+
+    // If the query took more than 10 seconds, something is wrong
+    // (the hang bug would cause it to never return)
+    expect(elapsed).toBeLessThan(10000);
+
+    await cleanupTestDb(store);
+  }, 30000);
+
   test("expandQuery returns original plus expanded queries", async () => {
     const store = await createTestStore();
 

+ 6 - 1
src/store.ts

@@ -1679,7 +1679,12 @@ export async function searchVec(db: Database, query: string, model: string, limi
   const embedding = await getEmbedding(query, model, true);
   if (!embedding) return [];
 
-  // Step 1: Get vector matches (sqlite-vec doesn't work with JOINs)
+  // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
+  // hang indefinitely when combined with JOINs in the same query. Do NOT try to
+  // "optimize" this by combining into a single query with JOINs - it will break.
+  // See: https://github.com/tobi/qmd/pull/23
+
+  // Step 1: Get vector matches from sqlite-vec (no JOINs allowed)
   const vecResults = db.prepare(`
     SELECT hash_seq, distance
     FROM vectors_vec