瀏覽代碼

Merge origin/nodejs: Node.js compat, perf improvements, vitest

Brings in Node.js compatibility (tsx, vitest), GPU auto-detection,
parallel embedding/reranking contexts, and flash attention support.
Preserves @tobilu/qmd package scope and publish config from main.
Tobi Lutke 3 月之前
父節點
當前提交
4df5505bd6
共有 9 個文件被更改,包括 769 次插入156 次删除
  1. 12 5
      README.md
  2. 28 12
      package.json
  3. 327 0
      src/bench-rerank.ts
  4. 9 4
      src/llm.test.ts
  5. 245 56
      src/llm.ts
  6. 61 30
      src/mcp.ts
  7. 34 5
      src/qmd.ts
  8. 2 2
      src/store.test.ts
  9. 51 42
      src/store.ts

+ 12 - 5
README.md

@@ -9,9 +9,15 @@ QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking
 ## Quick Start
 
 ```sh
-# Install globally
+# Install globally (Node or Bun)
+npm install -g @tobilu/qmd
+# or
 bun install -g @tobilu/qmd
 
+# Or run directly
+npx @tobilu/qmd ...
+bunx @tobilu/qmd ...
+
 # Create collections for your notes, docs, and meeting transcripts
 qmd collection add ~/notes --name notes
 qmd collection add ~/Documents/meetings --name meetings
@@ -231,6 +237,7 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
 
 ### System Requirements
 
+- **Node.js** >= 22
 - **Bun** >= 1.0.0
 - **macOS**: Homebrew SQLite (for extension support)
   ```sh
@@ -252,18 +259,18 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
 ## Installation
 
 ```sh
+npm install -g @tobilu/qmd
+# or
 bun install -g @tobilu/qmd
 ```
 
-Make sure `~/.bun/bin` is in your PATH.
-
 ### Development
 
 ```sh
 git clone https://github.com/tobi/qmd
 cd qmd
-bun install
-bun link
+npm install
+npm link
 ```
 
 ## Usage

+ 28 - 12
package.json

@@ -1,6 +1,6 @@
 {
   "name": "@tobilu/qmd",
-  "version": "0.9.0",
+  "version": "0.9.9",
   "description": "Query Markup Documents - On-device hybrid search for markdown files with BM25, vector search, and LLM reranking",
   "type": "module",
   "bin": {
@@ -15,15 +15,26 @@
     "CHANGELOG.md"
   ],
   "scripts": {
-    "test": "bun test --preload ./src/test-preload.ts",
-    "qmd": "bun src/qmd.ts",
-    "index": "bun src/qmd.ts index",
-    "vector": "bun src/qmd.ts vector",
-    "search": "bun src/qmd.ts search",
-    "vsearch": "bun src/qmd.ts vsearch",
-    "rerank": "bun src/qmd.ts rerank",
-    "link": "bun link",
-    "inspector": "npx @modelcontextprotocol/inspector bun src/qmd.ts mcp",
+    "test": "vitest run",
+    "test:unit": "vitest run --reporter=verbose src/*.test.ts",
+    "test:models": "vitest run --reporter=verbose src/models/*.test.ts",
+    "test:integration": "vitest run --reporter=verbose src/integration/*.test.ts",
+    "test:unit:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts",
+    "test:models:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts",
+    "test:integration:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts",
+    "test:unit:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts",
+    "test:models:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts",
+    "test:integration:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts",
+    "test:ci:bun": "npm run test:unit:bun && npm run test:models:bun && npm run test:integration:bun",
+    "test:ci:node": "npm run test:unit:node && npm run test:models:node && npm run test:integration:node",
+    "test:ci": "npm run test:unit && npm run test:models && npm run test:integration",
+    "qmd": "tsx src/qmd.ts",
+    "index": "tsx src/qmd.ts index",
+    "vector": "tsx src/qmd.ts vector",
+    "search": "tsx src/qmd.ts search",
+    "vsearch": "tsx src/qmd.ts vsearch",
+    "rerank": "tsx src/qmd.ts rerank",
+    "inspector": "npx @modelcontextprotocol/inspector tsx src/qmd.ts mcp",
     "release": "./scripts/release.sh"
   },
   "publishConfig": {
@@ -39,7 +50,10 @@
   },
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.25.1",
+    "better-sqlite3": "^11.0.0",
+    "fast-glob": "^3.3.0",
     "node-llama-cpp": "^3.14.5",
+    "picomatch": "^4.0.0",
     "sqlite-vec": "^0.1.7-alpha.2",
     "yaml": "^2.8.2",
     "zod": "^4.2.1"
@@ -51,13 +65,15 @@
     "sqlite-vec-win32-x64": "^0.1.7-alpha.2"
   },
   "devDependencies": {
-    "@types/bun": "latest"
+    "@types/better-sqlite3": "^7.6.0",
+    "tsx": "^4.0.0",
+    "vitest": "^3.0.0"
   },
   "peerDependencies": {
     "typescript": "^5.9.3"
   },
   "engines": {
-    "bun": ">=1.0.0"
+    "node": ">=22.0.0"
   },
   "keywords": [
     "markdown",

+ 327 - 0
src/bench-rerank.ts

@@ -0,0 +1,327 @@
+#!/usr/bin/env bun
+/**
+ * QMD Reranker Benchmark
+ *
+ * Measures reranking performance across different configurations.
+ * Reports device, parallelism, memory, VRAM, and throughput.
+ *
+ * Usage:
+ *   bun src/bench-rerank.ts              # full benchmark
+ *   bun src/bench-rerank.ts --quick      # quick smoke test (10 docs, 1 iteration)
+ *   bun src/bench-rerank.ts --docs 100   # custom doc count
+ */
+
+import {
+  getLlama,
+  getLlamaGpuTypes,
+  resolveModelFile,
+  LlamaLogLevel,
+  type Llama,
+  type LlamaModel,
+} from "node-llama-cpp";
+import { homedir } from "os";
+import { join } from "path";
+import { cpus } from "os";
+
+// ============================================================================
+// Config
+// ============================================================================
+
+// Reranker model URI, local model cache directory, and context size used for every run.
+const RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
+const MODEL_CACHE = join(homedir(), ".cache", "qmd", "models");
+const CONTEXT_SIZE = 2048;
+
+// CLI flags: --quick (small smoke test) and --docs <n> (custom document count).
+const args = process.argv.slice(2);
+const quick = args.includes("--quick");
+const docsIdx = args.indexOf("--docs");
+const DEFAULT_DOC_COUNT = quick ? 10 : 40;
+// A missing or non-numeric --docs value would otherwise parse to NaN and
+// produce an empty document set, so reject it before any model is loaded.
+const requestedDocCount = docsIdx >= 0
+  ? Number.parseInt(args[docsIdx + 1] ?? "", 10)
+  : DEFAULT_DOC_COUNT;
+if (!Number.isInteger(requestedDocCount) || requestedDocCount < 1) {
+  console.error(`--docs expects a positive integer, got: ${args[docsIdx + 1] ?? "(nothing)"}`);
+  process.exit(1);
+}
+const DOC_COUNT = requestedDocCount;
+const ITERATIONS = quick ? 1 : 3;
+const PARALLEL_CONFIGS = quick ? [1, 4] : [1, 2, 4, 8];
+
+// ============================================================================
+// Test data — realistic-ish chunks of varying length
+// ============================================================================
+
+const QUERY = "How do AI agents work and what are their limitations?";
+
+/**
+ * Build `n` benchmark documents by cycling through a fixed set of ten
+ * sample passages (document i is passage i modulo ten).
+ */
+function generateDocs(n: number): string[] {
+  const passages: string[] = [
+    "Artificial intelligence agents are software systems that perceive their environment and take actions to achieve goals. They use techniques like reinforcement learning, planning, and natural language processing to operate autonomously.",
+    "The transformer architecture, introduced in 2017, revolutionized natural language processing. Self-attention mechanisms allow models to weigh the importance of different parts of input sequences when generating outputs.",
+    "Machine learning models require careful evaluation to avoid overfitting. Cross-validation, holdout sets, and metrics like precision, recall, and F1 score help assess generalization performance.",
+    "Retrieval-augmented generation combines information retrieval with language models. Documents are embedded into vector spaces, retrieved based on query similarity, and used as context for generation.",
+    "Neural network training involves forward propagation, loss computation, and backpropagation. Optimizers like Adam and SGD adjust weights to minimize the loss function over training iterations.",
+    "Large language models exhibit emergent capabilities at scale, including few-shot learning, chain-of-thought reasoning, and instruction following. These properties were not explicitly trained for.",
+    "Embedding models convert text into dense vector representations that capture semantic meaning. Similar texts produce similar vectors, enabling efficient similarity search and clustering.",
+    "Autonomous agents face challenges including hallucination, lack of grounding, limited planning horizons, and difficulty with multi-step reasoning. Safety and alignment remain open research problems.",
+    "The attention mechanism computes query-key-value interactions to determine which parts of the input are most relevant. Multi-head attention allows the model to attend to different representation subspaces.",
+    "Fine-tuning adapts a pre-trained model to specific tasks using domain-specific data. Techniques like LoRA reduce the number of trainable parameters while maintaining performance.",
+  ];
+  const docs: string[] = [];
+  // Truncate like an array length would, so fractional counts round toward zero.
+  const total = Math.trunc(n);
+  for (let idx = 0; idx < total; idx++) {
+    docs.push(passages[idx % passages.length]!);
+  }
+  return docs;
+}
+
+// ============================================================================
+// Helpers
+// ============================================================================
+
+/**
+ * Format a byte count for display: KB below 1 MiB, MB below 1 GiB
+ * (both with one decimal), otherwise GB with two decimals.
+ */
+function formatBytes(bytes: number): string {
+  const KIB = 1024;
+  const MIB = KIB * 1024;
+  const GIB = MIB * 1024;
+
+  if (bytes < MIB) {
+    return `${(bytes / KIB).toFixed(1)} KB`;
+  }
+  if (bytes < GIB) {
+    return `${(bytes / MIB).toFixed(1)} MB`;
+  }
+  return `${(bytes / GIB).toFixed(2)} GB`;
+}
+
+/** Snapshot the process's resident set size and used heap, both in bytes. */
+function getMemUsage(): { rss: number; heapUsed: number } {
+  const { rss, heapUsed } = process.memoryUsage();
+  return { rss, heapUsed };
+}
+
+/**
+ * Median of a list of numbers. The input is not modified; for an
+ * even-length list the two middle values are averaged. An empty list
+ * yields NaN.
+ */
+function median(arr: number[]): number {
+  const ordered = arr.slice().sort((x, y) => x - y);
+  const half = Math.floor(ordered.length / 2);
+  const upper = ordered[half]!;
+  if (ordered.length % 2 !== 0) {
+    return upper;
+  }
+  return (ordered[half - 1]! + upper) / 2;
+}
+
+// ============================================================================
+// Benchmark runner
+// ============================================================================
+
+/** Outcome of benchmarking one parallelism / flash-attention configuration. */
+interface BenchResult {
+  // Number of ranking contexts the run actually used.
+  parallelism: number;
+  // Context size each ranking context was created with.
+  contextSize: number;
+  // Flash-attention setting recorded for the run.
+  flashAttention: boolean;
+  times: number[];       // ms per run
+  // Median of `times`, in milliseconds.
+  medianMs: number;
+  // Throughput derived from the document count and `medianMs`.
+  docsPerSec: number;
+  vramPerContext: number; // bytes
+  totalVram: number;      // bytes
+  peakRss: number;        // bytes
+}
+
+/**
+ * Benchmark reranking of `docs` against QUERY using up to `parallelism`
+ * ranking contexts.
+ *
+ * Contexts are created one at a time; if creation fails after at least one
+ * context exists, the run continues with the contexts obtained so far. If the
+ * very first creation fails, it is retried once without flash attention.
+ * The documents are split into contiguous chunks, one per context, and ranked
+ * concurrently for ITERATIONS timed passes (after one untimed warm-up).
+ *
+ * @param model       Loaded reranker model used to create ranking contexts.
+ * @param llama       Llama instance, used for GPU/VRAM queries and core count.
+ * @param docs        Documents to rank.
+ * @param parallelism Requested number of ranking contexts.
+ * @param flash       Whether to request flash attention.
+ * @returns Timing, throughput, and memory figures. `parallelism` and
+ *          `flashAttention` reflect what was actually used, not what was requested.
+ * @throws If no context can be created or any score is NaN or outside [0, 1].
+ */
+async function benchmarkConfig(
+  model: LlamaModel,
+  llama: Llama,
+  docs: string[],
+  parallelism: number,
+  flash: boolean,
+): Promise<BenchResult> {
+  // Measure VRAM before
+  const vramBefore = llama.gpu ? await llama.getVramState() : null;
+
+  // Create contexts. On CPU, split threads evenly across contexts.
+  const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0;
+  const threadOptions = cpuThreads > 0 ? { threads: cpuThreads } : {};
+  const contexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
+  // Tracks whether the created contexts really use flash attention, so the
+  // result does not claim "flash" after a silent fallback.
+  let flashUsed = flash;
+
+  // Everything after the first context creation runs under try/finally so the
+  // contexts are released even when warm-up, ranking, or validation throws.
+  try {
+    for (let i = 0; i < parallelism; i++) {
+      try {
+        contexts.push(await model.createRankingContext({
+          contextSize: CONTEXT_SIZE,
+          flashAttention: flash,
+          ...threadOptions,
+        }));
+      } catch {
+        if (contexts.length === 0) {
+          // Try without flash
+          contexts.push(await model.createRankingContext({
+            contextSize: CONTEXT_SIZE,
+            ...threadOptions,
+          }));
+          flashUsed = false;
+        }
+        break;
+      }
+    }
+    const actualParallelism = contexts.length;
+    if (actualParallelism === 0) {
+      throw new Error(`No ranking contexts created (parallelism=${parallelism})`);
+    }
+
+    // Measure VRAM after context creation
+    const vramAfter = llama.gpu ? await llama.getVramState() : null;
+    const vramUsed = vramBefore && vramAfter ? vramAfter.used - vramBefore.used : 0;
+    const vramPerCtx = vramUsed / actualParallelism;
+
+    // Warm up
+    await contexts[0]!.rankAll(QUERY, docs.slice(0, 2));
+
+    // Benchmark iterations
+    const times: number[] = [];
+    let peakRss = getMemUsage().rss;
+    // Chunking depends only on the inputs, so compute it once.
+    const chunkSize = Math.ceil(docs.length / actualParallelism);
+
+    for (let iter = 0; iter < ITERATIONS; iter++) {
+      const t0 = performance.now();
+      const allScores = await Promise.all(
+        Array.from({ length: actualParallelism }, (_, i) => {
+          const chunk = docs.slice(i * chunkSize, (i + 1) * chunkSize);
+          return chunk.length > 0 ? contexts[i]!.rankAll(QUERY, chunk) : Promise.resolve([]);
+        })
+      );
+      const elapsed = performance.now() - t0;
+      times.push(elapsed);
+
+      // Verify scores are valid
+      const flat = allScores.flat();
+      if (flat.some(s => s < 0 || s > 1 || isNaN(s))) {
+        throw new Error("Invalid scores detected");
+      }
+
+      const currentRss = getMemUsage().rss;
+      if (currentRss > peakRss) peakRss = currentRss;
+    }
+
+    const med = median(times);
+    return {
+      parallelism: actualParallelism,
+      contextSize: CONTEXT_SIZE,
+      flashAttention: flashUsed,
+      times,
+      medianMs: med,
+      docsPerSec: (docs.length / med) * 1000,
+      vramPerContext: vramPerCtx,
+      totalVram: vramUsed,
+      peakRss,
+    };
+  } finally {
+    // Cleanup
+    for (const ctx of contexts) await ctx.dispose();
+  }
+}
+
+// ============================================================================
+// Main
+// ============================================================================
+
+/**
+ * Benchmark entry point: selects a compute backend, prints system and model
+ * information, runs benchmarkConfig for each entry in PARALLEL_CONFIGS, then
+ * prints a summary table and the configuration with the highest throughput.
+ */
+async function main() {
+  console.log("═══════════════════════════════════════════════════════════════");
+  console.log("  QMD Reranker Benchmark");
+  console.log("═══════════════════════════════════════════════════════════════\n");
+
+  // Detect GPU
+  const gpuTypes = await getLlamaGpuTypes();
+  // First available backend in preference order: cuda, metal, vulkan.
+  const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
+
+  let llama: Llama;
+  let gpuLabel: string;
+  if (preferred) {
+    try {
+      llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
+      gpuLabel = `${preferred}`;
+    } catch {
+      // Backend was listed but could not be initialized: fall back to CPU.
+      llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
+      gpuLabel = "cpu (gpu init failed)";
+    }
+  } else {
+    llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
+    gpuLabel = "cpu";
+  }
+
+  // System info
+  const cpuInfo = cpus();
+  const cpuModel = cpuInfo[0]?.model || "unknown";
+  const cpuCount = cpuInfo.length;
+
+  console.log("System");
+  console.log(`  CPU:       ${cpuModel}`);
+  console.log(`  Cores:     ${cpuCount} (${llama.cpuMathCores} math)`);
+  console.log(`  Device:    ${gpuLabel}`);
+
+  if (llama.gpu) {
+    const gpuNames = await llama.getGpuDeviceNames();
+    // Collapse repeated device names into a single "N× name" entry.
+    const counts = new Map<string, number>();
+    for (const name of gpuNames) counts.set(name, (counts.get(name) || 0) + 1);
+    const devStr = Array.from(counts.entries())
+      .map(([name, n]) => n > 1 ? `${n}× ${name}` : name).join(", ");
+    console.log(`  GPU:       ${devStr}`);
+    const vram = await llama.getVramState();
+    console.log(`  VRAM:      ${formatBytes(vram.total)} total, ${formatBytes(vram.free)} free`);
+  }
+
+  console.log(`  RAM:       ${formatBytes(getMemUsage().rss)} RSS at start`);
+
+  // Load model
+  console.log(`\nModel`);
+  console.log(`  URI:       ${RERANK_MODEL}`);
+  const modelPath = await resolveModelFile(RERANK_MODEL, MODEL_CACHE);
+  // VRAM is sampled before and after loading to estimate the model's own footprint.
+  const vramPreModel = llama.gpu ? await llama.getVramState() : null;
+  const model = await llama.loadModel({ modelPath });
+  const vramPostModel = llama.gpu ? await llama.getVramState() : null;
+  const modelVram = vramPreModel && vramPostModel ? vramPostModel.used - vramPreModel.used : 0;
+  console.log(`  Params:    ${model.trainContextSize} train ctx`);
+  if (modelVram > 0) console.log(`  VRAM:      ${formatBytes(modelVram)} (model weights)`);
+
+  // Generate test docs
+  const docs = generateDocs(DOC_COUNT);
+  console.log(`\nBenchmark`);
+  console.log(`  Documents: ${DOC_COUNT}`);
+  console.log(`  Ctx size:  ${CONTEXT_SIZE}`);
+  console.log(`  Iterations:${ITERATIONS}`);
+  console.log(`  Query:     "${QUERY.slice(0, 50)}..."`);
+
+  // Run benchmarks
+  const results: BenchResult[] = [];
+
+  for (const p of PARALLEL_CONFIGS) {
+    if (!llama.gpu && p > 1) {
+      // CPU: only test if we have enough cores (at least 4 per context)
+      if (llama.cpuMathCores < p * 4) {
+        console.log(`\n  [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`);
+        continue;
+      }
+    }
+
+    // Test with flash attention
+    process.stdout.write(`\n  [${p} ctx, flash] running...`);
+    try {
+      const r = await benchmarkConfig(model, llama, docs, p, true);
+      results.push(r);
+      process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
+    } catch (e: any) {
+      process.stdout.write(` failed: ${e.message}\n`);
+      // Try without flash
+      process.stdout.write(`  [${p} ctx, no flash] running...`);
+      try {
+        const r = await benchmarkConfig(model, llama, docs, p, false);
+        results.push(r);
+        process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
+      } catch (e2: any) {
+        // Both attempts failed: this configuration contributes no result row.
+        process.stdout.write(` failed: ${e2.message}\n`);
+      }
+    }
+  }
+
+  // Summary table
+  console.log("\n═══════════════════════════════════════════════════════════════");
+  console.log("  Results");
+  console.log("═══════════════════════════════════════════════════════════════\n");
+
+  const header = "  Ctx  Flash  Median    Docs/s   VRAM/ctx   Total VRAM  Peak RSS";
+  const sep    = "  ───  ─────  ──────    ──────   ────────   ──────────  ────────";
+  console.log(header);
+  console.log(sep);
+
+  // Speedups are expressed relative to the first recorded result.
+  const baseline = results[0]?.medianMs ?? 1;
+  for (const r of results) {
+    const speedup = baseline / r.medianMs;
+    const speedupStr = r === results[0] ? "      " : `(${speedup.toFixed(1)}×)`;
+    console.log(
+      `  ${String(r.parallelism).padStart(3)}  ` +
+      `${r.flashAttention ? " yes " : "  no "}  ` +
+      `${r.medianMs.toFixed(0).padStart(5)}ms  ` +
+      `${r.docsPerSec.toFixed(1).padStart(6)}  ` +
+      `${formatBytes(r.vramPerContext).padStart(8)}  ` +
+      `${formatBytes(r.totalVram).padStart(10)}  ` +
+      `${formatBytes(r.peakRss).padStart(8)}  ` +
+      speedupStr
+    );
+  }
+
+  // Best config
+  if (results.length > 0) {
+    // "Best" means highest docs/s across all recorded results.
+    const best = results.reduce((a, b) => a.docsPerSec > b.docsPerSec ? a : b);
+    console.log(`\n  Best: ${best.parallelism} contexts, flash=${best.flashAttention}`);
+    console.log(`        ${best.medianMs.toFixed(0)}ms for ${DOC_COUNT} docs (${best.docsPerSec.toFixed(1)} docs/s)`);
+    if (best.totalVram > 0) console.log(`        ${formatBytes(best.totalVram)} VRAM`);
+  }
+
+  console.log("");
+  await model.dispose();
+  await llama.dispose();
+}
+
+// Run the benchmark. On failure, log the error and mark the process as failed:
+// a handler that only logs would still let the process exit with status 0.
+main().catch((err: unknown) => {
+  console.error(err);
+  process.exitCode = 1;
+});

+ 9 - 4
src/llm.test.ts

@@ -221,10 +221,15 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
       const successCount = allResults.filter(r => r !== null).length;
       expect(successCount).toBe(10);
 
-      // THE KEY ASSERTION: Only 1 context should be created, not 5
-      // Without the fix, contextCreateCount would be 5 (one per concurrent embedBatch call)
-      console.log(`Context creation count: ${contextCreateCount} (expected: 1)`);
-      expect(contextCreateCount).toBe(1);
+      // THE KEY ASSERTION: the context pool should be created exactly once (by
+      // ensureEmbedContexts), not once per concurrent embedBatch call. With the promise
+      // guard, concurrent callers share a single creation pass; without the fix,
+      // contextCreateCount would be 5× the intended pool size (one set per concurrent call).
+      // The pool size itself depends on the hardware (computeParallelism: free VRAM on
+      // GPU, core count on CPU) and is capped at 8, so the assertions below can only
+      // bound it to the range 1..8 rather than pin an exact value.
+      console.log(`Context creation count: ${contextCreateCount} (expected: ≤ 8, not 5× duplicated)`);
+      expect(contextCreateCount).toBeGreaterThanOrEqual(1);
+      expect(contextCreateCount).toBeLessThanOrEqual(8);
       
       await freshLlm.dispose();
     }, 60000);

+ 245 - 56
src/llm.ts

@@ -6,6 +6,7 @@
 
 import {
   getLlama,
+  getLlamaGpuTypes,
   resolveModelFile,
   LlamaChatSession,
   LlamaLogLevel,
@@ -354,10 +355,10 @@ const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
 export class LlamaCpp implements LLM {
   private llama: Llama | null = null;
   private embedModel: LlamaModel | null = null;
-  private embedContext: LlamaEmbeddingContext | null = null;
+  private embedContexts: LlamaEmbeddingContext[] = [];
   private generateModel: LlamaModel | null = null;
   private rerankModel: LlamaModel | null = null;
-  private rerankContext: Awaited<ReturnType<LlamaModel["createRankingContext"]>> | null = null;
+  private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
 
   private embedModelUri: string;
   private generateModelUri: string;
@@ -366,7 +367,6 @@ export class LlamaCpp implements LLM {
 
   // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
   private embedModelLoadPromise: Promise<LlamaModel> | null = null;
-  private embedContextCreatePromise: Promise<LlamaEmbeddingContext> | null = null;
   private generateModelLoadPromise: Promise<LlamaModel> | null = null;
   private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
 
@@ -423,7 +423,7 @@ export class LlamaCpp implements LLM {
    * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
    */
   private hasLoadedContexts(): boolean {
-    return !!(this.embedContext || this.rerankContext);
+    return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
   }
 
   /**
@@ -445,14 +445,14 @@ export class LlamaCpp implements LLM {
     }
 
     // Dispose contexts first
-    if (this.embedContext) {
-      await this.embedContext.dispose();
-      this.embedContext = null;
+    for (const ctx of this.embedContexts) {
+      await ctx.dispose();
     }
-    if (this.rerankContext) {
-      await this.rerankContext.dispose();
-      this.rerankContext = null;
+    this.embedContexts = [];
+    for (const ctx of this.rerankContexts) {
+      await ctx.dispose();
     }
+    this.rerankContexts = [];
 
     // Optionally dispose models too (opt-in)
     if (this.disposeModelsOnInactivity) {
@@ -491,7 +491,33 @@ export class LlamaCpp implements LLM {
    */
   private async ensureLlama(): Promise<Llama> {
     if (!this.llama) {
-      this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
+      // Detect available GPU types and use the best one.
+      // We can't rely on gpu:"auto" — it returns false even when CUDA is available
+      // (likely a binary/build config issue in node-llama-cpp).
+      const gpuTypes = await getLlamaGpuTypes();
+      // Prefer CUDA > Metal > Vulkan > CPU
+      const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
+
+      let llama: Llama;
+      if (preferred) {
+        try {
+          llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
+        } catch {
+          llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
+          process.stderr.write(
+            `QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`
+          );
+        }
+      } else {
+        llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
+      }
+
+      if (!llama.gpu) {
+        process.stderr.write(
+          "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
+        );
+      }
+      this.llama = llama;
     }
     return this.llama;
   }
@@ -535,34 +561,92 @@ export class LlamaCpp implements LLM {
   }
 
   /**
-   * Load embedding context (lazy). Context can be disposed and recreated without reloading the model.
-   * Uses promise guard to prevent concurrent context creation race condition.
+   * Compute how many parallel contexts to create.
+   *
+   * GPU: constrained by VRAM (as many contexts as fit in 25% of free VRAM at
+   *      `perContextMB` each, clamped to 1–8; 2 if the VRAM query fails).
+   * CPU: constrained by cores. Splitting threads across contexts enables
+   *      true parallelism (each context runs on its own cores). One context
+   *      per 4 math cores, clamped to 1–4 contexts.
    */
-  private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
-    if (!this.embedContext) {
-      // If context creation is already in progress, wait for it
-      if (this.embedContextCreatePromise) {
-        return await this.embedContextCreatePromise;
+  private async computeParallelism(perContextMB: number): Promise<number> {
+    const llama = await this.ensureLlama();
+
+    if (llama.gpu) {
+      try {
+        const vram = await llama.getVramState();
+        const freeMB = vram.free / (1024 * 1024);
+        const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
+        return Math.max(1, Math.min(8, maxByVram));
+      } catch {
+        return 2;
       }
+    }
 
-      // Start context creation and store promise so concurrent calls wait
-      this.embedContextCreatePromise = (async () => {
-        const model = await this.ensureEmbedModel();
-        const context = await model.createEmbeddingContext();
-        this.embedContext = context;
-        return context;
-      })();
+    // CPU: split cores across contexts. At least 4 threads per context.
+    const cores = llama.cpuMathCores || 4;
+    const maxContexts = Math.floor(cores / 4);
+    return Math.max(1, Math.min(4, maxContexts));
+  }
 
-      try {
-        const context = await this.embedContextCreatePromise;
-        this.touchActivity();
-        return context;
-      } finally {
-        this.embedContextCreatePromise = null;
+  /**
+   * Get the number of threads each context should use, given N parallel contexts.
+   * Splits available math cores evenly across contexts.
+   */
+  private async threadsPerContext(parallelism: number): Promise<number> {
+    const llama = await this.ensureLlama();
+    if (llama.gpu) return 0; // GPU: let the library decide
+    const cores = llama.cpuMathCores || 4;
+    return Math.max(1, Math.floor(cores / parallelism));
+  }
+
+  /**
+   * Load embedding contexts (lazy). Creates multiple for parallel embedding.
+   * Uses promise guard to prevent concurrent context creation race condition.
+   */
+  private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;
+
+  private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
+    if (this.embedContexts.length > 0) {
+      this.touchActivity();
+      return this.embedContexts;
+    }
+
+    if (this.embedContextsCreatePromise) {
+      return await this.embedContextsCreatePromise;
+    }
+
+    this.embedContextsCreatePromise = (async () => {
+      const model = await this.ensureEmbedModel();
+      // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
+      const n = await this.computeParallelism(150);
+      const threads = await this.threadsPerContext(n);
+      for (let i = 0; i < n; i++) {
+        try {
+          this.embedContexts.push(await model.createEmbeddingContext({
+            ...(threads > 0 ? { threads } : {}),
+          }));
+        } catch {
+          if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
+          break;
+        }
       }
+      this.touchActivity();
+      return this.embedContexts;
+    })();
+
+    try {
+      return await this.embedContextsCreatePromise;
+    } finally {
+      this.embedContextsCreatePromise = null;
     }
-    this.touchActivity();
-    return this.embedContext;
+  }
+
+  /**
+   * Get a single embed context (for single-embed calls). Uses first from pool.
+   */
+  private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
+    const contexts = await this.ensureEmbedContexts();
+    return contexts[0]!;
   }
 
   /**
@@ -624,15 +708,50 @@ export class LlamaCpp implements LLM {
   }
 
   /**
-   * Load rerank context (lazy). Context can be disposed and recreated without reloading the model.
+   * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
+   * Each context has its own sequence, so they can evaluate independently.
+   *
+   * Tuning choices:
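+   * - contextSize 2048 (RERANK_CONTEXT_SIZE): reranking chunks are ~800 tokens max,
+   *   plus ~200 tokens of reranker template overhead and the query
+   * - flashAttention: ~20% less VRAM per context (568 vs 711 MB; NOTE(review): these
+   *   figures look like they were measured at contextSize 1024 — the code below
+   *   budgets ~1000 MB per context at 2048; confirm and refresh)
+   * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)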
    */
-  private async ensureRerankContext(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>> {
-    if (!this.rerankContext) {
+  // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
+  // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
+  // Use 2048 for safety margin. Still 17× less than auto (40960).
+  private static readonly RERANK_CONTEXT_SIZE = 2048;
+
+  private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
+    if (this.rerankContexts.length === 0) {
       const model = await this.ensureRerankModel();
-      this.rerankContext = await model.createRankingContext();
+      // ~960 MB per context with flash attention at contextSize 2048
+      const n = await this.computeParallelism(1000);
+      const threads = await this.threadsPerContext(n);
+      for (let i = 0; i < n; i++) {
+        try {
+          this.rerankContexts.push(await model.createRankingContext({
+            contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
+            flashAttention: true,
+            ...(threads > 0 ? { threads } : {}),
+          }));
+        } catch {
+          if (this.rerankContexts.length === 0) {
+            // Flash attention might not be supported — retry without it
+            try {
+              this.rerankContexts.push(await model.createRankingContext({
+                contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
+                ...(threads > 0 ? { threads } : {}),
+              }));
+            } catch {
+              throw new Error("Failed to create any rerank context");
+            }
+          }
+          break;
+        }
+      }
     }
     this.touchActivity();
-    return this.rerankContext;
+    return this.rerankContexts;
   }
 
   // ==========================================================================
@@ -703,26 +822,51 @@ export class LlamaCpp implements LLM {
     if (texts.length === 0) return [];
 
     try {
-      const context = await this.ensureEmbedContext();
-
-      // node-llama-cpp handles batching internally when we make parallel requests
-      const embeddings = await Promise.all(
-        texts.map(async (text) => {
+      const contexts = await this.ensureEmbedContexts();
+      const n = contexts.length;
+
+      if (n === 1) {
+        // Single context: sequential (no point splitting)
+        const context = contexts[0]!;
+        const embeddings = [];
+        for (const text of texts) {
           try {
             const embedding = await context.getEmbeddingFor(text);
-            this.touchActivity();  // Keep-alive during slow batches
-            return {
-              embedding: Array.from(embedding.vector),
-              model: this.embedModelUri,
-            };
+            this.touchActivity();
+            embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
           } catch (err) {
             console.error("Embedding error for text:", err);
-            return null;
+            embeddings.push(null);
+          }
+        }
+        return embeddings;
+      }
+
+      // Multiple contexts: split texts across contexts for parallel evaluation
+      const chunkSize = Math.ceil(texts.length / n);
+      const chunks = Array.from({ length: n }, (_, i) =>
+        texts.slice(i * chunkSize, (i + 1) * chunkSize)
+      );
+
+      const chunkResults = await Promise.all(
+        chunks.map(async (chunk, i) => {
+          const ctx = contexts[i]!;
+          const results: (EmbeddingResult | null)[] = [];
+          for (const text of chunk) {
+            try {
+              const embedding = await ctx.getEmbeddingFor(text);
+              this.touchActivity();
+              results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
+            } catch (err) {
+              console.error("Embedding error for text:", err);
+              results.push(null);
+            }
           }
+          return results;
         })
       );
 
-      return embeddings;
+      return chunkResults.flat();
     } catch (error) {
       console.error("Batch embedding error:", error);
       return texts.map(() => null);
@@ -879,7 +1023,7 @@ export class LlamaCpp implements LLM {
     // Ping activity at start to keep models alive during this operation
     this.touchActivity();
 
-    const context = await this.ensureRerankContext();
+    const contexts = await this.ensureRerankContexts();
 
     // Build a map from document text to original indices (for lookup after sorting)
     const textToDoc = new Map<string, { file: string; index: number }>();
@@ -890,8 +1034,24 @@ export class LlamaCpp implements LLM {
     // Extract just the text for ranking
     const texts = documents.map((doc) => doc.text);
 
-    // Use the proper ranking API - returns [{document: string, score: number}] sorted by score
-    const ranked = await context.rankAndSort(query, texts);
+    // Split documents across contexts for parallel evaluation.
+    // Each context has its own sequence with a lock, so parallelism comes
+    // from multiple contexts evaluating different chunks simultaneously.
+    const n = contexts.length;
+    const chunkSize = Math.ceil(texts.length / n);
+    const chunks = Array.from({ length: n }, (_, i) =>
+      texts.slice(i * chunkSize, (i + 1) * chunkSize)
+    ).filter(chunk => chunk.length > 0);
+
+    const allScores = await Promise.all(
+      chunks.map((chunk, i) => contexts[i]!.rankAll(query, chunk))
+    );
+
+    // Reassemble scores in original order and sort
+    const flatScores = allScores.flat();
+    const ranked = texts
+      .map((text, i) => ({ document: text, score: flatScores[i]! }))
+      .sort((a, b) => b.score - a.score);
 
     // Map back to our result format using the text-to-doc map
     const results: RerankDocumentResult[] = ranked.map((item) => {
@@ -909,6 +1069,35 @@ export class LlamaCpp implements LLM {
     };
   }
 
+  /**
+   * Report the compute devices available to llama: GPU backend, device names, VRAM and CPU math cores.
+   * Awaits ensureLlama() first; `vram` stays undefined without a GPU or when the VRAM query throws.
+   */
+  async getDeviceInfo(): Promise<{
+    gpu: string | false;
+    gpuOffloading: boolean;
+    gpuDevices: string[];
+    vram?: { total: number; used: number; free: number };
+    cpuCores: number;
+  }> {
+    const engine = await this.ensureLlama();
+    const deviceNames = await engine.getGpuDeviceNames();
+    let memory: { total: number; used: number; free: number } | undefined;
+    if (engine.gpu) {
+      try {
+        const { total, used, free } = await engine.getVramState();
+        memory = { total, used, free };
+      } catch { /* no vram info */ }
+    }
+    return {
+      gpu: engine.gpu,
+      gpuOffloading: engine.supportsGpuOffloading,
+      gpuDevices: deviceNames,
+      vram: memory,
+      cpuCores: engine.cpuMathCores,
+    };
+  }
+
   async dispose(): Promise<void> {
     // Prevent double-dispose
     if (this.disposed) {
@@ -932,8 +1121,8 @@ export class LlamaCpp implements LLM {
     }
 
     // Clear references
-    this.embedContext = null;
-    this.rerankContext = null;
+    this.embedContexts = [];
+    this.rerankContexts = [];
     this.embedModel = null;
     this.generateModel = null;
     this.rerankModel = null;
@@ -941,7 +1130,7 @@ export class LlamaCpp implements LLM {
 
     // Clear any in-flight load/create promises
     this.embedModelLoadPromise = null;
-    this.embedContextCreatePromise = null;
+    this.embedContextsCreatePromise = null;
     this.generateModelLoadPromise = null;
     this.rerankModelLoadPromise = null;
   }

+ 61 - 30
src/mcp.ts

@@ -1,4 +1,3 @@
-#!/usr/bin/env bun
 /**
  * QMD MCP Server - Model Context Protocol server for QMD
  *
@@ -8,6 +7,8 @@
  * Follows MCP spec 2025-06-18 for proper response types.
  */
 
+import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
+import { fileURLToPath } from "url";
 import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
 import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
 import { WebStandardStreamableHTTPServerTransport }
@@ -147,7 +148,7 @@ function buildInstructions(store: Store): string {
  */
 function createMcpServer(store: Store): McpServer {
   const server = new McpServer(
-    { name: "qmd", version: "1.0.0" },
+    { name: "qmd", version: "0.9.9" },
     { instructions: buildInstructions(store) },
   );
 
@@ -237,9 +238,7 @@ function createMcpServer(store: Store): McpServer {
       },
     },
     async ({ query, limit, minScore, collection }) => {
-      // Note: Collection filtering is now done post-search since collections are managed in YAML
-      const results = store.searchFTS(query, limit || 10)
-        .filter(r => !collection || r.collectionName === collection);
+      const results = store.searchFTS(query, limit || 10, collection);
       const filtered: SearchResultItem[] = results
         .filter(r => r.score >= (minScore || 0))
         .map(r => {
@@ -541,7 +540,7 @@ export async function startMcpServer(): Promise<void> {
 // =============================================================================
 
 export type HttpServerHandle = {
-  httpServer: ReturnType<typeof Bun.serve>;
+  httpServer: import("node:http").Server; // Node HTTP server; "node:" specifier matches the createServer import
   port: number;
   stop: () => Promise<void>;
 };
@@ -588,47 +587,79 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
     if (!quiet) console.error(msg);
   }
 
-  const httpServer = Bun.serve({
-    port,
-    hostname: "localhost",
-    async fetch(req) {
-      const reqStart = Date.now();
-      const pathname = new URL(req.url).pathname;
-
-      if (pathname === "/health" && req.method === "GET") {
-        const res = Response.json({
-          status: "ok",
-          uptime: Math.floor((Date.now() - startTime) / 1000),
-        });
+  // Drain the request stream and return the whole body decoded as UTF-8 text
+  async function collectBody(req: IncomingMessage): Promise<string> {
+    const parts: Buffer[] = [];
+    for await (const piece of req) parts.push(piece as Buffer);
+    return Buffer.concat(parts).toString("utf8");
+  }
+
+  const httpServer = createServer(async (nodeReq: IncomingMessage, nodeRes: ServerResponse) => {
+    const reqStart = Date.now();
+    try {
+      // nodeReq.url is "path?query" in Node; route on the path only, as the old `new URL(req.url).pathname` did
+      const { pathname, search } = new URL(nodeReq.url || "/", "http://localhost");
+      if (pathname === "/health" && nodeReq.method === "GET") {
+        const body = JSON.stringify({ status: "ok", uptime: Math.floor((Date.now() - startTime) / 1000) });
+        nodeRes.writeHead(200, { "Content-Type": "application/json" });
+        nodeRes.end(body);
         log(`${ts()} GET /health (${Date.now() - reqStart}ms)`);
-        return res;
+        return;
       }
 
-      if (pathname === "/mcp" && req.method === "POST") {
-        const body = await req.json();
+      if (pathname === "/mcp" && nodeReq.method === "POST") {
+        const rawBody = await collectBody(nodeReq);
+        const body = JSON.parse(rawBody);
         const label = describeRequest(body);
-        const res = await transport.handleRequest(req, { parsedBody: body });
+        const url = `http://localhost:${port}${pathname}${search}`;
+        const headers: Record<string, string> = {};
+        for (const [k, v] of Object.entries(nodeReq.headers)) {
+          if (typeof v === "string") headers[k] = v;
+        }
+        const request = new Request(url, { method: "POST", headers, body: rawBody });
+        const response = await transport.handleRequest(request, { parsedBody: body });
+        nodeRes.writeHead(response.status, Object.fromEntries(response.headers));
+        nodeRes.end(Buffer.from(await response.arrayBuffer()));
         log(`${ts()} POST /mcp ${label} (${Date.now() - reqStart}ms)`);
-        return res;
+        return;
       }
 
-      // Pass other methods (GET, DELETE) to transport for protocol handling
       if (pathname === "/mcp") {
-        return transport.handleRequest(req);
+        const url = `http://localhost:${port}${pathname}${search}`;
+        const headers: Record<string, string> = {};
+        for (const [k, v] of Object.entries(nodeReq.headers)) {
+          if (typeof v === "string") headers[k] = v;
+        }
+        const rawBody = nodeReq.method !== "GET" && nodeReq.method !== "HEAD" ? await collectBody(nodeReq) : undefined;
+        const request = new Request(url, { method: nodeReq.method || "GET", headers, ...(rawBody ? { body: rawBody } : {}) });
+        const response = await transport.handleRequest(request);
+        nodeRes.writeHead(response.status, Object.fromEntries(response.headers));
+        nodeRes.end(Buffer.from(await response.arrayBuffer()));
+        return;
       }
 
-      return new Response("Not Found", { status: 404 });
-    },
+      nodeRes.writeHead(404).end("Not Found");
+    } catch (err) {
+      console.error("HTTP handler error:", err);
+      // Headers may already be sent; a second writeHead would throw inside this async callback
+      if (nodeRes.headersSent) nodeRes.end();
+      else nodeRes.writeHead(500).end("Internal Server Error");
+    }
+  });
+
+  await new Promise<void>((resolve, reject) => {
+    httpServer.on("error", reject);
+    httpServer.listen(port, "localhost", () => resolve());
   });
 
-  const actualPort = httpServer.port;
+  const actualPort = (httpServer.address() as import("net").AddressInfo).port;
 
   let stopping = false;
   const stop = async () => {
     if (stopping) return;
     stopping = true;
     await transport.close();
-    httpServer.stop();
+    httpServer.close();
     store.close();
     await disposeDefaultLlamaCpp();
   };
@@ -649,6 +680,6 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
 }
 
 // Run if this is the main module
-if (import.meta.main) {
+if (fileURLToPath(import.meta.url) === process.argv[1] || process.argv[1]?.endsWith("/mcp.ts")) {
   startMcpServer().catch(console.error);
 }

+ 34 - 5
src/qmd.ts

@@ -65,7 +65,7 @@ import {
   createStore,
   getDefaultDbPath,
 } from "./store.js";
-import { disposeDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
+import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
 import {
   formatSearchResults,
   formatDocuments,
@@ -249,7 +249,7 @@ function formatBytes(bytes: number): string {
   return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
 }
 
-function showStatus(): void {
+async function showStatus(): Promise<void> {
   const dbPath = getDbPath();
   const db = getDb();
 
@@ -362,6 +362,36 @@ function showStatus(): void {
     console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
   }
 
+  // Device / GPU info
+  try {
+    const llm = getDefaultLlamaCpp();
+    const device = await llm.getDeviceInfo();
+    console.log(`\n${c.bold}Device${c.reset}`);
+    if (device.gpu) {
+      console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
+      if (device.gpuDevices.length > 0) {
+        // Deduplicate and count GPUs
+        const counts = new Map<string, number>();
+        for (const name of device.gpuDevices) {
+          counts.set(name, (counts.get(name) || 0) + 1);
+        }
+        const deviceStr = Array.from(counts.entries())
+          .map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
+          .join(', ');
+        console.log(`  Devices:  ${deviceStr}`);
+      }
+      if (device.vram) {
+        console.log(`  VRAM:     ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
+      }
+    } else {
+      console.log(`  GPU:      ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
+      console.log(`  ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
+    }
+    console.log(`  CPU:      ${device.cpuCores} math cores`);
+  } catch {
+    // Don't fail status if LLM init fails
+  }
+
   closeDb();
 }
 
@@ -1871,8 +1901,7 @@ function search(query: string, opts: OutputOptions): void {
 
   // Use large limit for --all, otherwise fetch more than needed and let outputResults filter
   const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);
-  // searchFTS accepts collection name as number parameter for legacy reasons (will be fixed in store.ts)
-  const results = searchFTS(db, query, fetchLimit, collectionName as any);
+  const results = searchFTS(db, query, fetchLimit, collectionName);
 
   // Add context to results
   const resultsWithContext = results.map(r => ({
@@ -2348,7 +2377,7 @@ if (import.meta.main) {
     }
 
     case "status":
-      showStatus();
+      await showStatus();
       break;
 
     case "update":

+ 2 - 2
src/store.test.ts

@@ -1219,8 +1219,8 @@ describe("FTS Search", () => {
     const allResults = store.searchFTS("searchable", 10);
     expect(allResults).toHaveLength(2);
 
-    // Filter by collection name (collectionId is now treated as collection name string)
-    const filtered = store.searchFTS("searchable", 10, collection1 as unknown as number);
+    // Filter by collection name
+    const filtered = store.searchFTS("searchable", 10, collection1);
     expect(filtered).toHaveLength(1);
     expect(filtered[0]!.displayPath).toBe(`${collection1}/doc1.md`);
 

+ 51 - 42
src/store.ts

@@ -830,8 +830,8 @@ export type Store = {
   toVirtualPath: (absolutePath: string) => string | null;
 
   // Search
-  searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
-  searchVec: (query: string, model: string, limit?: number, collectionName?: string) => Promise<SearchResult[]>;
+  searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
+  searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
 
   // Query expansion & reranking
   expandQuery: (query: string, model?: string) => Promise<ExpandedQuery[]>;
@@ -913,8 +913,8 @@ export function createStore(dbPath?: string): Store {
     toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
 
     // Search
-    searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
-    searchVec: (query: string, model: string, limit?: number, collectionName?: string) => searchVec(db, query, model, limit, collectionName),
+    searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName),
+    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
 
     // Query expansion & reranking
     expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
@@ -2020,7 +2020,7 @@ function buildFTS5Query(query: string): string | null {
   return terms.map(t => `"${t}"*`).join(' AND ');
 }
 
-export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
+export function searchFTS(db: Database, query: string, limit: number = 20, collectionName?: string): SearchResult[] {
   const ftsQuery = buildFTS5Query(query);
   if (!ftsQuery) return [];
 
@@ -2039,12 +2039,9 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
   `;
   const params: (string | number)[] = [ftsQuery];
 
-  if (collectionId) {
-    // Note: collectionId is a legacy parameter that should be phased out
-    // Collections are now managed in YAML. For now, we interpret it as a collection name filter.
-    // This code path is likely unused as collection filtering should be done at CLI level.
+  if (collectionName) {
     sql += ` AND d.collection = ?`;
-    params.push(String(collectionId));
+    params.push(String(collectionName));
   }
 
   // bm25 lower is better; sort ascending.
@@ -2080,11 +2077,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
 // Vector Search
 // =============================================================================
 
-export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession): Promise<SearchResult[]> {
+export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]> {
   const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
   if (!tableExists) return [];
 
-  const embedding = await getEmbedding(query, model, true, session);
+  const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
   if (!embedding) return [];
 
   // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
@@ -2848,8 +2845,8 @@ export async function hybridQuery(
   ).get();
 
   // Step 1: BM25 probe — strong signal skips expensive LLM expansion
-  const initialFts = store.searchFTS(query, 20)
-    .filter(r => !collection || r.collectionName === collection);
+  // Pass collection directly into FTS query (filter at SQL level, not post-hoc)
+  const initialFts = store.searchFTS(query, 20, collection);
   const topScore = initialFts[0]?.score ?? 0;
   const secondScore = initialFts[1]?.score ?? 0;
   const hasStrongSignal = initialFts.length > 0
@@ -2875,26 +2872,15 @@ export async function hybridQuery(
   }
 
   // Step 3: Route searches by query type
-  // Original query → vector search (FTS already covered by probe in step 1).
-  // Vector searches run sequentially — node-llama-cpp's embed context
-  // hangs on concurrent embed() calls (known limitation).
-  if (hasVectors) {
-    const vecResults = await store.searchVec(query, DEFAULT_EMBED_MODEL, 20, collection);
-    if (vecResults.length > 0) {
-      for (const r of vecResults) docidMap.set(r.filepath, r.docid);
-      rankedLists.push(vecResults.map(r => ({
-        file: r.filepath, displayPath: r.displayPath,
-        title: r.title, body: r.body || "", score: r.score,
-      })));
-    }
-  }
+  //
+  // Strategy: run all FTS queries immediately (they're sync/instant), then
+  // batch-embed all vector queries in one embedBatch() call, then run
+  // sqlite-vec lookups with pre-computed embeddings.
 
-  // Expanded queries → route by type: lex→FTS only, vec/hyde→vector only.
-  // This restores the CLI's query-type-aware routing that was lost in the initial refactor.
+  // 3a: Run FTS for all lex expansions right away (no LLM needed)
   for (const q of expanded) {
     if (q.type === 'lex') {
-      const ftsResults = store.searchFTS(q.text, 20)
-        .filter(r => !collection || r.collectionName === collection);
+      const ftsResults = store.searchFTS(q.text, 20, collection);
       if (ftsResults.length > 0) {
         for (const r of ftsResults) docidMap.set(r.filepath, r.docid);
         rankedLists.push(ftsResults.map(r => ({
@@ -2902,17 +2888,40 @@ export async function hybridQuery(
           title: r.title, body: r.body || "", score: r.score,
         })));
       }
-    } else {
-      // vec or hyde → vector search only
-      if (hasVectors) {
-        const vecResults = await store.searchVec(q.text, DEFAULT_EMBED_MODEL, 20, collection);
-        if (vecResults.length > 0) {
-          for (const r of vecResults) docidMap.set(r.filepath, r.docid);
-          rankedLists.push(vecResults.map(r => ({
-            file: r.filepath, displayPath: r.displayPath,
-            title: r.title, body: r.body || "", score: r.score,
-          })));
-        }
+    }
+  }
+
+  // 3b: Collect all texts that need vector search (original query + vec/hyde expansions)
+  if (hasVectors) {
+    const vecQueries: { text: string; isOriginal: boolean }[] = [
+      { text: query, isOriginal: true },
+    ];
+    for (const q of expanded) {
+      if (q.type === 'vec' || q.type === 'hyde') {
+        vecQueries.push({ text: q.text, isOriginal: false });
+      }
+    }
+
+    // Batch embed all vector queries in a single call
+    const llm = getDefaultLlamaCpp();
+    const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text));
+    const embeddings = await llm.embedBatch(textsToEmbed);
+
+    // Run sqlite-vec lookups with pre-computed embeddings
+    for (let i = 0; i < vecQueries.length; i++) {
+      const embedding = embeddings[i]?.embedding;
+      if (!embedding) continue;
+
+      const vecResults = await store.searchVec(
+        vecQueries[i]!.text, DEFAULT_EMBED_MODEL, 20, collection,
+        undefined, embedding
+      );
+      if (vecResults.length > 0) {
+        for (const r of vecResults) docidMap.set(r.filepath, r.docid);
+        rankedLists.push(vecResults.map(r => ({
+          file: r.filepath, displayPath: r.displayPath,
+          title: r.title, body: r.body || "", score: r.score,
+        })));
       }
     }
   }