4 月之前 · 4df5505bd6
--- a/README.md
+++ b/README.md
@@ -9,9 +9,15 @@ QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking
 
				 ## Quick Start
			
 
				 
			
 
				 ```sh
			
 
				-# Install globally
			
 
				+# Install globally (Node or Bun)
			
 
				+npm install -g @tobilu/qmd
			
 
				+# or
			
 
				 bun install -g @tobilu/qmd
			
 
				 
			
 
				+# Or run directly
			
 
				+npx @tobilu/qmd ...
			
 
				+bunx @tobilu/qmd ...
			
 
				+
			
 
				 # Create collections for your notes, docs, and meeting transcripts
			
 
				 qmd collection add ~/notes --name notes
			
 
				 qmd collection add ~/Documents/meetings --name meetings
			
@@ -231,6 +237,7 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
 
				 
			
 
				 ### System Requirements
			
 
				 
			
 
				+- **Node.js** >= 22
			
 
				 - **Bun** >= 1.0.0
			
 
				 - **macOS**: Homebrew SQLite (for extension support)
			
 
				   ```sh
			
@@ -252,18 +259,18 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
 
				 ## Installation
			
 
				 
			
 
				 ```sh
			
 
				+npm install -g @tobilu/qmd
			
 
				+# or
			
 
				 bun install -g @tobilu/qmd
			
 
				 ```
			
 
				 
			
 
				-Make sure `~/.bun/bin` is in your PATH.
			
 
				-
			
 
				 ### Development
			
 
				 
			
 
				 ```sh
			
 
				 git clone https://github.com/tobi/qmd
			
 
				 cd qmd
			
 
				-bun install
			
 
				-bun link
			
 
				+npm install
			
 
				+npm link
			
 
				 ```
			
 
				 
			
 
				 ## Usage
			
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 
				 {
			
 
				   "name": "@tobilu/qmd",
			
 
				-  "version": "0.9.0",
			
 
				+  "version": "0.9.9",
			
 
				   "description": "Query Markup Documents - On-device hybrid search for markdown files with BM25, vector search, and LLM reranking",
			
 
				   "type": "module",
			
 
				   "bin": {
			
@@ -15,15 +15,26 @@
 
				     "CHANGELOG.md"
			
 
				   ],
			
 
				   "scripts": {
			
 
				-    "test": "bun test --preload ./src/test-preload.ts",
			
 
				-    "qmd": "bun src/qmd.ts",
			
 
				-    "index": "bun src/qmd.ts index",
			
 
				-    "vector": "bun src/qmd.ts vector",
			
 
				-    "search": "bun src/qmd.ts search",
			
 
				-    "vsearch": "bun src/qmd.ts vsearch",
			
 
				-    "rerank": "bun src/qmd.ts rerank",
			
 
				-    "link": "bun link",
			
 
				-    "inspector": "npx @modelcontextprotocol/inspector bun src/qmd.ts mcp",
			
 
				+    "test": "vitest run",
			
 
				+    "test:unit": "vitest run --reporter=verbose src/*.test.ts",
			
 
				+    "test:models": "vitest run --reporter=verbose src/models/*.test.ts",
			
 
				+    "test:integration": "vitest run --reporter=verbose src/integration/*.test.ts",
			
 
				+    "test:unit:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts",
			
 
				+    "test:models:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts",
			
 
				+    "test:integration:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts",
			
 
				+    "test:unit:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts",
			
 
				+    "test:models:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts",
			
 
				+    "test:integration:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts",
			
 
				+    "test:ci:bun": "npm run test:unit:bun && npm run test:models:bun && npm run test:integration:bun",
			
 
				+    "test:ci:node": "npm run test:unit:node && npm run test:models:node && npm run test:integration:node",
			
 
				+    "test:ci": "npm run test:unit && npm run test:models && npm run test:integration",
			
 
				+    "qmd": "tsx src/qmd.ts",
			
 
				+    "index": "tsx src/qmd.ts index",
			
 
				+    "vector": "tsx src/qmd.ts vector",
			
 
				+    "search": "tsx src/qmd.ts search",
			
 
				+    "vsearch": "tsx src/qmd.ts vsearch",
			
 
				+    "rerank": "tsx src/qmd.ts rerank",
			
 
				+    "inspector": "npx @modelcontextprotocol/inspector tsx src/qmd.ts mcp",
			
 
				     "release": "./scripts/release.sh"
			
 
				   },
			
 
				   "publishConfig": {
			
@@ -39,7 +50,10 @@
 
				   },
			
 
				   "dependencies": {
			
 
				     "@modelcontextprotocol/sdk": "^1.25.1",
			
 
				+    "better-sqlite3": "^11.0.0",
			
 
				+    "fast-glob": "^3.3.0",
			
 
				     "node-llama-cpp": "^3.14.5",
			
 
				+    "picomatch": "^4.0.0",
			
 
				     "sqlite-vec": "^0.1.7-alpha.2",
			
 
				     "yaml": "^2.8.2",
			
 
				     "zod": "^4.2.1"
			
@@ -51,13 +65,15 @@
 
				     "sqlite-vec-win32-x64": "^0.1.7-alpha.2"
			
 
				   },
			
 
				   "devDependencies": {
			
 
				-    "@types/bun": "latest"
			
 
				+    "@types/better-sqlite3": "^7.6.0",
			
 
				+    "tsx": "^4.0.0",
			
 
				+    "vitest": "^3.0.0"
			
 
				   },
			
 
				   "peerDependencies": {
			
 
				     "typescript": "^5.9.3"
			
 
				   },
			
 
				   "engines": {
			
 
				-    "bun": ">=1.0.0"
			
 
				+    "node": ">=22.0.0"
			
 
				   },
			
 
				   "keywords": [
			
 
				     "markdown",
			
--- a/src/bench-rerank.ts
+++ b/src/bench-rerank.ts
@@ -0,0 +1,327 @@
 
				+#!/usr/bin/env bun
			
 
				+/**
			
 
				+ * QMD Reranker Benchmark
			
 
				+ *
			
 
				+ * Measures reranking performance across different configurations.
			
 
				+ * Reports device, parallelism, memory, VRAM, and throughput.
			
 
				+ *
			
 
				+ * Usage:
			
 
				+ *   bun src/bench-rerank.ts              # full benchmark
			
 
				+ *   bun src/bench-rerank.ts --quick      # quick smoke test (10 docs, 1 iteration)
			
 
				+ *   bun src/bench-rerank.ts --docs 100   # custom doc count
			
 
				+ */
			
 
				+
			
 
				+import {
			
 
				+  getLlama,
			
 
				+  getLlamaGpuTypes,
			
 
				+  resolveModelFile,
			
 
				+  LlamaLogLevel,
			
 
				+  type Llama,
			
 
				+  type LlamaModel,
			
 
				+} from "node-llama-cpp";
			
 
				+import { homedir } from "os";
			
 
				+import { join } from "path";
			
 
				+import { cpus } from "os";
			
 
				+
			
 
				+// ============================================================================
			
 
				+// Config
			
 
				+// ============================================================================
			
 
				+
			
 
				+const RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
			
 
				+const MODEL_CACHE = join(homedir(), ".cache", "qmd", "models");
			
 
				+const CONTEXT_SIZE = 2048;
			
 
				+
			
 
				+const args = process.argv.slice(2);
			
 
				+const quick = args.includes("--quick");
			
 
				+const docsIdx = args.indexOf("--docs");
			
 
				+const DOC_COUNT = docsIdx >= 0 ? parseInt(args[docsIdx + 1]!) : (quick ? 10 : 40);
			
 
				+const ITERATIONS = quick ? 1 : 3;
			
 
				+const PARALLEL_CONFIGS = quick ? [1, 4] : [1, 2, 4, 8];
			
 
				+
			
 
				+// ============================================================================
			
 
				+// Test data — realistic-ish chunks of varying length
			
 
				+// ============================================================================
			
 
				+
			
 
				+const QUERY = "How do AI agents work and what are their limitations?";
			
 
				+
			
 
				+function generateDocs(n: number): string[] {
			
 
				+  const templates = [
			
 
				+    "Artificial intelligence agents are software systems that perceive their environment and take actions to achieve goals. They use techniques like reinforcement learning, planning, and natural language processing to operate autonomously.",
			
 
				+    "The transformer architecture, introduced in 2017, revolutionized natural language processing. Self-attention mechanisms allow models to weigh the importance of different parts of input sequences when generating outputs.",
			
 
				+    "Machine learning models require careful evaluation to avoid overfitting. Cross-validation, holdout sets, and metrics like precision, recall, and F1 score help assess generalization performance.",
			
 
				+    "Retrieval-augmented generation combines information retrieval with language models. Documents are embedded into vector spaces, retrieved based on query similarity, and used as context for generation.",
			
 
				+    "Neural network training involves forward propagation, loss computation, and backpropagation. Optimizers like Adam and SGD adjust weights to minimize the loss function over training iterations.",
			
 
				+    "Large language models exhibit emergent capabilities at scale, including few-shot learning, chain-of-thought reasoning, and instruction following. These properties were not explicitly trained for.",
			
 
				+    "Embedding models convert text into dense vector representations that capture semantic meaning. Similar texts produce similar vectors, enabling efficient similarity search and clustering.",
			
 
				+    "Autonomous agents face challenges including hallucination, lack of grounding, limited planning horizons, and difficulty with multi-step reasoning. Safety and alignment remain open research problems.",
			
 
				+    "The attention mechanism computes query-key-value interactions to determine which parts of the input are most relevant. Multi-head attention allows the model to attend to different representation subspaces.",
			
 
				+    "Fine-tuning adapts a pre-trained model to specific tasks using domain-specific data. Techniques like LoRA reduce the number of trainable parameters while maintaining performance.",
			
 
				+  ];
			
 
				+  return Array.from({ length: n }, (_, i) => templates[i % templates.length]!);
			
 
				+}
			
 
				+
			
 
				+// ============================================================================
			
 
				+// Helpers
			
 
				+// ============================================================================
			
 
				+
			
 
				+function formatBytes(bytes: number): string {
			
 
				+  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
			
 
				+  if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
			
 
				+  return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`;
			
 
				+}
			
 
				+
			
 
				+function getMemUsage(): { rss: number; heapUsed: number } {
			
 
				+  const m = process.memoryUsage();
			
 
				+  return { rss: m.rss, heapUsed: m.heapUsed };
			
 
				+}
			
 
				+
			
 
				+function median(arr: number[]): number {
			
 
				+  const sorted = [...arr].sort((a, b) => a - b);
			
 
				+  const mid = Math.floor(sorted.length / 2);
			
 
				+  return sorted.length % 2 !== 0 ? sorted[mid]! : (sorted[mid - 1]! + sorted[mid]!) / 2;
			
 
				+}
			
 
				+
			
 
				+// ============================================================================
			
 
				+// Benchmark runner
			
 
				+// ============================================================================
			
 
				+
			
 
				+interface BenchResult {
			
 
				+  parallelism: number;
			
 
				+  contextSize: number;
			
 
				+  flashAttention: boolean;
			
 
				+  times: number[];       // ms per run
			
 
				+  medianMs: number;
			
 
				+  docsPerSec: number;
			
 
				+  vramPerContext: number; // bytes
			
 
				+  totalVram: number;      // bytes
			
 
				+  peakRss: number;        // bytes
			
 
				+}
			
 
				+
			
 
				+async function benchmarkConfig(
			
 
				+  model: LlamaModel,
			
 
				+  llama: Llama,
			
 
				+  docs: string[],
			
 
				+  parallelism: number,
			
 
				+  flash: boolean,
			
 
				+): Promise<BenchResult> {
			
 
				+  // Measure VRAM before
			
 
				+  const vramBefore = llama.gpu ? await llama.getVramState() : null;
			
 
				+  const rssBefore = getMemUsage().rss;
			
 
				+
			
 
				+  // Create contexts. On CPU, split threads evenly across contexts.
			
 
				+  const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0;
			
 
				+  const contexts = [];
			
 
				+  for (let i = 0; i < parallelism; i++) {
			
 
				+    try {
			
 
				+      contexts.push(await model.createRankingContext({
			
 
				+        contextSize: CONTEXT_SIZE,
			
 
				+        flashAttention: flash,
			
 
				+        ...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
			
 
				+      }));
			
 
				+    } catch {
			
 
				+      if (contexts.length === 0) {
			
 
				+        // Try without flash
			
 
				+        contexts.push(await model.createRankingContext({
			
 
				+          contextSize: CONTEXT_SIZE,
			
 
				+          ...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
			
 
				+        }));
			
 
				+      }
			
 
				+      break;
			
 
				+    }
			
 
				+  }
			
 
				+  const actualParallelism = contexts.length;
			
 
				+
			
 
				+  // Measure VRAM after context creation
			
 
				+  const vramAfter = llama.gpu ? await llama.getVramState() : null;
			
 
				+  const vramUsed = vramBefore && vramAfter ? vramAfter.used - vramBefore.used : 0;
			
 
				+  const vramPerCtx = actualParallelism > 0 ? vramUsed / actualParallelism : 0;
			
 
				+
			
 
				+  // Warm up
			
 
				+  await contexts[0]!.rankAll(QUERY, docs.slice(0, 2));
			
 
				+
			
 
				+  // Benchmark iterations
			
 
				+  const times: number[] = [];
			
 
				+  let peakRss = getMemUsage().rss;
			
 
				+
			
 
				+  for (let iter = 0; iter < ITERATIONS; iter++) {
			
 
				+    const chunkSize = Math.ceil(docs.length / actualParallelism);
			
 
				+
			
 
				+    const t0 = performance.now();
			
 
				+    const allScores = await Promise.all(
			
 
				+      Array.from({ length: actualParallelism }, (_, i) => {
			
 
				+        const chunk = docs.slice(i * chunkSize, (i + 1) * chunkSize);
			
 
				+        return chunk.length > 0 ? contexts[i]!.rankAll(QUERY, chunk) : Promise.resolve([]);
			
 
				+      })
			
 
				+    );
			
 
				+    const elapsed = performance.now() - t0;
			
 
				+    times.push(elapsed);
			
 
				+
			
 
				+    // Verify scores are valid
			
 
				+    const flat = allScores.flat();
			
 
				+    if (flat.some(s => s < 0 || s > 1 || isNaN(s))) {
			
 
				+      throw new Error("Invalid scores detected");
			
 
				+    }
			
 
				+
			
 
				+    const currentRss = getMemUsage().rss;
			
 
				+    if (currentRss > peakRss) peakRss = currentRss;
			
 
				+  }
			
 
				+
			
 
				+  // Cleanup
			
 
				+  for (const ctx of contexts) await ctx.dispose();
			
 
				+
			
 
				+  const med = median(times);
			
 
				+  return {
			
 
				+    parallelism: actualParallelism,
			
 
				+    contextSize: CONTEXT_SIZE,
			
 
				+    flashAttention: flash,
			
 
				+    times,
			
 
				+    medianMs: med,
			
 
				+    docsPerSec: (docs.length / med) * 1000,
			
 
				+    vramPerContext: vramPerCtx,
			
 
				+    totalVram: vramUsed,
			
 
				+    peakRss,
			
 
				+  };
			
 
				+}
			
 
				+
			
 
				+// ============================================================================
			
 
				+// Main
			
 
				+// ============================================================================
			
 
				+
			
 
				+async function main() {
			
 
				+  console.log("═══════════════════════════════════════════════════════════════");
			
 
				+  console.log("  QMD Reranker Benchmark");
			
 
				+  console.log("═══════════════════════════════════════════════════════════════\n");
			
 
				+
			
 
				+  // Detect GPU
			
 
				+  const gpuTypes = await getLlamaGpuTypes();
			
 
				+  const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
			
 
				+
			
 
				+  let llama: Llama;
			
 
				+  let gpuLabel: string;
			
 
				+  if (preferred) {
			
 
				+    try {
			
 
				+      llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
			
 
				+      gpuLabel = `${preferred}`;
			
 
				+    } catch {
			
 
				+      llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
			
 
				+      gpuLabel = "cpu (gpu init failed)";
			
 
				+    }
			
 
				+  } else {
			
 
				+    llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
			
 
				+    gpuLabel = "cpu";
			
 
				+  }
			
 
				+
			
 
				+  // System info
			
 
				+  const cpuInfo = cpus();
			
 
				+  const cpuModel = cpuInfo[0]?.model || "unknown";
			
 
				+  const cpuCount = cpuInfo.length;
			
 
				+
			
 
				+  console.log("System");
			
 
				+  console.log(`  CPU:       ${cpuModel}`);
			
 
				+  console.log(`  Cores:     ${cpuCount} (${llama.cpuMathCores} math)`);
			
 
				+  console.log(`  Device:    ${gpuLabel}`);
			
 
				+
			
 
				+  if (llama.gpu) {
			
 
				+    const gpuNames = await llama.getGpuDeviceNames();
			
 
				+    const counts = new Map<string, number>();
			
 
				+    for (const name of gpuNames) counts.set(name, (counts.get(name) || 0) + 1);
			
 
				+    const devStr = Array.from(counts.entries())
			
 
				+      .map(([name, n]) => n > 1 ? `${n}× ${name}` : name).join(", ");
			
 
				+    console.log(`  GPU:       ${devStr}`);
			
 
				+    const vram = await llama.getVramState();
			
 
				+    console.log(`  VRAM:      ${formatBytes(vram.total)} total, ${formatBytes(vram.free)} free`);
			
 
				+  }
			
 
				+
			
 
				+  console.log(`  RAM:       ${formatBytes(getMemUsage().rss)} RSS at start`);
			
 
				+
			
 
				+  // Load model
			
 
				+  console.log(`\nModel`);
			
 
				+  console.log(`  URI:       ${RERANK_MODEL}`);
			
 
				+  const modelPath = await resolveModelFile(RERANK_MODEL, MODEL_CACHE);
			
 
				+  const vramPreModel = llama.gpu ? await llama.getVramState() : null;
			
 
				+  const model = await llama.loadModel({ modelPath });
			
 
				+  const vramPostModel = llama.gpu ? await llama.getVramState() : null;
			
 
				+  const modelVram = vramPreModel && vramPostModel ? vramPostModel.used - vramPreModel.used : 0;
			
 
				+  console.log(`  Params:    ${model.trainContextSize} train ctx`);
			
 
				+  if (modelVram > 0) console.log(`  VRAM:      ${formatBytes(modelVram)} (model weights)`);
			
 
				+
			
 
				+  // Generate test docs
			
 
				+  const docs = generateDocs(DOC_COUNT);
			
 
				+  console.log(`\nBenchmark`);
			
 
				+  console.log(`  Documents: ${DOC_COUNT}`);
			
 
				+  console.log(`  Ctx size:  ${CONTEXT_SIZE}`);
			
 
				+  console.log(`  Iterations:${ITERATIONS}`);
			
 
				+  console.log(`  Query:     "${QUERY.slice(0, 50)}..."`);
			
 
				+
			
 
				+  // Run benchmarks
			
 
				+  const results: BenchResult[] = [];
			
 
				+
			
 
				+  for (const p of PARALLEL_CONFIGS) {
			
 
				+    if (!llama.gpu && p > 1) {
			
 
				+      // CPU: only test if we have enough cores (at least 4 per context)
			
 
				+      if (llama.cpuMathCores < p * 4) {
			
 
				+        console.log(`\n  [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`);
			
 
				+        continue;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // Test with flash attention
			
 
				+    process.stdout.write(`\n  [${p} ctx, flash] running...`);
			
 
				+    try {
			
 
				+      const r = await benchmarkConfig(model, llama, docs, p, true);
			
 
				+      results.push(r);
			
 
				+      process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
			
 
				+    } catch (e: any) {
			
 
				+      process.stdout.write(` failed: ${e.message}\n`);
			
 
				+      // Try without flash
			
 
				+      process.stdout.write(`  [${p} ctx, no flash] running...`);
			
 
				+      try {
			
 
				+        const r = await benchmarkConfig(model, llama, docs, p, false);
			
 
				+        results.push(r);
			
 
				+        process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
			
 
				+      } catch (e2: any) {
			
 
				+        process.stdout.write(` failed: ${e2.message}\n`);
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // Summary table
			
 
				+  console.log("\n═══════════════════════════════════════════════════════════════");
			
 
				+  console.log("  Results");
			
 
				+  console.log("═══════════════════════════════════════════════════════════════\n");
			
 
				+
			
 
				+  const header = "  Ctx  Flash  Median    Docs/s   VRAM/ctx   Total VRAM  Peak RSS";
			
 
				+  const sep    = "  ───  ─────  ──────    ──────   ────────   ──────────  ────────";
			
 
				+  console.log(header);
			
 
				+  console.log(sep);
			
 
				+
			
 
				+  const baseline = results[0]?.medianMs ?? 1;
			
 
				+  for (const r of results) {
			
 
				+    const speedup = baseline / r.medianMs;
			
 
				+    const speedupStr = r === results[0] ? "      " : `(${speedup.toFixed(1)}×)`;
			
 
				+    console.log(
			
 
				+      `  ${String(r.parallelism).padStart(3)}  ` +
			
 
				+      `${r.flashAttention ? " yes " : "  no "}  ` +
			
 
				+      `${r.medianMs.toFixed(0).padStart(5)}ms  ` +
			
 
				+      `${r.docsPerSec.toFixed(1).padStart(6)}  ` +
			
 
				+      `${formatBytes(r.vramPerContext).padStart(8)}  ` +
			
 
				+      `${formatBytes(r.totalVram).padStart(10)}  ` +
			
 
				+      `${formatBytes(r.peakRss).padStart(8)}  ` +
			
 
				+      speedupStr
			
 
				+    );
			
 
				+  }
			
 
				+
			
 
				+  // Best config
			
 
				+  if (results.length > 0) {
			
 
				+    const best = results.reduce((a, b) => a.docsPerSec > b.docsPerSec ? a : b);
			
 
				+    console.log(`\n  Best: ${best.parallelism} contexts, flash=${best.flashAttention}`);
			
 
				+    console.log(`        ${best.medianMs.toFixed(0)}ms for ${DOC_COUNT} docs (${best.docsPerSec.toFixed(1)} docs/s)`);
			
 
				+    if (best.totalVram > 0) console.log(`        ${formatBytes(best.totalVram)} VRAM`);
			
 
				+  }
			
 
				+
			
 
				+  console.log("");
			
 
				+  await model.dispose();
			
 
				+  await llama.dispose();
			
 
				+}
			
 
				+
			
 
				+main().catch(console.error);
			
--- a/src/llm.test.ts
+++ b/src/llm.test.ts
@@ -221,10 +221,15 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
 
				       const successCount = allResults.filter(r => r !== null).length;
			
 
				       expect(successCount).toBe(10);
			
 
				 
			
 
				-      // THE KEY ASSERTION: Only 1 context should be created, not 5
			
 
				-      // Without the fix, contextCreateCount would be 5 (one per concurrent embedBatch call)
			
 
				-      console.log(`Context creation count: ${contextCreateCount} (expected: 1)`);
			
 
				-      expect(contextCreateCount).toBe(1);
			
 
				+      // THE KEY ASSERTION: Contexts should be created once (by ensureEmbedContexts),
			
 
				+      // not duplicated per concurrent embedBatch call. The exact count depends on
			
 
				+      // available VRAM (computeParallelism), but should not be 5 (one per call).
			
 
				+      // Without the fix, contextCreateCount would be 5× the intended count (one set per concurrent call).
			
 
				+      // With the promise guard, contexts are created exactly once regardless of concurrent callers.
			
 
				+      // The count depends on VRAM (computeParallelism), but should be ≤ 8 (the cap).
			
 
				+      console.log(`Context creation count: ${contextCreateCount} (expected: ≤ 8, not 5× duplicated)`);
			
 
				+      expect(contextCreateCount).toBeGreaterThanOrEqual(1);
			
 
				+      expect(contextCreateCount).toBeLessThanOrEqual(8);
			
 
				       
			
 
				       await freshLlm.dispose();
			
 
				     }, 60000);
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -6,6 +6,7 @@
 
				 
			
 
				 import {
			
 
				   getLlama,
			
 
				+  getLlamaGpuTypes,
			
 
				   resolveModelFile,
			
 
				   LlamaChatSession,
			
 
				   LlamaLogLevel,
			
@@ -354,10 +355,10 @@ const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
 
				 export class LlamaCpp implements LLM {
			
 
				   private llama: Llama | null = null;
			
 
				   private embedModel: LlamaModel | null = null;
			
 
				-  private embedContext: LlamaEmbeddingContext | null = null;
			
 
				+  private embedContexts: LlamaEmbeddingContext[] = [];
			
 
				   private generateModel: LlamaModel | null = null;
			
 
				   private rerankModel: LlamaModel | null = null;
			
 
				-  private rerankContext: Awaited<ReturnType<LlamaModel["createRankingContext"]>> | null = null;
			
 
				+  private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
			
 
				 
			
 
				   private embedModelUri: string;
			
 
				   private generateModelUri: string;
			
@@ -366,7 +367,6 @@ export class LlamaCpp implements LLM {
 
				 
			
 
				   // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
			
 
				   private embedModelLoadPromise: Promise<LlamaModel> | null = null;
			
 
				-  private embedContextCreatePromise: Promise<LlamaEmbeddingContext> | null = null;
			
 
				   private generateModelLoadPromise: Promise<LlamaModel> | null = null;
			
 
				   private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
			
 
				 
			
@@ -423,7 +423,7 @@ export class LlamaCpp implements LLM {
 
				    * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
			
 
				    */
			
 
				   private hasLoadedContexts(): boolean {
			
 
				-    return !!(this.embedContext || this.rerankContext);
			
 
				+    return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
			
 
				   }
			
 
				 
			
 
				   /**
			
@@ -445,14 +445,14 @@ export class LlamaCpp implements LLM {
 
				     }
			
 
				 
			
 
				     // Dispose contexts first
			
 
				-    if (this.embedContext) {
			
 
				-      await this.embedContext.dispose();
			
 
				-      this.embedContext = null;
			
 
				+    for (const ctx of this.embedContexts) {
			
 
				+      await ctx.dispose();
			
 
				     }
			
 
				-    if (this.rerankContext) {
			
 
				-      await this.rerankContext.dispose();
			
 
				-      this.rerankContext = null;
			
 
				+    this.embedContexts = [];
			
 
				+    for (const ctx of this.rerankContexts) {
			
 
				+      await ctx.dispose();
			
 
				     }
			
 
				+    this.rerankContexts = [];
			
 
				 
			
 
				     // Optionally dispose models too (opt-in)
			
 
				     if (this.disposeModelsOnInactivity) {
			
@@ -491,7 +491,33 @@ export class LlamaCpp implements LLM {
 
				    */
			
 
				   private async ensureLlama(): Promise<Llama> {
			
 
				     if (!this.llama) {
			
 
				-      this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
			
 
				+      // Detect available GPU types and use the best one.
			
 
				+      // We can't rely on gpu:"auto" — it returns false even when CUDA is available
			
 
				+      // (likely a binary/build config issue in node-llama-cpp).
			
 
				+      const gpuTypes = await getLlamaGpuTypes();
			
 
				+      // Prefer CUDA > Metal > Vulkan > CPU
			
 
				+      const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
			
 
				+
			
 
				+      let llama: Llama;
			
 
				+      if (preferred) {
			
 
				+        try {
			
 
				+          llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
			
 
				+        } catch {
			
 
				+          llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
			
 
				+          process.stderr.write(
			
 
				+            `QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`
			
 
				+          );
			
 
				+        }
			
 
				+      } else {
			
 
				+        llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
			
 
				+      }
			
 
				+
			
 
				+      if (!llama.gpu) {
			
 
				+        process.stderr.write(
			
 
				+          "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
			
 
				+        );
			
 
				+      }
			
 
				+      this.llama = llama;
			
 
				     }
			
 
				     return this.llama;
			
 
				   }
			
@@ -535,34 +561,92 @@ export class LlamaCpp implements LLM {
 
				   }
			
 
				 
			
 
				   /**
			
 
				-   * Load embedding context (lazy). Context can be disposed and recreated without reloading the model.
			
 
				-   * Uses promise guard to prevent concurrent context creation race condition.
			
 
				+   * Compute how many parallel contexts to create.
			
 
				+   *
			
 
				+   * GPU: constrained by VRAM (25% of free, capped at 8).
			
 
				+   * CPU: constrained by cores. Splitting threads across contexts enables
			
 
				+   *      true parallelism (each context runs on its own cores). Use at most
			
 
				+   *      half the math cores, with at least 4 threads per context.
			
 
				    */
			
 
				-  private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
			
 
				-    if (!this.embedContext) {
			
 
				-      // If context creation is already in progress, wait for it
			
 
				-      if (this.embedContextCreatePromise) {
			
 
				-        return await this.embedContextCreatePromise;
			
 
				+  private async computeParallelism(perContextMB: number): Promise<number> {
			
 
				+    const llama = await this.ensureLlama();
			
 
				+
			
 
				+    if (llama.gpu) {
			
 
				+      try {
			
 
				+        const vram = await llama.getVramState();
			
 
				+        const freeMB = vram.free / (1024 * 1024);
			
 
				+        const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
			
 
				+        return Math.max(1, Math.min(8, maxByVram));
			
 
				+      } catch {
			
 
				+        return 2;
			
 
				       }
			
 
				+    }
			
 
				 
			
 
				-      // Start context creation and store promise so concurrent calls wait
			
 
				-      this.embedContextCreatePromise = (async () => {
			
 
				-        const model = await this.ensureEmbedModel();
			
 
				-        const context = await model.createEmbeddingContext();
			
 
				-        this.embedContext = context;
			
 
				-        return context;
			
 
				-      })();
			
 
				+    // CPU: split cores across contexts. At least 4 threads per context.
			
 
				+    const cores = llama.cpuMathCores || 4;
			
 
				+    const maxContexts = Math.floor(cores / 4);
			
 
				+    return Math.max(1, Math.min(4, maxContexts));
			
 
				+  }
			
 
				 
			
 
				-      try {
			
 
				-        const context = await this.embedContextCreatePromise;
			
 
				-        this.touchActivity();
			
 
				-        return context;
			
 
				-      } finally {
			
 
				-        this.embedContextCreatePromise = null;
			
 
				+  /**
			
 
				+   * Get the number of threads each context should use, given N parallel contexts.
			
 
				+   * Splits available math cores evenly across contexts.
			
 
				+   */
			
 
				+  private async threadsPerContext(parallelism: number): Promise<number> {
			
 
				+    const llama = await this.ensureLlama();
			
 
				+    if (llama.gpu) return 0; // GPU: let the library decide
			
 
				+    const cores = llama.cpuMathCores || 4;
			
 
				+    return Math.max(1, Math.floor(cores / parallelism));
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Load embedding contexts (lazy). Creates multiple for parallel embedding.
			
 
				+   * Uses promise guard to prevent concurrent context creation race condition.
			
 
				+   */
			
 
				+  private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;
			
 
				+
			
 
				+  private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
			
 
				+    if (this.embedContexts.length > 0) {
			
 
				+      this.touchActivity();
			
 
				+      return this.embedContexts;
			
 
				+    }
			
 
				+
			
 
				+    if (this.embedContextsCreatePromise) {
			
 
				+      return await this.embedContextsCreatePromise;
			
 
				+    }
			
 
				+
			
 
				+    this.embedContextsCreatePromise = (async () => {
			
 
				+      const model = await this.ensureEmbedModel();
			
 
				+      // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
			
 
				+      const n = await this.computeParallelism(150);
			
 
				+      const threads = await this.threadsPerContext(n);
			
 
				+      for (let i = 0; i < n; i++) {
			
 
				+        try {
			
 
				+          this.embedContexts.push(await model.createEmbeddingContext({
			
 
				+            ...(threads > 0 ? { threads } : {}),
			
 
				+          }));
			
 
				+        } catch {
			
 
				+          if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
			
 
				+          break;
			
 
				+        }
			
 
				       }
			
 
				+      this.touchActivity();
			
 
				+      return this.embedContexts;
			
 
				+    })();
			
 
				+
			
 
				+    try {
			
 
				+      return await this.embedContextsCreatePromise;
			
 
				+    } finally {
			
 
				+      this.embedContextsCreatePromise = null;
			
 
				     }
			
 
				-    this.touchActivity();
			
 
				-    return this.embedContext;
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Get a single embed context (for single-embed calls). Uses first from pool.
			
 
				+   */
			
 
				+  private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
			
 
				+    const contexts = await this.ensureEmbedContexts();
			
 
				+    return contexts[0]!;
			
 
				   }
			
 
				 
			
 
				   /**
			
@@ -624,15 +708,50 @@ export class LlamaCpp implements LLM {
 
				   }
			
 
				 
			
 
				   /**
			
 
				-   * Load rerank context (lazy). Context can be disposed and recreated without reloading the model.
			
 
				+   * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
			
 
				+   * Each context has its own sequence, so they can evaluate independently.
			
 
				+   *
			
 
				+   * Tuning choices:
			
 
				+   * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
			
 
				+   * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
			
 
				+   * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
			
 
				    */
			
 
				-  private async ensureRerankContext(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>> {
			
 
				-    if (!this.rerankContext) {
			
 
				+  // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
			
 
				+  // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
			
 
				+  // Use 2048 for safety margin. Still 17× less than auto (40960).
			
 
				+  private static readonly RERANK_CONTEXT_SIZE = 2048;
			
 
				+
			
 
				+  private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
			
 
				+    if (this.rerankContexts.length === 0) {
			
 
				       const model = await this.ensureRerankModel();
			
 
				-      this.rerankContext = await model.createRankingContext();
			
 
				+      // ~960 MB per context with flash attention at contextSize 2048
			
 
				+      const n = await this.computeParallelism(1000);
			
 
				+      const threads = await this.threadsPerContext(n);
			
 
				+      for (let i = 0; i < n; i++) {
			
 
				+        try {
			
 
				+          this.rerankContexts.push(await model.createRankingContext({
			
 
				+            contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
			
 
				+            flashAttention: true,
			
 
				+            ...(threads > 0 ? { threads } : {}),
			
 
				+          }));
			
 
				+        } catch {
			
 
				+          if (this.rerankContexts.length === 0) {
			
 
				+            // Flash attention might not be supported — retry without it
			
 
				+            try {
			
 
				+              this.rerankContexts.push(await model.createRankingContext({
			
 
				+                contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
			
 
				+                ...(threads > 0 ? { threads } : {}),
			
 
				+              }));
			
 
				+            } catch {
			
 
				+              throw new Error("Failed to create any rerank context");
			
 
				+            }
			
 
				+          }
			
 
				+          break;
			
 
				+        }
			
 
				+      }
			
 
				     }
			
 
				     this.touchActivity();
			
 
				-    return this.rerankContext;
			
 
				+    return this.rerankContexts;
			
 
				   }
			
 
				 
			
 
				   // ==========================================================================
			
@@ -703,26 +822,51 @@ export class LlamaCpp implements LLM {
 
				     if (texts.length === 0) return [];
			
 
				 
			
 
				     try {
			
 
				-      const context = await this.ensureEmbedContext();
			
 
				-
			
 
				-      // node-llama-cpp handles batching internally when we make parallel requests
			
 
				-      const embeddings = await Promise.all(
			
 
				-        texts.map(async (text) => {
			
 
				+      const contexts = await this.ensureEmbedContexts();
			
 
				+      const n = contexts.length;
			
 
				+
			
 
				+      if (n === 1) {
			
 
				+        // Single context: sequential (no point splitting)
			
 
				+        const context = contexts[0]!;
			
 
				+        const embeddings = [];
			
 
				+        for (const text of texts) {
			
 
				           try {
			
 
				             const embedding = await context.getEmbeddingFor(text);
			
 
				-            this.touchActivity();  // Keep-alive during slow batches
			
 
				-            return {
			
 
				-              embedding: Array.from(embedding.vector),
			
 
				-              model: this.embedModelUri,
			
 
				-            };
			
 
				+            this.touchActivity();
			
 
				+            embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
			
 
				           } catch (err) {
			
 
				             console.error("Embedding error for text:", err);
			
 
				-            return null;
			
 
				+            embeddings.push(null);
			
 
				+          }
			
 
				+        }
			
 
				+        return embeddings;
			
 
				+      }
			
 
				+
			
 
				+      // Multiple contexts: split texts across contexts for parallel evaluation
			
 
				+      const chunkSize = Math.ceil(texts.length / n);
			
 
				+      const chunks = Array.from({ length: n }, (_, i) =>
			
 
				+        texts.slice(i * chunkSize, (i + 1) * chunkSize)
			
 
				+      );
			
 
				+
			
 
				+      const chunkResults = await Promise.all(
			
 
				+        chunks.map(async (chunk, i) => {
			
 
				+          const ctx = contexts[i]!;
			
 
				+          const results: (EmbeddingResult | null)[] = [];
			
 
				+          for (const text of chunk) {
			
 
				+            try {
			
 
				+              const embedding = await ctx.getEmbeddingFor(text);
			
 
				+              this.touchActivity();
			
 
				+              results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
			
 
				+            } catch (err) {
			
 
				+              console.error("Embedding error for text:", err);
			
 
				+              results.push(null);
			
 
				+            }
			
 
				           }
			
 
				+          return results;
			
 
				         })
			
 
				       );
			
 
				 
			
 
				-      return embeddings;
			
 
				+      return chunkResults.flat();
			
 
				     } catch (error) {
			
 
				       console.error("Batch embedding error:", error);
			
 
				       return texts.map(() => null);
			
@@ -879,7 +1023,7 @@ export class LlamaCpp implements LLM {
 
				     // Ping activity at start to keep models alive during this operation
			
 
				     this.touchActivity();
			
 
				 
			
 
				-    const context = await this.ensureRerankContext();
			
 
				+    const contexts = await this.ensureRerankContexts();
			
 
				 
			
 
				     // Build a map from document text to original indices (for lookup after sorting)
			
 
				     const textToDoc = new Map<string, { file: string; index: number }>();
			
@@ -890,8 +1034,24 @@ export class LlamaCpp implements LLM {
 
				     // Extract just the text for ranking
			
 
				     const texts = documents.map((doc) => doc.text);
			
 
				 
			
 
				-    // Use the proper ranking API - returns [{document: string, score: number}] sorted by score
			
 
				-    const ranked = await context.rankAndSort(query, texts);
			
 
				+    // Split documents across contexts for parallel evaluation.
			
 
				+    // Each context has its own sequence with a lock, so parallelism comes
			
 
				+    // from multiple contexts evaluating different chunks simultaneously.
			
 
				+    const n = contexts.length;
			
 
				+    const chunkSize = Math.ceil(texts.length / n);
			
 
				+    const chunks = Array.from({ length: n }, (_, i) =>
			
 
				+      texts.slice(i * chunkSize, (i + 1) * chunkSize)
			
 
				+    ).filter(chunk => chunk.length > 0);
			
 
				+
			
 
				+    const allScores = await Promise.all(
			
 
				+      chunks.map((chunk, i) => contexts[i]!.rankAll(query, chunk))
			
 
				+    );
			
 
				+
			
 
				+    // Reassemble scores in original order and sort
			
 
				+    const flatScores = allScores.flat();
			
 
				+    const ranked = texts
			
 
				+      .map((text, i) => ({ document: text, score: flatScores[i]! }))
			
 
				+      .sort((a, b) => b.score - a.score);
			
 
				 
			
 
				     // Map back to our result format using the text-to-doc map
			
 
				     const results: RerankDocumentResult[] = ranked.map((item) => {
			
@@ -909,6 +1069,35 @@ export class LlamaCpp implements LLM {
 
				     };
			
 
				   }
			
 
				 
			
 
				+  /**
			
 
				+   * Get device/GPU info for status display.
			
 
				+   * Initializes llama if not already done.
			
 
				+   */
			
 
				+  async getDeviceInfo(): Promise<{
			
 
				+    gpu: string | false;
			
 
				+    gpuOffloading: boolean;
			
 
				+    gpuDevices: string[];
			
 
				+    vram?: { total: number; used: number; free: number };
			
 
				+    cpuCores: number;
			
 
				+  }> {
			
 
				+    const llama = await this.ensureLlama();
			
 
				+    const gpuDevices = await llama.getGpuDeviceNames();
			
 
				+    let vram: { total: number; used: number; free: number } | undefined;
			
 
				+    if (llama.gpu) {
			
 
				+      try {
			
 
				+        const state = await llama.getVramState();
			
 
				+        vram = { total: state.total, used: state.used, free: state.free };
			
 
				+      } catch { /* no vram info */ }
			
 
				+    }
			
 
				+    return {
			
 
				+      gpu: llama.gpu,
			
 
				+      gpuOffloading: llama.supportsGpuOffloading,
			
 
				+      gpuDevices,
			
 
				+      vram,
			
 
				+      cpuCores: llama.cpuMathCores,
			
 
				+    };
			
 
				+  }
			
 
				+
			
 
				   async dispose(): Promise<void> {
			
 
				     // Prevent double-dispose
			
 
				     if (this.disposed) {
			
@@ -932,8 +1121,8 @@ export class LlamaCpp implements LLM {
 
				     }
			
 
				 
			
 
				     // Clear references
			
 
				-    this.embedContext = null;
			
 
				-    this.rerankContext = null;
			
 
				+    this.embedContexts = [];
			
 
				+    this.rerankContexts = [];
			
 
				     this.embedModel = null;
			
 
				     this.generateModel = null;
			
 
				     this.rerankModel = null;
			
@@ -941,7 +1130,7 @@ export class LlamaCpp implements LLM {
 
				 
			
 
				     // Clear any in-flight load/create promises
			
 
				     this.embedModelLoadPromise = null;
			
 
				-    this.embedContextCreatePromise = null;
			
 
				+    this.embedContextsCreatePromise = null;
			
 
				     this.generateModelLoadPromise = null;
			
 
				     this.rerankModelLoadPromise = null;
			
 
				   }
			
--- a/src/mcp.ts
+++ b/src/mcp.ts
@@ -1,4 +1,3 @@
 
				-#!/usr/bin/env bun
			
 
				 /**
			
 
				  * QMD MCP Server - Model Context Protocol server for QMD
			
 
				  *
			
@@ -8,6 +7,8 @@
 
				  * Follows MCP spec 2025-06-18 for proper response types.
			
 
				  */
			
 
				 
			
 
				+import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
			
 
				+import { fileURLToPath } from "url";
			
 
				 import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
			
 
				 import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
			
 
				 import { WebStandardStreamableHTTPServerTransport }
			
@@ -147,7 +148,7 @@ function buildInstructions(store: Store): string {
 
				  */
			
 
				 function createMcpServer(store: Store): McpServer {
			
 
				   const server = new McpServer(
			
 
				-    { name: "qmd", version: "1.0.0" },
			
 
				+    { name: "qmd", version: "0.9.9" },
			
 
				     { instructions: buildInstructions(store) },
			
 
				   );
			
 
				 
			
@@ -237,9 +238,7 @@ function createMcpServer(store: Store): McpServer {
 
				       },
			
 
				     },
			
 
				     async ({ query, limit, minScore, collection }) => {
			
 
				-      // Note: Collection filtering is now done post-search since collections are managed in YAML
			
 
				-      const results = store.searchFTS(query, limit || 10)
			
 
				-        .filter(r => !collection || r.collectionName === collection);
			
 
				+      const results = store.searchFTS(query, limit || 10, collection);
			
 
				       const filtered: SearchResultItem[] = results
			
 
				         .filter(r => r.score >= (minScore || 0))
			
 
				         .map(r => {
			
@@ -541,7 +540,7 @@ export async function startMcpServer(): Promise<void> {
 
				 // =============================================================================
			
 
				 
			
 
				 export type HttpServerHandle = {
			
 
				-  httpServer: ReturnType<typeof Bun.serve>;
			
 
				+  httpServer: import("http").Server;
			
 
				   port: number;
			
 
				   stop: () => Promise<void>;
			
 
				 };
			
@@ -588,47 +587,79 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
 
				     if (!quiet) console.error(msg);
			
 
				   }
			
 
				 
			
 
				-  const httpServer = Bun.serve({
			
 
				-    port,
			
 
				-    hostname: "localhost",
			
 
				-    async fetch(req) {
			
 
				-      const reqStart = Date.now();
			
 
				-      const pathname = new URL(req.url).pathname;
			
 
				-
			
 
				-      if (pathname === "/health" && req.method === "GET") {
			
 
				-        const res = Response.json({
			
 
				-          status: "ok",
			
 
				-          uptime: Math.floor((Date.now() - startTime) / 1000),
			
 
				-        });
			
 
				+  // Helper to collect request body
			
 
				+  async function collectBody(req: IncomingMessage): Promise<string> {
			
 
				+    const chunks: Buffer[] = [];
			
 
				+    for await (const chunk of req) chunks.push(chunk as Buffer);
			
 
				+    return Buffer.concat(chunks).toString();
			
 
				+  }
			
 
				+
			
 
				+  const httpServer = createServer(async (nodeReq: IncomingMessage, nodeRes: ServerResponse) => {
			
 
				+    const reqStart = Date.now();
			
 
				+    const pathname = nodeReq.url || "/";
			
 
				+
			
 
				+    try {
			
 
				+      if (pathname === "/health" && nodeReq.method === "GET") {
			
 
				+        const body = JSON.stringify({ status: "ok", uptime: Math.floor((Date.now() - startTime) / 1000) });
			
 
				+        nodeRes.writeHead(200, { "Content-Type": "application/json" });
			
 
				+        nodeRes.end(body);
			
 
				         log(`${ts()} GET /health (${Date.now() - reqStart}ms)`);
			
 
				-        return res;
			
 
				+        return;
			
 
				       }
			
 
				 
			
 
				-      if (pathname === "/mcp" && req.method === "POST") {
			
 
				-        const body = await req.json();
			
 
				+      if (pathname === "/mcp" && nodeReq.method === "POST") {
			
 
				+        const rawBody = await collectBody(nodeReq);
			
 
				+        const body = JSON.parse(rawBody);
			
 
				         const label = describeRequest(body);
			
 
				-        const res = await transport.handleRequest(req, { parsedBody: body });
			
 
				+        const url = `http://localhost:${port}${pathname}`;
			
 
				+        const headers: Record<string, string> = {};
			
 
				+        for (const [k, v] of Object.entries(nodeReq.headers)) {
			
 
				+          if (typeof v === "string") headers[k] = v;
			
 
				+        }
			
 
				+        const request = new Request(url, { method: "POST", headers, body: rawBody });
			
 
				+        const response = await transport.handleRequest(request, { parsedBody: body });
			
 
				+        nodeRes.writeHead(response.status, Object.fromEntries(response.headers));
			
 
				+        nodeRes.end(Buffer.from(await response.arrayBuffer()));
			
 
				         log(`${ts()} POST /mcp ${label} (${Date.now() - reqStart}ms)`);
			
 
				-        return res;
			
 
				+        return;
			
 
				       }
			
 
				 
			
 
				-      // Pass other methods (GET, DELETE) to transport for protocol handling
			
 
				       if (pathname === "/mcp") {
			
 
				-        return transport.handleRequest(req);
			
 
				+        const url = `http://localhost:${port}${pathname}`;
			
 
				+        const headers: Record<string, string> = {};
			
 
				+        for (const [k, v] of Object.entries(nodeReq.headers)) {
			
 
				+          if (typeof v === "string") headers[k] = v;
			
 
				+        }
			
 
				+        const rawBody = nodeReq.method !== "GET" && nodeReq.method !== "HEAD" ? await collectBody(nodeReq) : undefined;
			
 
				+        const request = new Request(url, { method: nodeReq.method || "GET", headers, ...(rawBody ? { body: rawBody } : {}) });
			
 
				+        const response = await transport.handleRequest(request);
			
 
				+        nodeRes.writeHead(response.status, Object.fromEntries(response.headers));
			
 
				+        nodeRes.end(Buffer.from(await response.arrayBuffer()));
			
 
				+        return;
			
 
				       }
			
 
				 
			
 
				-      return new Response("Not Found", { status: 404 });
			
 
				-    },
			
 
				+      nodeRes.writeHead(404);
			
 
				+      nodeRes.end("Not Found");
			
 
				+    } catch (err) {
			
 
				+      console.error("HTTP handler error:", err);
			
 
				+      nodeRes.writeHead(500);
			
 
				+      nodeRes.end("Internal Server Error");
			
 
				+    }
			
 
				+  });
			
 
				+
			
 
				+  await new Promise<void>((resolve, reject) => {
			
 
				+    httpServer.on("error", reject);
			
 
				+    httpServer.listen(port, "localhost", () => resolve());
			
 
				   });
			
 
				 
			
 
				-  const actualPort = httpServer.port;
			
 
				+  const actualPort = (httpServer.address() as import("net").AddressInfo).port;
			
 
				 
			
 
				   let stopping = false;
			
 
				   const stop = async () => {
			
 
				     if (stopping) return;
			
 
				     stopping = true;
			
 
				     await transport.close();
			
 
				-    httpServer.stop();
			
 
				+    httpServer.close();
			
 
				     store.close();
			
 
				     await disposeDefaultLlamaCpp();
			
 
				   };
			
@@ -649,6 +680,6 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
 
				 }
			
 
				 
			
 
				 // Run if this is the main module
			
 
				-if (import.meta.main) {
			
 
				+if (fileURLToPath(import.meta.url) === process.argv[1] || process.argv[1]?.endsWith("/mcp.ts")) {
			
 
				   startMcpServer().catch(console.error);
			
 
				 }
			
--- a/src/qmd.ts
+++ b/src/qmd.ts
@@ -65,7 +65,7 @@ import {
 
				   createStore,
			
 
				   getDefaultDbPath,
			
 
				 } from "./store.js";
			
 
				-import { disposeDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
			
 
				+import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
			
 
				 import {
			
 
				   formatSearchResults,
			
 
				   formatDocuments,
			
@@ -249,7 +249,7 @@ function formatBytes(bytes: number): string {
 
				   return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
			
 
				 }
			
 
				 
			
 
				-function showStatus(): void {
			
 
				+async function showStatus(): Promise<void> {
			
 
				   const dbPath = getDbPath();
			
 
				   const db = getDb();
			
 
				 
			
@@ -362,6 +362,36 @@ function showStatus(): void {
 
				     console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
			
 
				   }
			
 
				 
			
 
				+  // Device / GPU info
			
 
				+  try {
			
 
				+    const llm = getDefaultLlamaCpp();
			
 
				+    const device = await llm.getDeviceInfo();
			
 
				+    console.log(`\n${c.bold}Device${c.reset}`);
			
 
				+    if (device.gpu) {
			
 
				+      console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
			
 
				+      if (device.gpuDevices.length > 0) {
			
 
				+        // Deduplicate and count GPUs
			
 
				+        const counts = new Map<string, number>();
			
 
				+        for (const name of device.gpuDevices) {
			
 
				+          counts.set(name, (counts.get(name) || 0) + 1);
			
 
				+        }
			
 
				+        const deviceStr = Array.from(counts.entries())
			
 
				+          .map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
			
 
				+          .join(', ');
			
 
				+        console.log(`  Devices:  ${deviceStr}`);
			
 
				+      }
			
 
				+      if (device.vram) {
			
 
				+        console.log(`  VRAM:     ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
			
 
				+      }
			
 
				+    } else {
			
 
				+      console.log(`  GPU:      ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
			
 
				+      console.log(`  ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
			
 
				+    }
			
 
				+    console.log(`  CPU:      ${device.cpuCores} math cores`);
			
 
				+  } catch {
			
 
				+    // Don't fail status if LLM init fails
			
 
				+  }
			
 
				+
			
 
				   closeDb();
			
 
				 }
			
 
				 
			
@@ -1871,8 +1901,7 @@ function search(query: string, opts: OutputOptions): void {
 
				 
			
 
				   // Use large limit for --all, otherwise fetch more than needed and let outputResults filter
			
 
				   const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);
			
 
				-  // searchFTS accepts collection name as number parameter for legacy reasons (will be fixed in store.ts)
			
 
				-  const results = searchFTS(db, query, fetchLimit, collectionName as any);
			
 
				+  const results = searchFTS(db, query, fetchLimit, collectionName);
			
 
				 
			
 
				   // Add context to results
			
 
				   const resultsWithContext = results.map(r => ({
			
@@ -2348,7 +2377,7 @@ if (import.meta.main) {
 
				     }
			
 
				 
			
 
				     case "status":
			
 
				-      showStatus();
			
 
				+      await showStatus();
			
 
				       break;
			
 
				 
			
 
				     case "update":
			
--- a/src/store.test.ts
+++ b/src/store.test.ts
@@ -1219,8 +1219,8 @@ describe("FTS Search", () => {
 
				     const allResults = store.searchFTS("searchable", 10);
			
 
				     expect(allResults).toHaveLength(2);
			
 
				 
			
 
				-    // Filter by collection name (collectionId is now treated as collection name string)
			
 
				-    const filtered = store.searchFTS("searchable", 10, collection1 as unknown as number);
			
 
				+    // Filter by collection name
			
 
				+    const filtered = store.searchFTS("searchable", 10, collection1);
			
 
				     expect(filtered).toHaveLength(1);
			
 
				     expect(filtered[0]!.displayPath).toBe(`${collection1}/doc1.md`);
			
 
				 
			
--- a/src/store.ts
+++ b/src/store.ts
@@ -830,8 +830,8 @@ export type Store = {
 
				   toVirtualPath: (absolutePath: string) => string | null;
			
 
				 
			
 
				   // Search
			
 
				-  searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
			
 
				-  searchVec: (query: string, model: string, limit?: number, collectionName?: string) => Promise<SearchResult[]>;
			
 
				+  searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
			
 
				+  searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
			
 
				 
			
 
				   // Query expansion & reranking
			
 
				   expandQuery: (query: string, model?: string) => Promise<ExpandedQuery[]>;
			
@@ -913,8 +913,8 @@ export function createStore(dbPath?: string): Store {
 
				     toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
			
 
				 
			
 
				     // Search
			
 
				-    searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
			
 
				-    searchVec: (query: string, model: string, limit?: number, collectionName?: string) => searchVec(db, query, model, limit, collectionName),
			
 
				+    searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName),
			
 
				+    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
			
 
				 
			
 
				     // Query expansion & reranking
			
 
				     expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
			
@@ -2020,7 +2020,7 @@ function buildFTS5Query(query: string): string | null {
 
				   return terms.map(t => `"${t}"*`).join(' AND ');
			
 
				 }
			
 
				 
			
 
				-export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
			
 
				+export function searchFTS(db: Database, query: string, limit: number = 20, collectionName?: string): SearchResult[] {
			
 
				   const ftsQuery = buildFTS5Query(query);
			
 
				   if (!ftsQuery) return [];
			
 
				 
			
@@ -2039,12 +2039,9 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
 
				   `;
			
 
				   const params: (string | number)[] = [ftsQuery];
			
 
				 
			
 
				-  if (collectionId) {
			
 
				-    // Note: collectionId is a legacy parameter that should be phased out
			
 
				-    // Collections are now managed in YAML. For now, we interpret it as a collection name filter.
			
 
				-    // This code path is likely unused as collection filtering should be done at CLI level.
			
 
				+  if (collectionName) {
			
 
				     sql += ` AND d.collection = ?`;
			
 
				-    params.push(String(collectionId));
			
 
				+    params.push(String(collectionName));
			
 
				   }
			
 
				 
			
 
				   // bm25 lower is better; sort ascending.
			
@@ -2080,11 +2077,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
 
				 // Vector Search
			
 
				 // =============================================================================
			
 
				 
			
 
				-export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession): Promise<SearchResult[]> {
			
 
				+export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]> {
			
 
				   const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
			
 
				   if (!tableExists) return [];
			
 
				 
			
 
				-  const embedding = await getEmbedding(query, model, true, session);
			
 
				+  const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
			
 
				   if (!embedding) return [];
			
 
				 
			
 
				   // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
			
@@ -2848,8 +2845,8 @@ export async function hybridQuery(
 
				   ).get();
			
 
				 
			
 
				   // Step 1: BM25 probe — strong signal skips expensive LLM expansion
			
 
				-  const initialFts = store.searchFTS(query, 20)
			
 
				-    .filter(r => !collection || r.collectionName === collection);
			
 
				+  // Pass collection directly into FTS query (filter at SQL level, not post-hoc)
			
 
				+  const initialFts = store.searchFTS(query, 20, collection);
			
 
				   const topScore = initialFts[0]?.score ?? 0;
			
 
				   const secondScore = initialFts[1]?.score ?? 0;
			
 
				   const hasStrongSignal = initialFts.length > 0
			
@@ -2875,26 +2872,15 @@ export async function hybridQuery(
 
				   }
			
 
				 
			
 
				   // Step 3: Route searches by query type
			
 
				-  // Original query → vector search (FTS already covered by probe in step 1).
			
 
				-  // Vector searches run sequentially — node-llama-cpp's embed context
			
 
				-  // hangs on concurrent embed() calls (known limitation).
			
 
				-  if (hasVectors) {
			
 
				-    const vecResults = await store.searchVec(query, DEFAULT_EMBED_MODEL, 20, collection);
			
 
				-    if (vecResults.length > 0) {
			
 
				-      for (const r of vecResults) docidMap.set(r.filepath, r.docid);
			
 
				-      rankedLists.push(vecResults.map(r => ({
			
 
				-        file: r.filepath, displayPath: r.displayPath,
			
 
				-        title: r.title, body: r.body || "", score: r.score,
			
 
				-      })));
			
 
				-    }
			
 
				-  }
			
 
				+  //
			
 
				+  // Strategy: run all FTS queries immediately (they're sync/instant), then
			
 
				+  // batch-embed all vector queries in one embedBatch() call, then run
			
 
				+  // sqlite-vec lookups with pre-computed embeddings.
			
 
				 
			
 
				-  // Expanded queries → route by type: lex→FTS only, vec/hyde→vector only.
			
 
				-  // This restores the CLI's query-type-aware routing that was lost in the initial refactor.
			
 
				+  // 3a: Run FTS for all lex expansions right away (no LLM needed)
			
 
				   for (const q of expanded) {
			
 
				     if (q.type === 'lex') {
			
 
				-      const ftsResults = store.searchFTS(q.text, 20)
			
 
				-        .filter(r => !collection || r.collectionName === collection);
			
 
				+      const ftsResults = store.searchFTS(q.text, 20, collection);
			
 
				       if (ftsResults.length > 0) {
			
 
				         for (const r of ftsResults) docidMap.set(r.filepath, r.docid);
			
 
				         rankedLists.push(ftsResults.map(r => ({
			
@@ -2902,17 +2888,40 @@ export async function hybridQuery(
 
				           title: r.title, body: r.body || "", score: r.score,
			
 
				         })));
			
 
				       }
			
 
				-    } else {
			
 
				-      // vec or hyde → vector search only
			
 
				-      if (hasVectors) {
			
 
				-        const vecResults = await store.searchVec(q.text, DEFAULT_EMBED_MODEL, 20, collection);
			
 
				-        if (vecResults.length > 0) {
			
 
				-          for (const r of vecResults) docidMap.set(r.filepath, r.docid);
			
 
				-          rankedLists.push(vecResults.map(r => ({
			
 
				-            file: r.filepath, displayPath: r.displayPath,
			
 
				-            title: r.title, body: r.body || "", score: r.score,
			
 
				-          })));
			
 
				-        }
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // 3b: Collect all texts that need vector search (original query + vec/hyde expansions)
			
 
				+  if (hasVectors) {
			
 
				+    const vecQueries: { text: string; isOriginal: boolean }[] = [
			
 
				+      { text: query, isOriginal: true },
			
 
				+    ];
			
 
				+    for (const q of expanded) {
			
 
				+      if (q.type === 'vec' || q.type === 'hyde') {
			
 
				+        vecQueries.push({ text: q.text, isOriginal: false });
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // Batch embed all vector queries in a single call
			
 
				+    const llm = getDefaultLlamaCpp();
			
 
				+    const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text));
			
 
				+    const embeddings = await llm.embedBatch(textsToEmbed);
			
 
				+
			
 
				+    // Run sqlite-vec lookups with pre-computed embeddings
			
 
				+    for (let i = 0; i < vecQueries.length; i++) {
			
 
				+      const embedding = embeddings[i]?.embedding;
			
 
				+      if (!embedding) continue;
			
 
				+
			
 
				+      const vecResults = await store.searchVec(
			
 
				+        vecQueries[i]!.text, DEFAULT_EMBED_MODEL, 20, collection,
			
 
				+        undefined, embedding
			
 
				+      );
			
 
				+      if (vecResults.length > 0) {
			
 
				+        for (const r of vecResults) docidMap.set(r.filepath, r.docid);
			
 
				+        rankedLists.push(vecResults.map(r => ({
			
 
				+          file: r.filepath, displayPath: r.displayPath,
			
 
				+          title: r.title, body: r.body || "", score: r.score,
			
 
				+        })));
			
 
				       }
			
 
				     }
			
 
				   }