Browse Source

perf: CPU parallelism via multi-context thread splitting

Our assumption that CPU can't benefit from multiple contexts was
wrong. The withLock in node-llama-cpp serializes within a single
context, but separate contexts with split threads run on different
cores in true parallel.

Key changes:
- computeParallelism() can now return >1 on CPU (floor(cores / 4), clamped to 1–4)
- threadsPerContext() splits math cores evenly across contexts
- Both embed and rerank contexts get proper thread counts
- Benchmark updated to test CPU parallelism

Before (CPU, 40 docs): 9.7s (4.1 docs/s) — 6 threads, 1 context
After  (CPU, 40 docs): 2.3s (17.2 docs/s) — 32 threads, 8 contexts

Two fixes stacked:
1. Thread count: default was 6 (library hardcode), now uses all
   math cores — 2× improvement alone
2. Multi-context: splitting cores across 8 contexts gives another
   2.2× on top (benchmark best case; computeParallelism() caps CPU at
   4 contexts, which measured 1.7×)

End-to-end 'qmd query' on CPU: 10.3s → 2.9s

CPU benchmark (Threadripper PRO 7975WX, 32 math cores):
  1 ctx: 5001ms (8.0 docs/s)
  2 ctx: 3585ms (11.2 docs/s)  1.4×
  4 ctx: 2874ms (13.9 docs/s)  1.7×
  8 ctx: 2323ms (17.2 docs/s)  2.2×
Tobi Lütke 3 months ago
parent
commit
392934e78a
2 changed files with 50 additions and 16 deletions
  1. 12 4
      src/bench-rerank.ts
  2. 38 12
      src/llm.ts

+ 12 - 4
src/bench-rerank.ts

@@ -108,18 +108,23 @@ async function benchmarkConfig(
   const vramBefore = llama.gpu ? await llama.getVramState() : null;
   const rssBefore = getMemUsage().rss;
 
-  // Create contexts
+  // Create contexts. On CPU, split threads evenly across contexts.
+  const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0;
   const contexts = [];
   for (let i = 0; i < parallelism; i++) {
     try {
       contexts.push(await model.createRankingContext({
         contextSize: CONTEXT_SIZE,
         flashAttention: flash,
+        ...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
       }));
     } catch {
       if (contexts.length === 0) {
         // Try without flash
-        contexts.push(await model.createRankingContext({ contextSize: CONTEXT_SIZE }));
+        contexts.push(await model.createRankingContext({
+          contextSize: CONTEXT_SIZE,
+          ...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
+        }));
       }
       break;
     }
@@ -253,8 +258,11 @@ async function main() {
 
   for (const p of PARALLEL_CONFIGS) {
     if (!llama.gpu && p > 1) {
-      console.log(`\n  [${p} ctx] skipped (CPU — no benefit from parallelism)`);
-      continue;
+      // CPU: only test if we have enough cores (at least 4 per context)
+      if (llama.cpuMathCores < p * 4) {
+        console.log(`\n  [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`);
+        continue;
+      }
     }
 
     // Test with flash attention

+ 38 - 12
src/llm.ts

@@ -561,22 +561,42 @@ export class LlamaCpp implements LLM {
   }
 
   /**
-   * Compute how many parallel contexts to create based on available VRAM.
-   * Conservative: uses at most 25% of free VRAM for contexts, capped at 8.
+   * Compute how many parallel contexts to create.
+   *
+   * GPU: constrained by VRAM (25% of free, capped at 8).
+   * CPU: constrained by cores. Splitting threads across contexts enables
+   *      true parallelism (each context runs on its own cores). Use
+   *      floor(cores / 4) contexts (min 1, max 4), so splits keep >= 4 threads each.
    */
   private async computeParallelism(perContextMB: number): Promise<number> {
     const llama = await this.ensureLlama();
-    if (!llama.gpu) return 1; // CPU: no benefit from multiple contexts
 
-    try {
-      const vram = await llama.getVramState();
-      const freeMB = vram.free / (1024 * 1024);
-      // Use at most 25% of free VRAM, min 1, max 8
-      const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
-      return Math.max(1, Math.min(8, maxByVram));
-    } catch {
-      return 2; // Conservative fallback
+    if (llama.gpu) {
+      try {
+        const vram = await llama.getVramState();
+        const freeMB = vram.free / (1024 * 1024);
+        const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
+        return Math.max(1, Math.min(8, maxByVram));
+      } catch {
+        return 2;
+      }
     }
+
+    // CPU: split cores across contexts. At least 4 threads per context.
+    const cores = llama.cpuMathCores || 4;
+    const maxContexts = Math.floor(cores / 4);
+    return Math.max(1, Math.min(4, maxContexts));
+  }
+
+  /**
+   * Get the number of threads each context should use, given N parallel contexts.
+   * Splits available math cores evenly across contexts.
+   */
+  private async threadsPerContext(parallelism: number): Promise<number> {
+    const llama = await this.ensureLlama();
+    if (llama.gpu) return 0; // GPU: let the library decide
+    const cores = llama.cpuMathCores || 4;
+    return Math.max(1, Math.floor(cores / parallelism));
   }
 
   /**
@@ -599,9 +619,12 @@ export class LlamaCpp implements LLM {
       const model = await this.ensureEmbedModel();
       // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
       const n = await this.computeParallelism(150);
+      const threads = await this.threadsPerContext(n);
       for (let i = 0; i < n; i++) {
         try {
-          this.embedContexts.push(await model.createEmbeddingContext());
+          this.embedContexts.push(await model.createEmbeddingContext({
+            ...(threads > 0 ? { threads } : {}),
+          }));
         } catch {
           if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
           break;
@@ -703,11 +726,13 @@ export class LlamaCpp implements LLM {
       const model = await this.ensureRerankModel();
       // ~960 MB per context with flash attention at contextSize 2048
       const n = await this.computeParallelism(1000);
+      const threads = await this.threadsPerContext(n);
       for (let i = 0; i < n; i++) {
         try {
           this.rerankContexts.push(await model.createRankingContext({
             contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
             flashAttention: true,
+            ...(threads > 0 ? { threads } : {}),
           }));
         } catch {
           if (this.rerankContexts.length === 0) {
@@ -715,6 +740,7 @@ export class LlamaCpp implements LLM {
             try {
               this.rerankContexts.push(await model.createRankingContext({
                 contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
+                ...(threads > 0 ? { threads } : {}),
               }));
             } catch {
               throw new Error("Failed to create any rerank context");