5 months ago · 392934e78a
--- a/src/bench-rerank.ts
+++ b/src/bench-rerank.ts
@@ -108,18 +108,23 @@ async function benchmarkConfig(
 
				   const vramBefore = llama.gpu ? await llama.getVramState() : null;
			
 
				   const rssBefore = getMemUsage().rss;
			
 
				 
			
 
				-  // Create contexts
			
 
				+  // Create contexts. On CPU, split threads evenly across contexts.
			
 
				+  const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0;
			
 
				   const contexts = [];
			
 
				   for (let i = 0; i < parallelism; i++) {
			
 
				     try {
			
 
				       contexts.push(await model.createRankingContext({
			
 
				         contextSize: CONTEXT_SIZE,
			
 
				         flashAttention: flash,
			
 
				+        ...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
			
 
				       }));
			
 
				     } catch {
			
 
				       if (contexts.length === 0) {
			
 
				         // Try without flash
			
 
				-        contexts.push(await model.createRankingContext({ contextSize: CONTEXT_SIZE }));
			
 
				+        contexts.push(await model.createRankingContext({
			
 
				+          contextSize: CONTEXT_SIZE,
			
 
				+          ...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
			
 
				+        }));
			
 
				       }
			
 
				       break;
			
 
				     }
			
@@ -253,8 +258,11 @@ async function main() {
 
				 
			
 
				   for (const p of PARALLEL_CONFIGS) {
			
 
				     if (!llama.gpu && p > 1) {
			
 
				-      console.log(`\n  [${p} ctx] skipped (CPU — no benefit from parallelism)`);
			
 
				-      continue;
			
 
				+      // CPU: only test if we have enough cores (at least 4 per context)
			
 
				+      if (llama.cpuMathCores < p * 4) {
			
 
				+        console.log(`\n  [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`);
			
 
				+        continue;
			
 
				+      }
			
 
				     }
			
 
				 
			
 
				     // Test with flash attention
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -561,22 +561,42 @@ export class LlamaCpp implements LLM {
 
				   }
			
 
				 
			
 
				   /**
			
 
				-   * Compute how many parallel contexts to create based on available VRAM.
			
 
				-   * Conservative: uses at most 25% of free VRAM for contexts, capped at 8.
			
 
				+   * Compute how many parallel contexts to create.
			
 
				+   *
			
 
				+   * GPU: constrained by VRAM (25% of free, capped at 8).
			
 
				+   * CPU: constrained by cores. Splitting threads across contexts enables
			
 
				+   *      true parallelism (each context runs on its own cores). Creates one
			
 
				+   *      context per 4 math cores, clamped to between 1 and 4 contexts.
			
 
				    */
			
 
				   private async computeParallelism(perContextMB: number): Promise<number> {
			
 
				     const llama = await this.ensureLlama();
			
 
				-    if (!llama.gpu) return 1; // CPU: no benefit from multiple contexts
			
 
				 
			
 
				-    try {
			
 
				-      const vram = await llama.getVramState();
			
 
				-      const freeMB = vram.free / (1024 * 1024);
			
 
				-      // Use at most 25% of free VRAM, min 1, max 8
			
 
				-      const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
			
 
				-      return Math.max(1, Math.min(8, maxByVram));
			
 
				-    } catch {
			
 
				-      return 2; // Conservative fallback
			
 
				+    if (llama.gpu) {
			
 
				+      try {
			
 
				+        const vram = await llama.getVramState();
			
 
				+        const freeMB = vram.free / (1024 * 1024);
			
 
				+        const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
			
 
				+        return Math.max(1, Math.min(8, maxByVram));
			
 
				+      } catch {
			
 
				+        return 2;
			
 
				+      }
			
 
				     }
			
 
				+
			
 
				+    // CPU: one context per 4 math cores, clamped to 1..4 contexts.
			
 
				+    const cores = llama.cpuMathCores || 4;
			
 
				+    const maxContexts = Math.floor(cores / 4);
			
 
				+    return Math.max(1, Math.min(4, maxContexts));
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Get the number of threads each context should use, given N parallel contexts.
			
 
				+   * Splits math cores evenly across contexts; returns 0 on GPU so callers omit `threads`.
			
 
				+   */
			
 
				+  private async threadsPerContext(parallelism: number): Promise<number> {
			
 
				+    const llama = await this.ensureLlama();
			
 
				+    if (llama.gpu) return 0; // GPU: let the library decide
			
 
				+    const cores = llama.cpuMathCores || 4;
			
 
				+    return Math.max(1, Math.floor(cores / parallelism));
			
 
				   }
			
 
				 
			
 
				   /**
			
@@ -599,9 +619,12 @@ export class LlamaCpp implements LLM {
 
				       const model = await this.ensureEmbedModel();
			
 
				       // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
			
 
				       const n = await this.computeParallelism(150);
			
 
				+      const threads = await this.threadsPerContext(n);
			
 
				       for (let i = 0; i < n; i++) {
			
 
				         try {
			
 
				-          this.embedContexts.push(await model.createEmbeddingContext());
			
 
				+          this.embedContexts.push(await model.createEmbeddingContext({
			
 
				+            ...(threads > 0 ? { threads } : {}),
			
 
				+          }));
			
 
				         } catch {
			
 
				           if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
			
 
				           break;
			
@@ -703,11 +726,13 @@ export class LlamaCpp implements LLM {
 
				       const model = await this.ensureRerankModel();
			
 
				       // ~960 MB per context with flash attention at contextSize 2048
			
 
				       const n = await this.computeParallelism(1000);
			
 
				+      const threads = await this.threadsPerContext(n);
			
 
				       for (let i = 0; i < n; i++) {
			
 
				         try {
			
 
				           this.rerankContexts.push(await model.createRankingContext({
			
 
				             contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
			
 
				             flashAttention: true,
			
 
				+            ...(threads > 0 ? { threads } : {}),
			
 
				           }));
			
 
				         } catch {
			
 
				           if (this.rerankContexts.length === 0) {
			
@@ -715,6 +740,7 @@ export class LlamaCpp implements LLM {
 
				             try {
			
 
				               this.rerankContexts.push(await model.createRankingContext({
			
 
				                 contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
			
 
				+                ...(threads > 0 ? { threads } : {}),
			
 
				               }));
			
 
				             } catch {
			
 
				               throw new Error("Failed to create any rerank context");