4 months ago · 0a0e1e6f29
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -357,7 +357,7 @@ export class LlamaCpp implements LLM {
 
				   private embedContext: LlamaEmbeddingContext | null = null;
			
 
				   private generateModel: LlamaModel | null = null;
			
 
				   private rerankModel: LlamaModel | null = null;
			
 
				-  private rerankContext: Awaited<ReturnType<LlamaModel["createRankingContext"]>> | null = null;
			
 
				+  private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
			
 
				 
			
 
				   private embedModelUri: string;
			
 
				   private generateModelUri: string;
			
@@ -423,7 +423,7 @@ export class LlamaCpp implements LLM {
 
				    * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
			
 
				    */
			
 
				   private hasLoadedContexts(): boolean {
			
 
				-    return !!(this.embedContext || this.rerankContext);
			
 
				+    return !!(this.embedContext || this.rerankContexts.length > 0);
			
 
				   }
			
 
				 
			
 
				   /**
			
@@ -449,10 +449,10 @@ export class LlamaCpp implements LLM {
 
				       await this.embedContext.dispose();
			
 
				       this.embedContext = null;
			
 
				     }
			
 
				-    if (this.rerankContext) {
			
 
				-      await this.rerankContext.dispose();
			
 
				-      this.rerankContext = null;
			
 
				+    for (const ctx of this.rerankContexts) {
			
 
				+      await ctx.dispose();
			
 
				     }
			
 
				+    this.rerankContexts = [];
			
 
				 
			
 
				     // Optionally dispose models too (opt-in)
			
 
				     if (this.disposeModelsOnInactivity) {
			
@@ -646,15 +646,31 @@ export class LlamaCpp implements LLM {
 
				   }
			
 
				 
			
 
				   /**
			
 
				-   * Load rerank context (lazy). Context can be disposed and recreated without reloading the model.
			
 
				+   * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
			
 
				+   * Each context has its own sequence, so they can evaluate independently.
			
 
				    */
			
 
				-  private async ensureRerankContext(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>> {
			
 
				-    if (!this.rerankContext) {
			
 
				+  private static readonly RERANK_PARALLEL_CONTEXTS = 4;
			
 
				+
			
 
				+  private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
			
 
				+    if (this.rerankContexts.length === 0) {
			
 
				       const model = await this.ensureRerankModel();
			
 
				-      this.rerankContext = await model.createRankingContext();
			
 
				+      const n = LlamaCpp.RERANK_PARALLEL_CONTEXTS;
			
 
				+      // Create contexts sequentially to avoid VRAM allocation races
			
 
				+      for (let i = 0; i < n; i++) {
			
 
				+        try {
			
 
				+          this.rerankContexts.push(await model.createRankingContext());
			
 
				+        } catch {
			
 
				+          // VRAM exhausted — use however many we got
			
 
				+          if (this.rerankContexts.length === 0) {
			
 
				+            // Must have at least one
			
 
				+            throw new Error("Failed to create any rerank context");
			
 
				+          }
			
 
				+          break;
			
 
				+        }
			
 
				+      }
			
 
				     }
			
 
				     this.touchActivity();
			
 
				-    return this.rerankContext;
			
 
				+    return this.rerankContexts;
			
 
				   }
			
 
				 
			
 
				   // ==========================================================================
			
@@ -901,7 +917,7 @@ export class LlamaCpp implements LLM {
 
				     // Ping activity at start to keep models alive during this operation
			
 
				     this.touchActivity();
			
 
				 
			
 
				-    const context = await this.ensureRerankContext();
			
 
				+    const contexts = await this.ensureRerankContexts();
			
 
				 
			
 
				     // Build a map from document text to original indices (for lookup after sorting)
			
 
				     const textToDoc = new Map<string, { file: string; index: number }>();
			
@@ -912,8 +928,24 @@ export class LlamaCpp implements LLM {
 
				     // Extract just the text for ranking
			
 
				     const texts = documents.map((doc) => doc.text);
			
 
				 
			
 
				-    // Use the proper ranking API - returns [{document: string, score: number}] sorted by score
			
 
				-    const ranked = await context.rankAndSort(query, texts);
			
 
				+    // Split documents across contexts for parallel evaluation.
			
 
				+    // Each context has its own sequence with a lock, so parallelism comes
			
 
				+    // from multiple contexts evaluating different chunks simultaneously.
			
 
				+    const n = contexts.length;
			
 
				+    const chunkSize = Math.ceil(texts.length / n);
			
 
				+    const chunks = Array.from({ length: n }, (_, i) =>
			
 
				+      texts.slice(i * chunkSize, (i + 1) * chunkSize)
			
 
				+    ).filter(chunk => chunk.length > 0);
			
 
				+
			
 
				+    const allScores = await Promise.all(
			
 
				+      chunks.map((chunk, i) => contexts[i]!.rankAll(query, chunk))
			
 
				+    );
			
 
				+
			
 
				+    // Reassemble scores in original order and sort
			
 
				+    const flatScores = allScores.flat();
			
 
				+    const ranked = texts
			
 
				+      .map((text, i) => ({ document: text, score: flatScores[i]! }))
			
 
				+      .sort((a, b) => b.score - a.score);
			
 
				 
			
 
				     // Map back to our result format using the text-to-doc map
			
 
				     const results: RerankDocumentResult[] = ranked.map((item) => {
			
@@ -984,7 +1016,7 @@ export class LlamaCpp implements LLM {
 
				 
			
 
				     // Clear references
			
 
				     this.embedContext = null;
			
 
				-    this.rerankContext = null;
			
 
				+    this.rerankContexts = [];
			
 
				     this.embedModel = null;
			
 
				     this.generateModel = null;
			
 
				     this.rerankModel = null;