4 bulan lalu · 4ac95b5e26
--- a/src/llm.test.ts
+++ b/src/llm.test.ts
@@ -221,10 +221,15 @@ describe("LlamaCpp Integration", () => {
 
				       const successCount = allResults.filter(r => r !== null).length;
			
 
				       expect(successCount).toBe(10);
			
 
				 
			
 
				-      // THE KEY ASSERTION: Only 1 context should be created, not 5
			
 
				-      // Without the fix, contextCreateCount would be 5 (one per concurrent embedBatch call)
			
 
				-      console.log(`Context creation count: ${contextCreateCount} (expected: 1)`);
			
 
				-      expect(contextCreateCount).toBe(1);
			
 
				+      // THE KEY ASSERTION: Contexts should be created once (by ensureEmbedContexts),
			
 
				+      // not duplicated per concurrent embedBatch call. The exact count depends on
			
 
				+      // available VRAM (computeParallelism), but should not be 5 (one per call).
			
 
				+      // Without the fix, contextCreateCount would be 5× the intended count (one set per concurrent call).
			
 
				+      // With the promise guard, contexts are created exactly once regardless of concurrent callers.
			
 
				+      // The count depends on VRAM (computeParallelism), but should be ≤ 8 (the cap).
			
 
				+      console.log(`Context creation count: ${contextCreateCount} (expected: ≤ 8, not 5× duplicated)`);
			
 
				+      expect(contextCreateCount).toBeGreaterThanOrEqual(1);
			
 
				+      expect(contextCreateCount).toBeLessThanOrEqual(8);
			
 
				       
			
 
				       await freshLlm.dispose();
			
 
				     }, 60000);
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -354,7 +354,7 @@ const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
 
				 export class LlamaCpp implements LLM {
			
 
				   private llama: Llama | null = null;
			
 
				   private embedModel: LlamaModel | null = null;
			
 
				-  private embedContext: LlamaEmbeddingContext | null = null;
			
 
				+  private embedContexts: LlamaEmbeddingContext[] = [];
			
 
				   private generateModel: LlamaModel | null = null;
			
 
				   private rerankModel: LlamaModel | null = null;
			
 
				   private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
			
@@ -366,7 +366,6 @@ export class LlamaCpp implements LLM {
 
				 
			
 
				   // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
			
 
				   private embedModelLoadPromise: Promise<LlamaModel> | null = null;
			
 
				-  private embedContextCreatePromise: Promise<LlamaEmbeddingContext> | null = null;
			
 
				   private generateModelLoadPromise: Promise<LlamaModel> | null = null;
			
 
				   private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
			
 
				 
			
@@ -423,7 +422,7 @@ export class LlamaCpp implements LLM {
 
				    * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
			
 
				    */
			
 
				   private hasLoadedContexts(): boolean {
			
 
				-    return !!(this.embedContext || this.rerankContexts.length > 0);
			
 
				+    return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
			
 
				   }
			
 
				 
			
 
				   /**
			
@@ -445,10 +444,10 @@ export class LlamaCpp implements LLM {
 
				     }
			
 
				 
			
 
				     // Dispose contexts first
			
 
				-    if (this.embedContext) {
			
 
				-      await this.embedContext.dispose();
			
 
				-      this.embedContext = null;
			
 
				+    for (const ctx of this.embedContexts) {
			
 
				+      await ctx.dispose();
			
 
				     }
			
 
				+    this.embedContexts = [];
			
 
				     for (const ctx of this.rerankContexts) {
			
 
				       await ctx.dispose();
			
 
				     }
			
@@ -557,34 +556,69 @@ export class LlamaCpp implements LLM {
 
				   }
			
 
				 
			
 
				   /**
			
 
				-   * Load embedding context (lazy). Context can be disposed and recreated without reloading the model.
			
 
				+   * Compute how many parallel contexts to create based on available VRAM.
			
 
				+   * Conservative: uses at most 25% of free VRAM for contexts, capped at 8.
			
 
				+   */
			
 
				+  private async computeParallelism(perContextMB: number): Promise<number> {
			
 
				+    const llama = await this.ensureLlama();
			
 
				+    if (!llama.gpu) return 1; // CPU: no benefit from multiple contexts
			
 
				+
			
 
				+    try {
			
 
				+      const vram = await llama.getVramState();
			
 
				+      const freeMB = vram.free / (1024 * 1024);
			
 
				+      // Use at most 25% of free VRAM, min 1, max 8
			
 
				+      const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
			
 
				+      return Math.max(1, Math.min(8, maxByVram));
			
 
				+    } catch {
			
 
				+      return 2; // Conservative fallback
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Load embedding contexts (lazy). Creates multiple for parallel embedding.
			
 
				    * Uses promise guard to prevent concurrent context creation race condition.
			
 
				    */
			
 
				-  private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
			
 
				-    if (!this.embedContext) {
			
 
				-      // If context creation is already in progress, wait for it
			
 
				-      if (this.embedContextCreatePromise) {
			
 
				-        return await this.embedContextCreatePromise;
			
 
				-      }
			
 
				+  private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;
			
 
				 
			
 
				-      // Start context creation and store promise so concurrent calls wait
			
 
				-      this.embedContextCreatePromise = (async () => {
			
 
				-        const model = await this.ensureEmbedModel();
			
 
				-        const context = await model.createEmbeddingContext();
			
 
				-        this.embedContext = context;
			
 
				-        return context;
			
 
				-      })();
			
 
				+  private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
			
 
				+    if (this.embedContexts.length > 0) {
			
 
				+      this.touchActivity();
			
 
				+      return this.embedContexts;
			
 
				+    }
			
 
				 
			
 
				-      try {
			
 
				-        const context = await this.embedContextCreatePromise;
			
 
				-        this.touchActivity();
			
 
				-        return context;
			
 
				-      } finally {
			
 
				-        this.embedContextCreatePromise = null;
			
 
				+    if (this.embedContextsCreatePromise) {
			
 
				+      return await this.embedContextsCreatePromise;
			
 
				+    }
			
 
				+
			
 
				+    this.embedContextsCreatePromise = (async () => {
			
 
				+      const model = await this.ensureEmbedModel();
			
 
				+      // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
			
 
				+      const n = await this.computeParallelism(150);
			
 
				+      for (let i = 0; i < n; i++) {
			
 
				+        try {
			
 
				+          this.embedContexts.push(await model.createEmbeddingContext());
			
 
				+        } catch {
			
 
				+          if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
			
 
				+          break;
			
 
				+        }
			
 
				       }
			
 
				+      this.touchActivity();
			
 
				+      return this.embedContexts;
			
 
				+    })();
			
 
				+
			
 
				+    try {
			
 
				+      return await this.embedContextsCreatePromise;
			
 
				+    } finally {
			
 
				+      this.embedContextsCreatePromise = null;
			
 
				     }
			
 
				-    this.touchActivity();
			
 
				-    return this.embedContext;
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Get a single embed context (for single-embed calls). Uses first from pool.
			
 
				+   */
			
 
				+  private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
			
 
				+    const contexts = await this.ensureEmbedContexts();
			
 
				+    return contexts[0]!;
			
 
				   }
			
 
				 
			
 
				   /**
			
@@ -648,21 +682,24 @@ export class LlamaCpp implements LLM {
 
				   /**
			
 
				    * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
			
 
				    * Each context has its own sequence, so they can evaluate independently.
			
 
				+   *
			
 
				+   * Uses contextSize 1024 instead of auto (40960) — reranking chunks are ~800
			
 
				+   * tokens max, so 1024 is plenty. This drops VRAM from 11.6 GB to 711 MB per context.
			
 
				    */
			
 
				-  private static readonly RERANK_PARALLEL_CONTEXTS = 4;
			
 
				+  private static readonly RERANK_CONTEXT_SIZE = 1024;
			
 
				 
			
 
				   private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
			
 
				     if (this.rerankContexts.length === 0) {
			
 
				       const model = await this.ensureRerankModel();
			
 
				-      const n = LlamaCpp.RERANK_PARALLEL_CONTEXTS;
			
 
				-      // Create contexts sequentially to avoid VRAM allocation races
			
 
				+      // Rerank contexts are ~711 MB each at contextSize 1024
			
 
				+      const n = await this.computeParallelism(750);
			
 
				       for (let i = 0; i < n; i++) {
			
 
				         try {
			
 
				-          this.rerankContexts.push(await model.createRankingContext());
			
 
				+          this.rerankContexts.push(await model.createRankingContext({
			
 
				+            contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
			
 
				+          }));
			
 
				         } catch {
			
 
				-          // VRAM exhausted — use however many we got
			
 
				           if (this.rerankContexts.length === 0) {
			
 
				-            // Must have at least one
			
 
				             throw new Error("Failed to create any rerank context");
			
 
				           }
			
 
				           break;
			
@@ -741,26 +778,51 @@ export class LlamaCpp implements LLM {
 
				     if (texts.length === 0) return [];
			
 
				 
			
 
				     try {
			
 
				-      const context = await this.ensureEmbedContext();
			
 
				-
			
 
				-      // node-llama-cpp handles batching internally when we make parallel requests
			
 
				-      const embeddings = await Promise.all(
			
 
				-        texts.map(async (text) => {
			
 
				+      const contexts = await this.ensureEmbedContexts();
			
 
				+      const n = contexts.length;
			
 
				+
			
 
				+      if (n === 1) {
			
 
				+        // Single context: sequential (no point splitting)
			
 
				+        const context = contexts[0]!;
			
 
				+        const embeddings = [];
			
 
				+        for (const text of texts) {
			
 
				           try {
			
 
				             const embedding = await context.getEmbeddingFor(text);
			
 
				-            this.touchActivity();  // Keep-alive during slow batches
			
 
				-            return {
			
 
				-              embedding: Array.from(embedding.vector),
			
 
				-              model: this.embedModelUri,
			
 
				-            };
			
 
				+            this.touchActivity();
			
 
				+            embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
			
 
				           } catch (err) {
			
 
				             console.error("Embedding error for text:", err);
			
 
				-            return null;
			
 
				+            embeddings.push(null);
			
 
				+          }
			
 
				+        }
			
 
				+        return embeddings;
			
 
				+      }
			
 
				+
			
 
				+      // Multiple contexts: split texts across contexts for parallel evaluation
			
 
				+      const chunkSize = Math.ceil(texts.length / n);
			
 
				+      const chunks = Array.from({ length: n }, (_, i) =>
			
 
				+        texts.slice(i * chunkSize, (i + 1) * chunkSize)
			
 
				+      );
			
 
				+
			
 
				+      const chunkResults = await Promise.all(
			
 
				+        chunks.map(async (chunk, i) => {
			
 
				+          const ctx = contexts[i]!;
			
 
				+          const results: (EmbeddingResult | null)[] = [];
			
 
				+          for (const text of chunk) {
			
 
				+            try {
			
 
				+              const embedding = await ctx.getEmbeddingFor(text);
			
 
				+              this.touchActivity();
			
 
				+              results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
			
 
				+            } catch (err) {
			
 
				+              console.error("Embedding error for text:", err);
			
 
				+              results.push(null);
			
 
				+            }
			
 
				           }
			
 
				+          return results;
			
 
				         })
			
 
				       );
			
 
				 
			
 
				-      return embeddings;
			
 
				+      return chunkResults.flat();
			
 
				     } catch (error) {
			
 
				       console.error("Batch embedding error:", error);
			
 
				       return texts.map(() => null);
			
@@ -1015,7 +1077,7 @@ export class LlamaCpp implements LLM {
 
				     }
			
 
				 
			
 
				     // Clear references
			
 
				-    this.embedContext = null;
			
 
				+    this.embedContexts = [];
			
 
				     this.rerankContexts = [];
			
 
				     this.embedModel = null;
			
 
				     this.generateModel = null;
			
@@ -1024,7 +1086,7 @@ export class LlamaCpp implements LLM {
 
				 
			
 
				     // Clear any in-flight load/create promises
			
 
				     this.embedModelLoadPromise = null;
			
 
				-    this.embedContextCreatePromise = null;
			
 
				+    this.embedContextsCreatePromise = null;
			
 
				     this.generateModelLoadPromise = null;
			
 
				     this.rerankModelLoadPromise = null;
			
 
				   }