Parcourir la source

Fix VRAM and sequence exhaustion issues in generation

- Reduce context sequences from 4 to 1 to minimize VRAM usage
  when multiple models (embed, generate, rerank) are loaded
- Add mutex to serialize generation calls to prevent "No sequences left"
  error when concurrent requests occur with single sequence

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Tobi Lutke il y a 5 mois
Parent
commit
f39db3a593
1 fichiers modifiés avec 13 ajouts et 2 suppressions
  1. 13 2
      src/llm.ts

+ 13 - 2
src/llm.ts

@@ -223,6 +223,9 @@ export class LlamaCpp implements LLM {
   // Track disposal state to prevent double-dispose
   private disposed = false;
 
+  // Mutex for generation to prevent "No sequences left" error with single sequence
+  private generateLock: Promise<void> = Promise.resolve();
+
   constructor(config: LlamaCppConfig = {}) {
     this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
     this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
@@ -358,8 +361,8 @@ export class LlamaCpp implements LLM {
       const llama = await this.ensureLlama();
       const modelPath = await this.resolveModel(this.generateModelUri);
       this.generateModel = await llama.loadModel({ modelPath });
-      // Create context with 4 sequences for parallel generation support
-      this.generateContext = await this.generateModel.createContext({ sequences: 4 });
+      // Use single sequence to minimize VRAM when multiple models are loaded
+      this.generateContext = await this.generateModel.createContext({ sequences: 1 });
     }
     this.touchActivity();
     return this.generateContext;
@@ -467,6 +470,12 @@ export class LlamaCpp implements LLM {
   }
 
   async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
+    // Serialize generation calls to avoid "No sequences left" with single sequence
+    let unlock: () => void;
+    const waitForLock = this.generateLock;
+    this.generateLock = new Promise(resolve => { unlock = resolve; });
+    await waitForLock;
+
     try {
       const context = await this.ensureGenerateContext();
       const { LlamaChatSession } = await import("node-llama-cpp");
@@ -499,6 +508,8 @@ export class LlamaCpp implements LLM {
     } catch (error) {
       console.error("Generation error:", error);
       return null;
+    } finally {
+      unlock!();
     }
   }