6 months ago · f39db3a593
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -223,6 +223,9 @@ export class LlamaCpp implements LLM {
 
				   // Track disposal state to prevent double-dispose
			
 
				   private disposed = false;
			
 
				 
			
 
				+  // Mutex for generation to prevent "No sequences left" error with single sequence
			
 
				+  private generateLock: Promise<void> = Promise.resolve();
			
 
				+
			
 
				   constructor(config: LlamaCppConfig = {}) {
			
 
				     this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
			
 
				     this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
			
@@ -358,8 +361,8 @@ export class LlamaCpp implements LLM {
 
				       const llama = await this.ensureLlama();
			
 
				       const modelPath = await this.resolveModel(this.generateModelUri);
			
 
				       this.generateModel = await llama.loadModel({ modelPath });
			
 
				-      // Create context with 4 sequences for parallel generation support
			
 
				-      this.generateContext = await this.generateModel.createContext({ sequences: 4 });
			
 
				+      // Use single sequence to minimize VRAM when multiple models are loaded
			
 
				+      this.generateContext = await this.generateModel.createContext({ sequences: 1 });
			
 
				     }
			
 
				     this.touchActivity();
			
 
				     return this.generateContext;
			
@@ -467,6 +470,12 @@ export class LlamaCpp implements LLM {
 
				   }
			
 
				 
			
 
				   async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
			
 
				+    // Serialize generation calls to avoid "No sequences left" with single sequence
			
 
				+    let unlock: () => void;
			
 
				+    const waitForLock = this.generateLock;
			
 
				+    this.generateLock = new Promise(resolve => { unlock = resolve; });
			
 
				+    await waitForLock;
			
 
				+
			
 
				     try {
			
 
				       const context = await this.ensureGenerateContext();
			
 
				       const { LlamaChatSession } = await import("node-llama-cpp");
			
@@ -499,6 +508,8 @@ export class LlamaCpp implements LLM {
 
				     } catch (error) {
			
 
				       console.error("Generation error:", error);
			
 
				       return null;
			
 
				+    } finally {
			
 
				+      unlock!();
			
 
				     }
			
 
				   }