hace 6 meses · e8f4dce0b7
--- a/src/eval.test.ts
+++ b/src/eval.test.ts
@@ -34,7 +34,7 @@ import {
 
				   DEFAULT_EMBED_MODEL,
			
 
				   type RankedResult,
			
 
				 } from "./store";
			
 
				-import { getDefaultLlamaCpp, formatDocForEmbedding } from "./llm";
			
 
				+import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, formatDocForEmbedding } from "./llm";
			
 
				 
			
 
				 // Eval queries with expected documents
			
 
				 const evalQueries: {
			
@@ -201,8 +201,8 @@ describe("Vector Search", () => {
 
				     hasEmbeddings = true;
			
 
				   }, 120000); // 2 minute timeout for embedding generation
			
 
				 
			
 
				-  // Note: Don't call disposeDefaultLlamaCpp() here - it causes Metal backend
			
 
				-  // assertion failures during process exit. Let the process exit handle cleanup.
			
 
				+  // Note: Don't dispose here - Hybrid tests also use llama.
			
 
				+  // Dispose happens in the global afterAll.
			
 
				 
			
 
				   test("easy queries: ≥60% Hit@3 (vector should match keywords too)", async () => {
			
 
				     if (!hasEmbeddings) return; // Skip if embedding failed
			
@@ -392,6 +392,8 @@ describe("Hybrid Search (RRF)", () => {
 
				 // Cleanup
			
 
				 // =============================================================================
			
 
				 
			
 
				-afterAll(() => {
			
 
				+afterAll(async () => {
			
 
				+  // Dispose llama before process exit to properly free Metal resources
			
 
				+  await disposeDefaultLlamaCpp();
			
 
				   rmSync(tempDir, { recursive: true, force: true });
			
 
				 });
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -704,19 +704,38 @@ Generate the structured expansion:`;
 
				       this.inactivityTimer = null;
			
 
				     }
			
 
				 
			
 
				-    // Don't explicitly dispose llama resources - it causes Metal backend
			
 
				-    // assertion failures during process cleanup. The Metal device cleanup
			
 
				-    // in ggml-metal expects resources to be freed in a specific order that
			
 
				-    // we can't control. Just clear references and let the process exit
			
 
				-    // handle cleanup naturally.
			
 
				-    // See: https://github.com/ggml-org/llama.cpp/pull/17869
			
 
				-    this.embedContext = null;
			
 
				-    this.generateContext = null;
			
 
				-    this.rerankContext = null;
			
 
				-    this.embedModel = null;
			
 
				-    this.generateModel = null;
			
 
				-    this.rerankModel = null;
			
 
				-    this.llama = null;
			
 
				+    // Dispose in order: contexts -> models -> llama
			
 
				+    // Contexts depend on models, models depend on llama
			
 
				+    if (this.embedContext) {
			
 
				+      await this.embedContext.dispose();
			
 
				+      this.embedContext = null;
			
 
				+    }
			
 
				+    if (this.generateContext) {
			
 
				+      await this.generateContext.dispose();
			
 
				+      this.generateContext = null;
			
 
				+    }
			
 
				+    if (this.rerankContext) {
			
 
				+      await this.rerankContext.dispose();
			
 
				+      this.rerankContext = null;
			
 
				+    }
			
 
				+
			
 
				+    if (this.embedModel) {
			
 
				+      await this.embedModel.dispose();
			
 
				+      this.embedModel = null;
			
 
				+    }
			
 
				+    if (this.generateModel) {
			
 
				+      await this.generateModel.dispose();
			
 
				+      this.generateModel = null;
			
 
				+    }
			
 
				+    if (this.rerankModel) {
			
 
				+      await this.rerankModel.dispose();
			
 
				+      this.rerankModel = null;
			
 
				+    }
			
 
				+
			
 
				+    if (this.llama) {
			
 
				+      await this.llama.dispose();
			
 
				+      this.llama = null;
			
 
				+    }
			
 
				   }
			
 
				 }