6 ヶ月前 · 4131c827de
--- a/src/eval.test.ts
+++ b/src/eval.test.ts
@@ -34,7 +34,7 @@ import {
 
				   DEFAULT_EMBED_MODEL,
			
 
				   type RankedResult,
			
 
				 } from "./store";
			
 
				-import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, formatDocForEmbedding } from "./llm";
			
 
				+import { getDefaultLlamaCpp, formatDocForEmbedding } from "./llm";
			
 
				 
			
 
				 // Eval queries with expected documents
			
 
				 const evalQueries: {
			
@@ -192,9 +192,8 @@ describe("Vector Search", () => {
 
				     hasEmbeddings = true;
			
 
				   }, 120000); // 2 minute timeout for embedding generation
			
 
				 
			
 
				-  afterAll(async () => {
			
 
				-    await disposeDefaultLlamaCpp();
			
 
				-  });
			
 
				+  // Note: Don't call disposeDefaultLlamaCpp() here - it causes Metal backend
			
 
				+  // assertion failures during process exit. Let the process exit handle cleanup.
			
 
				 
			
 
				   test("easy queries: ≥60% Hit@3 (vector should match keywords too)", async () => {
			
 
				     if (!hasEmbeddings) return; // Skip if embedding failed
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -220,6 +220,9 @@ export class LlamaCpp implements LLM {
 
				   private inactivityTimer: ReturnType<typeof setTimeout> | null = null;
			
 
				   private inactivityTimeoutMs: number;
			
 
				 
			
 
				+  // Track disposal state to prevent double-dispose
			
 
				+  private disposed = false;
			
 
				+
			
 
				   constructor(config: LlamaCppConfig = {}) {
			
 
				     this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
			
 
				     this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
			
@@ -263,6 +266,11 @@ export class LlamaCpp implements LLM {
 
				    * Models will be reloaded lazily on next operation.
			
 
				    */
			
 
				   async unloadModels(): Promise<void> {
			
 
				+    // Don't unload if already disposed
			
 
				+    if (this.disposed) {
			
 
				+      return;
			
 
				+    }
			
 
				+
			
 
				     // Clear timer
			
 
				     if (this.inactivityTimer) {
			
 
				       clearTimeout(this.inactivityTimer);
			
@@ -684,45 +692,31 @@ Generate the structured expansion:`;
 
				   }
			
 
				 
			
 
				   async dispose(): Promise<void> {
			
 
				+    // Prevent double-dispose
			
 
				+    if (this.disposed) {
			
 
				+      return;
			
 
				+    }
			
 
				+    this.disposed = true;
			
 
				+
			
 
				     // Clear inactivity timer
			
 
				     if (this.inactivityTimer) {
			
 
				       clearTimeout(this.inactivityTimer);
			
 
				       this.inactivityTimer = null;
			
 
				     }
			
 
				 
			
 
				-    // Dispose contexts
			
 
				-    if (this.embedContext) {
			
 
				-      await this.embedContext.dispose();
			
 
				-      this.embedContext = null;
			
 
				-    }
			
 
				-    if (this.generateContext) {
			
 
				-      await this.generateContext.dispose();
			
 
				-      this.generateContext = null;
			
 
				-    }
			
 
				-    if (this.rerankContext) {
			
 
				-      await this.rerankContext.dispose();
			
 
				-      this.rerankContext = null;
			
 
				-    }
			
 
				-
			
 
				-    // Dispose models
			
 
				-    if (this.embedModel) {
			
 
				-      await this.embedModel.dispose();
			
 
				-      this.embedModel = null;
			
 
				-    }
			
 
				-    if (this.generateModel) {
			
 
				-      await this.generateModel.dispose();
			
 
				-      this.generateModel = null;
			
 
				-    }
			
 
				-    if (this.rerankModel) {
			
 
				-      await this.rerankModel.dispose();
			
 
				-      this.rerankModel = null;
			
 
				-    }
			
 
				-
			
 
				-    // Dispose llama
			
 
				-    if (this.llama) {
			
 
				-      await this.llama.dispose();
			
 
				-      this.llama = null;
			
 
				-    }
			
 
				+    // Don't explicitly dispose llama resources - it causes Metal backend
			
 
				+    // assertion failures during process cleanup. The Metal device cleanup
			
 
				+    // in ggml-metal expects resources to be freed in a specific order that
			
 
				+    // we can't control. Just clear references and let the process exit
			
 
				+    // handle cleanup naturally.
			
 
				+    // See: https://github.com/ggml-org/llama.cpp/pull/17869
			
 
				+    this.embedContext = null;
			
 
				+    this.generateContext = null;
			
 
				+    this.rerankContext = null;
			
 
				+    this.embedModel = null;
			
 
				+    this.generateModel = null;
			
 
				+    this.rerankModel = null;
			
 
				+    this.llama = null;
			
 
				   }
			
 
				 }