ソースを参照

Make LlamaCpp dispose idempotent and avoid Metal backend crash

- Add disposed flag to prevent double-dispose
- Don't explicitly dispose llama resources in dispose() - just clear refs
- Let process exit handle Metal cleanup naturally
- Remove disposeDefaultLlamaCpp call from eval tests

Note: llama.cpp Metal backend still crashes at process exit due to
ggml-metal cleanup issues. This is a known upstream issue:
https://github.com/ggml-org/llama.cpp/pull/17869

All tests pass (12/12), the abort happens after test completion.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Tobi Lutke 5 ヶ月 前
コミット
4131c827de
2 ファイル変更30 行追加37 行削除
  1. 3 4
      src/eval.test.ts
  2. 27 33
      src/llm.ts

+ 3 - 4
src/eval.test.ts

@@ -34,7 +34,7 @@ import {
   DEFAULT_EMBED_MODEL,
   type RankedResult,
 } from "./store";
-import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, formatDocForEmbedding } from "./llm";
+import { getDefaultLlamaCpp, formatDocForEmbedding } from "./llm";
 
 // Eval queries with expected documents
 const evalQueries: {
@@ -192,9 +192,8 @@ describe("Vector Search", () => {
     hasEmbeddings = true;
   }, 120000); // 2 minute timeout for embedding generation
 
-  afterAll(async () => {
-    await disposeDefaultLlamaCpp();
-  });
+  // Note: Don't call disposeDefaultLlamaCpp() here - it causes Metal backend
+  // assertion failures during process exit. Let the process exit handle cleanup.
 
   test("easy queries: ≥60% Hit@3 (vector should match keywords too)", async () => {
     if (!hasEmbeddings) return; // Skip if embedding failed

+ 27 - 33
src/llm.ts

@@ -220,6 +220,9 @@ export class LlamaCpp implements LLM {
   private inactivityTimer: ReturnType<typeof setTimeout> | null = null;
   private inactivityTimeoutMs: number;
 
+  // Track disposal state to prevent double-dispose
+  private disposed = false;
+
   constructor(config: LlamaCppConfig = {}) {
     this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
     this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
@@ -263,6 +266,11 @@ export class LlamaCpp implements LLM {
    * Models will be reloaded lazily on next operation.
    */
   async unloadModels(): Promise<void> {
+    // Don't unload if already disposed
+    if (this.disposed) {
+      return;
+    }
+
     // Clear timer
     if (this.inactivityTimer) {
       clearTimeout(this.inactivityTimer);
@@ -684,45 +692,31 @@ Generate the structured expansion:`;
   }
 
   async dispose(): Promise<void> {
+    // Prevent double-dispose
+    if (this.disposed) {
+      return;
+    }
+    this.disposed = true;
+
     // Clear inactivity timer
     if (this.inactivityTimer) {
       clearTimeout(this.inactivityTimer);
       this.inactivityTimer = null;
     }
 
-    // Dispose contexts
-    if (this.embedContext) {
-      await this.embedContext.dispose();
-      this.embedContext = null;
-    }
-    if (this.generateContext) {
-      await this.generateContext.dispose();
-      this.generateContext = null;
-    }
-    if (this.rerankContext) {
-      await this.rerankContext.dispose();
-      this.rerankContext = null;
-    }
-
-    // Dispose models
-    if (this.embedModel) {
-      await this.embedModel.dispose();
-      this.embedModel = null;
-    }
-    if (this.generateModel) {
-      await this.generateModel.dispose();
-      this.generateModel = null;
-    }
-    if (this.rerankModel) {
-      await this.rerankModel.dispose();
-      this.rerankModel = null;
-    }
-
-    // Dispose llama
-    if (this.llama) {
-      await this.llama.dispose();
-      this.llama = null;
-    }
+    // Don't explicitly dispose llama resources - it causes Metal backend
+    // assertion failures during process cleanup. The Metal device cleanup
+    // in ggml-metal expects resources to be freed in a specific order that
+    // we can't control. Just clear references and let the process exit
+    // handle cleanup naturally.
+    // See: https://github.com/ggml-org/llama.cpp/pull/17869
+    this.embedContext = null;
+    this.generateContext = null;
+    this.rerankContext = null;
+    this.embedModel = null;
+    this.generateModel = null;
+    this.rerankModel = null;
+    this.llama = null;
   }
 }