Răsfoiți Sursa

feat: auto-detect GPU acceleration + device info in status

QMD was running all models on CPU even when CUDA/Vulkan/Metal
was available. The getLlama() call used no gpu option, defaulting
to false.

Now:
- ensureLlama() tries cuda → vulkan → metal → CPU fallback
- Prints a warning to stderr when falling back to CPU
- 'qmd status' shows GPU type, device names, VRAM, and CPU cores
- On this machine: a reranked query takes 7.5s on GPU vs 5+ minutes on CPU

The reranker (Qwen3-Reranker-0.6B) calls are serialized by a lock
in node-llama-cpp's rankAndSort() — each of the 40 chunks is
evaluated sequentially. This is inherent to the library's design
(single sequence context). GPU acceleration is the fix, not
batching — the lock prevents true parallelism regardless.
Tobi Lütke 3 luni în urmă
părinte
comite
ee86bba45e
2 a modificat fișierele cu 85 adăugiri și 4 ștergeri
  1. 52 1
      src/llm.ts
  2. 33 3
      src/qmd.ts

+ 52 - 1
src/llm.ts

@@ -491,7 +491,29 @@ export class LlamaCpp implements LLM {
    */
   private async ensureLlama(): Promise<Llama> {
     if (!this.llama) {
-      this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
+      // Probe GPU backends in order (cuda, vulkan, metal) and keep the first one that loads; otherwise fall back to CPU.
+      let llama: Llama | null = null;
+      for (const gpu of ["cuda", "vulkan", "metal"] as const) {
+        try {
+          llama = await getLlama({ gpu, logLevel: LlamaLogLevel.error });
+          break;
+        } catch {
+          // Any error thrown by getLlama() is treated as "backend unavailable"; continue with the next candidate.
+        }
+      }
+      if (!llama) { // No GPU backend loaded: initialize with gpu disabled and warn on stderr.
+        llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
+        process.stderr.write(
+          "QMD Warning: no GPU acceleration available, running models on CPU (this will be slow).\n" +
+          "Run 'qmd status' for device info. Install CUDA/Vulkan/Metal support for better performance.\n"
+        );
+      } else if (!llama.supportsGpuOffloading) { // A backend loaded but reports no offloading support; warn as well.
+        process.stderr.write(
+          "QMD Warning: GPU detected but offloading not supported, models will run on CPU.\n" +
+          "Run 'qmd status' for device info.\n"
+        );
+      }
+      this.llama = llama; // Cache the instance so later calls skip the probing above.
     }
     return this.llama;
   }
@@ -909,6 +931,35 @@ export class LlamaCpp implements LLM {
     };
   }
 
+  /**
+   * Collect the GPU backend, GPU device names, VRAM usage and CPU math-core count for status display.
+   * Calls ensureLlama() first, so the llama instance is initialized here if that has not happened yet.
+   */
+  async getDeviceInfo(): Promise<{
+    gpu: string | false;
+    gpuOffloading: boolean;
+    gpuDevices: string[];
+    vram?: { total: number; used: number; free: number };
+    cpuCores: number;
+  }> {
+    const llama = await this.ensureLlama();
+    const gpuDevices = await llama.getGpuDeviceNames();
+    let vram: { total: number; used: number; free: number } | undefined;
+    if (llama.gpu) { // VRAM is only queried when llama.gpu is truthy.
+      try {
+        const state = await llama.getVramState();
+        vram = { total: state.total, used: state.used, free: state.free };
+      } catch { /* VRAM query failed; leave vram undefined instead of failing the whole call */ }
+    }
+    return {
+      gpu: llama.gpu,
+      gpuOffloading: llama.supportsGpuOffloading,
+      gpuDevices,
+      vram,
+      cpuCores: llama.cpuMathCores,
+    };
+  }
+
   async dispose(): Promise<void> {
     // Prevent double-dispose
     if (this.disposed) {

+ 33 - 3
src/qmd.ts

@@ -65,7 +65,7 @@ import {
   createStore,
   getDefaultDbPath,
 } from "./store.js";
-import { disposeDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
+import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
 import {
   formatSearchResults,
   formatDocuments,
@@ -249,7 +249,7 @@ function formatBytes(bytes: number): string {
   return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
 }
 
-function showStatus(): void {
+async function showStatus(): Promise<void> {
   const dbPath = getDbPath();
   const db = getDb();
 
@@ -362,6 +362,36 @@ function showStatus(): void {
     console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
   }
 
+  // Device / GPU info
+  try {
+    const llm = getDefaultLlamaCpp();
+    const device = await llm.getDeviceInfo();
+    console.log(`\n${c.bold}Device${c.reset}`);
+    if (device.gpu) {
+      console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
+      if (device.gpuDevices.length > 0) {
+        // Deduplicate and count GPUs
+        const counts = new Map<string, number>();
+        for (const name of device.gpuDevices) {
+          counts.set(name, (counts.get(name) || 0) + 1);
+        }
+        const deviceStr = Array.from(counts.entries())
+          .map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
+          .join(', ');
+        console.log(`  Devices:  ${deviceStr}`);
+      }
+      if (device.vram) {
+        console.log(`  VRAM:     ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
+      }
+    } else {
+      console.log(`  GPU:      ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
+      console.log(`  ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
+    }
+    console.log(`  CPU:      ${device.cpuCores} math cores`);
+  } catch {
+    // Don't fail status if LLM init fails
+  }
+
   closeDb();
 }
 
@@ -2347,7 +2377,7 @@ if (import.meta.main) {
     }
 
     case "status":
-      showStatus();
+      await showStatus();
       break;
 
     case "update":