4 luni în urmă · ee86bba45e
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -491,7 +491,29 @@ export class LlamaCpp implements LLM {
 
				    */
			
 
				   private async ensureLlama(): Promise<Llama> {
				     // Fast path: an earlier call already loaded and cached a backend.
				     if (this.llama) {
				       return this.llama;
				     }

				     // Every getLlama() call below uses the same (error-only) log level.
				     const load = (gpu: "cuda" | "vulkan" | "metal" | false): Promise<Llama> =>
				       getLlama({ gpu, logLevel: LlamaLogLevel.error });

				     // Probe GPU backends in priority order (cuda, vulkan, metal); the first
				     // one that loads without throwing is kept.
				     let backend: Llama | null = null;
				     for (const kind of ["cuda", "vulkan", "metal"] as const) {
				       try {
				         backend = await load(kind);
				       } catch {
				         // Any failure is treated as "this GPU type is not available"; try the next.
				         continue;
				       }
				       break;
				     }

				     if (backend) {
				       // A GPU backend loaded, but it may still be unable to offload model layers.
				       if (!backend.supportsGpuOffloading) {
				         process.stderr.write(
				           "QMD Warning: GPU detected but offloading not supported, models will run on CPU.\n" +
				           "Run 'qmd status' for device info.\n"
				         );
				       }
				     } else {
				       // No GPU backend could be loaded: fall back to CPU and tell the user why it is slow.
				       backend = await load(false);
				       process.stderr.write(
				         "QMD Warning: no GPU acceleration available, running models on CPU (this will be slow).\n" +
				         "Run 'qmd status' for device info. Install CUDA/Vulkan/Metal support for better performance.\n"
				       );
				     }

				     // Cache the instance so later calls take the fast path above.
				     this.llama = backend;
				     return backend;
				   }
			
@@ -909,6 +931,35 @@ export class LlamaCpp implements LLM {
 
				     };
			
 
				   }
			
 
				 
			
 
				/**
				 * Collect device/GPU details for the status display.
				 * Goes through ensureLlama(), so llama is initialized here if it was not already.
				 *
				 * @returns The active GPU backend (`false` when there is none), whether GPU
				 *   offloading is supported, the GPU device names, VRAM figures when they could
				 *   be read, and the number of CPU math cores.
				 */
				async getDeviceInfo(): Promise<{
				  gpu: string | false;
				  gpuOffloading: boolean;
				  gpuDevices: string[];
				  vram?: { total: number; used: number; free: number };
				  cpuCores: number;
				}> {
				  const backend = await this.ensureLlama();
				  const gpuDevices = await backend.getGpuDeviceNames();

				  // VRAM is only queried when a GPU backend is active; if the query throws,
				  // the field is simply left undefined.
				  let vram: { total: number; used: number; free: number } | undefined;
				  if (backend.gpu) {
				    try {
				      const { total, used, free } = await backend.getVramState();
				      vram = { total, used, free };
				    } catch {
				      // no vram info
				    }
				  }

				  const {
				    gpu,
				    supportsGpuOffloading: gpuOffloading,
				    cpuMathCores: cpuCores,
				  } = backend;

				  return { gpu, gpuOffloading, gpuDevices, vram, cpuCores };
				}
			
 
				+
			
 
				   async dispose(): Promise<void> {
			
 
				     // Prevent double-dispose
			
 
				     if (this.disposed) {
			
--- a/src/qmd.ts
+++ b/src/qmd.ts
@@ -65,7 +65,7 @@ import {
 
				   createStore,
			
 
				   getDefaultDbPath,
			
 
				 } from "./store.js";
			
 
				-import { disposeDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
			
 
				+import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
			
 
				 import {
			
 
				   formatSearchResults,
			
 
				   formatDocuments,
			
@@ -249,7 +249,7 @@ function formatBytes(bytes: number): string {
 
				   return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
			
 
				 }
			
 
				 
			
 
				-function showStatus(): void {
			
 
				+async function showStatus(): Promise<void> {
			
 
				   const dbPath = getDbPath();
			
 
				   const db = getDb();
			
 
				 
			
@@ -362,6 +362,36 @@ function showStatus(): void {
 
				     console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
			
 
				   }
			
 
				 
			
 
				+  // Device / GPU info
			
 
				+  try {
			
 
				+    const llm = getDefaultLlamaCpp();
			
 
				+    const device = await llm.getDeviceInfo();
			
 
				+    console.log(`\n${c.bold}Device${c.reset}`);
			
 
				+    if (device.gpu) {
			
 
				+      console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
			
 
				+      if (device.gpuDevices.length > 0) {
			
 
				+        // Deduplicate and count GPUs
			
 
				+        const counts = new Map<string, number>();
			
 
				+        for (const name of device.gpuDevices) {
			
 
				+          counts.set(name, (counts.get(name) || 0) + 1);
			
 
				+        }
			
 
				+        const deviceStr = Array.from(counts.entries())
			
 
				+          .map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
			
 
				+          .join(', ');
			
 
				+        console.log(`  Devices:  ${deviceStr}`);
			
 
				+      }
			
 
				+      if (device.vram) {
			
 
				+        console.log(`  VRAM:     ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
			
 
				+      }
			
 
				+    } else {
			
 
				+      console.log(`  GPU:      ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
			
 
				+      console.log(`  ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
			
 
				+    }
			
 
				+    console.log(`  CPU:      ${device.cpuCores} math cores`);
			
 
				+  } catch {
			
 
				+    // Don't fail status if LLM init fails
			
 
				+  }
			
 
				+
			
 
				   closeDb();
			
 
				 }
			
 
				 
			
@@ -2347,7 +2377,7 @@ if (import.meta.main) {
 
				     }
			
 
				 
			
 
				     case "status":
			
 
				-      showStatus();
			
 
				+      await showStatus();
			
 
				       break;
			
 
				 
			
 
				     case "update":