vor 3 Monaten · 26e3d0c077
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -461,10 +461,10 @@ async function showStatus(): Promise<void> {
 
				   }
			
 
				 
			
 
				   // Device / GPU info
			
 
				+  console.log(`\n${c.bold}Device${c.reset}`);
			
 
				   try {
			
 
				     const llm = getDefaultLlamaCpp();
			
 
				-    const device = await llm.getDeviceInfo();
			
 
				-    console.log(`\n${c.bold}Device${c.reset}`);
			
 
				+    const device = await llm.getDeviceInfo({ allowBuild: false });
			
 
				     if (device.gpu) {
			
 
				       console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
			
 
				       if (device.gpuDevices.length > 0) {
			
@@ -486,8 +486,11 @@ async function showStatus(): Promise<void> {
 
				       console.log(`  ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
			
 
				     }
			
 
				     console.log(`  CPU:      ${device.cpuCores} math cores`);
			
 
				-  } catch {
			
 
				-    // Don't fail status if LLM init fails
			
 
				+  } catch (error) {
			
 
				+    console.log(`  Status:   ${c.dim}skipped${c.reset} (status probe does not build llama.cpp backends)`);
			
 
				+    if (error instanceof Error && error.message) {
			
 
				+      console.log(`  ${c.dim}${error.message}${c.reset}`);
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				   // Tips section
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -550,7 +550,7 @@ export class LlamaCpp implements LLM {
 
				   /**
			
 
				    * Initialize the llama instance (lazy)
			
 
				    */
			
 
				-  private async ensureLlama(): Promise<Llama> {
			
 
				+  private async ensureLlama(allowBuild = true): Promise<Llama> {
			
 
				     if (!this.llama) {
			
 
				       // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
			
 
				       const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
			
@@ -558,9 +558,10 @@ export class LlamaCpp implements LLM {
 
				 
			
 
				       const loadLlama = async (gpu: "auto" | false) =>
			
 
				         await getLlama({
			
 
				-          build: "autoAttempt",
			
 
				+          build: allowBuild ? "autoAttempt" : "never",
			
 
				           logLevel: LlamaLogLevel.error,
			
 
				           gpu,
			
 
				+          skipDownload: !allowBuild,
			
 
				         });
			
 
				 
			
 
				       let llama: Llama;
			
@@ -1244,14 +1245,14 @@ export class LlamaCpp implements LLM {
 
				    * Get device/GPU info for status display.
			
 
				    * Initializes llama if not already done.
			
 
				    */
			
 
				-  async getDeviceInfo(): Promise<{
			
 
				+  async getDeviceInfo(options: { allowBuild?: boolean } = {}): Promise<{
			
 
				     gpu: string | false;
			
 
				     gpuOffloading: boolean;
			
 
				     gpuDevices: string[];
			
 
				     vram?: { total: number; used: number; free: number };
			
 
				     cpuCores: number;
			
 
				   }> {
			
 
				-    const llama = await this.ensureLlama();
			
 
				+    const llama = await this.ensureLlama(options.allowBuild ?? true);
			
 
				     const gpuDevices = await llama.getGpuDeviceNames();
			
 
				     let vram: { total: number; used: number; free: number } | undefined;
			
 
				     if (llama.gpu) {
			
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -193,6 +193,32 @@ describe("LlamaCpp rerank deduping", () => {
 
				   });
			
 
				 });
			
 
				 
			
 
				+describe("LlamaCpp.getDeviceInfo", () => {
			
 
				+  test("can skip build attempts for status probes", async () => {
			
 
				+    const llm = new LlamaCpp({}) as any;
			
 
				+    const fakeLlama = {
			
 
				+      gpu: "metal",
			
 
				+      supportsGpuOffloading: true,
			
 
				+      cpuMathCores: 8,
			
 
				+      getGpuDeviceNames: vi.fn().mockResolvedValue(["Apple GPU"]),
			
 
				+      getVramState: vi.fn().mockResolvedValue({ total: 1024, used: 256, free: 768 }),
			
 
				+    };
			
 
				+
			
 
				+    llm.ensureLlama = vi.fn().mockResolvedValue(fakeLlama);
			
 
				+
			
 
				+    const device = await llm.getDeviceInfo({ allowBuild: false });
			
 
				+
			
 
				+    expect(llm.ensureLlama).toHaveBeenCalledWith(false);
			
 
				+    expect(device).toEqual({
			
 
				+      gpu: "metal",
			
 
				+      gpuOffloading: true,
			
 
				+      gpuDevices: ["Apple GPU"],
			
 
				+      vram: { total: 1024, used: 256, free: 768 },
			
 
				+      cpuCores: 8,
			
 
				+    });
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				 // =============================================================================
			
 
				 // Integration Tests (require actual models)
			
 
				 // =============================================================================