Browse Source

Merge remote-tracking branch 'origin/main' into fix/handelize-preserve-case

# Conflicts:
#	CHANGELOG.md
Kim Junmo 1 month ago
parent
commit
bb5becaf81
5 changed files with 97 additions and 16 deletions
  1. 3 0
      CHANGELOG.md
  2. 10 5
      src/cli/qmd.ts
  3. 22 11
      src/llm.ts
  4. 3 0
      src/mcp/server.ts
  5. 59 0
      test/llm.test.ts

+ 3 - 0
CHANGELOG.md

@@ -2,6 +2,9 @@
 
 ## [Unreleased]
 
+### Fixes
+
+- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
 - Fix: preserve original filename case in `handelize()`. The previous
   `.toLowerCase()` call made indexed paths unreachable on case-sensitive
   filesystems (Linux). `qmd update` automatically migrates legacy

+ 10 - 5
src/cli/qmd.ts

@@ -462,10 +462,10 @@ async function showStatus(): Promise<void> {
   }
 
   // Device / GPU info
+  console.log(`\n${c.bold}Device${c.reset}`);
   try {
     const llm = getDefaultLlamaCpp();
-    const device = await llm.getDeviceInfo();
-    console.log(`\n${c.bold}Device${c.reset}`);
+    const device = await llm.getDeviceInfo({ allowBuild: false });
     if (device.gpu) {
       console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
       if (device.gpuDevices.length > 0) {
@@ -487,8 +487,11 @@ async function showStatus(): Promise<void> {
       console.log(`  ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
     }
     console.log(`  CPU:      ${device.cpuCores} math cores`);
-  } catch {
-    // Don't fail status if LLM init fails
+  } catch (error) {
+    console.log(`  Status:   ${c.dim}skipped${c.reset} (status probe does not build llama.cpp backends)`);
+    if (error instanceof Error && error.message) {
+      console.log(`  ${c.dim}${error.message}${c.reset}`);
+    }
   }
 
   // Tips section
@@ -1933,7 +1936,8 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
     const output = filtered.map(row => {
       const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
       let body = opts.full ? row.body : undefined;
-      let snippet = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos, undefined, opts.intent).snippet : undefined;
+      const snippetInfo = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos, undefined, opts.intent) : undefined;
+      let snippet = snippetInfo?.snippet;
       if (opts.lineNumbers) {
         if (body) body = addLineNumbers(body);
         if (snippet) snippet = addLineNumbers(snippet);
@@ -1942,6 +1946,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
         ...(docid && { docid: `#${docid}` }),
         score: Math.round(row.score * 100) / 100,
         file: toQmdPath(row.displayPath),
+        ...(snippetInfo && { line: snippetInfo.line }),
         title: row.title,
         ...(row.context && { context: row.context }),
         ...(body && { body }),

+ 22 - 11
src/llm.ts

@@ -385,6 +385,18 @@ export type LlamaCppConfig = {
 const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
 const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
 
+type LlamaGpuMode = "auto" | "metal" | "vulkan" | "cuda" | false; // automatic selection, one named backend, or false for the CPU path
+// Translates a QMD_LLAMA_GPU value (default: process.env.QMD_LLAMA_GPU) into a LlamaGpuMode; matching ignores case and surrounding whitespace.
+export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): LlamaGpuMode {
+  const normalized = envValue?.trim().toLowerCase() ?? ""; // an undefined value collapses to the empty string
+  if (!normalized) return "auto"; // unset or blank: keep automatic selection
+  if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized)) return false; // accepted spellings for turning the GPU off
+  if (normalized === "metal" || normalized === "vulkan" || normalized === "cuda") return normalized; // explicit backend, returned lowercased
+  // Any other value is unsupported: warn on stderr (echoing the raw input) and keep automatic selection.
+  process.stderr.write(`QMD Warning: invalid QMD_LLAMA_GPU="${envValue}", using auto GPU selection.\n`);
+  return "auto";
+}
+
 function resolveExpandContextSize(configValue?: number): number {
   if (configValue !== undefined) {
     if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -550,30 +562,29 @@ export class LlamaCpp implements LLM {
   /**
    * Initialize the llama instance (lazy)
    */
-  private async ensureLlama(): Promise<Llama> {
+  private async ensureLlama(allowBuild = true): Promise<Llama> {
     if (!this.llama) {
-      // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
-      const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
-      const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
+      const gpuMode = resolveLlamaGpuMode();
 
-      const loadLlama = async (gpu: "auto" | false) =>
+      const loadLlama = async (gpu: LlamaGpuMode) =>
         await getLlama({
-          build: "autoAttempt",
+          build: allowBuild ? "autoAttempt" : "never",
           logLevel: LlamaLogLevel.error,
           gpu,
+          skipDownload: !allowBuild,
         });
 
       let llama: Llama;
-      if (forceCpu) {
+      if (gpuMode === false) {
         llama = await loadLlama(false);
       } else {
         try {
-          llama = await loadLlama("auto");
+          llama = await loadLlama(gpuMode);
         } catch (err) {
           // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
           // Fall back to CPU so qmd still works.
           process.stderr.write(
-            `QMD Warning: GPU init failed (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
+            `QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
           );
           llama = await loadLlama(false);
         }
@@ -1244,14 +1255,14 @@ export class LlamaCpp implements LLM {
    * Get device/GPU info for status display.
    * Initializes llama if not already done.
    */
-  async getDeviceInfo(): Promise<{
+  async getDeviceInfo(options: { allowBuild?: boolean } = {}): Promise<{
     gpu: string | false;
     gpuOffloading: boolean;
     gpuDevices: string[];
     vram?: { total: number; used: number; free: number };
     cpuCores: number;
   }> {
-    const llama = await this.ensureLlama();
+    const llama = await this.ensureLlama(options.allowBuild ?? true);
     const gpuDevices = await llama.getGpuDeviceNames();
     let vram: { total: number; used: number; free: number } | undefined;
     if (llama.gpu) {

+ 3 - 0
src/mcp/server.ts

@@ -30,6 +30,9 @@ import {
   type IndexStatus,
 } from "../index.js";
 import { getConfigPath } from "../collections.js";
+import { enableProductionMode } from "../store.js";
+
+enableProductionMode();
 
 // =============================================================================
 // Types for structured content

+ 59 - 0
test/llm.test.ts

@@ -12,6 +12,7 @@ import {
   LlamaCpp,
   getDefaultLlamaCpp,
   disposeDefaultLlamaCpp,
+  resolveLlamaGpuMode,
   withLLMSession,
   canUnloadLLM,
   SessionReleasedError,
@@ -55,6 +56,38 @@ describe("LlamaCpp.modelExists", () => {
   });
 });
 
+describe("QMD_LLAMA_GPU resolution", () => {
+  test("uses auto when unset or blank", () => {
+    expect(resolveLlamaGpuMode("")).toBe("auto"); // empty string is what an unset value normalizes to; an explicit undefined would trigger the default parameter and read the ambient process.env.QMD_LLAMA_GPU
+    expect(resolveLlamaGpuMode("   ")).toBe("auto");
+  });
+
+  test("maps CPU disable values to false", () => {
+    expect(resolveLlamaGpuMode("false")).toBe(false);
+    expect(resolveLlamaGpuMode("OFF")).toBe(false); // case-insensitive
+    expect(resolveLlamaGpuMode(" none ")).toBe(false); // surrounding whitespace is trimmed
+    expect(resolveLlamaGpuMode("disabled")).toBe(false);
+    expect(resolveLlamaGpuMode("0")).toBe(false);
+  });
+
+  test("passes through supported GPU backends", () => {
+    expect(resolveLlamaGpuMode("metal")).toBe("metal");
+    expect(resolveLlamaGpuMode("VULKAN")).toBe("vulkan"); // result is the lowercased backend name
+    expect(resolveLlamaGpuMode(" cuda ")).toBe("cuda");
+  });
+
+  test("warns and falls back to auto for unsupported values", () => {
+    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true); // silence and capture the warning
+    try {
+      expect(resolveLlamaGpuMode("rocm")).toBe("auto");
+      expect(stderrSpy).toHaveBeenCalled();
+      expect(String(stderrSpy.mock.calls[0]?.[0] ?? "")).toContain("QMD_LLAMA_GPU");
+    } finally {
+      stderrSpy.mockRestore(); // always restore the real stderr.write, even when an expectation fails
+    }
+  });
+});
+
 describe("LlamaCpp expand context size config", () => {
   const defaultExpandContextSize = 2048;
 
@@ -193,6 +226,32 @@ describe("LlamaCpp rerank deduping", () => {
   });
 });
 
+describe("LlamaCpp.getDeviceInfo", () => {
+  test("can skip build attempts for status probes", async () => {
+    const llm = new LlamaCpp({}) as any; // any-cast so the private ensureLlama can be swapped for a mock
+    const fakeLlama = { // stub handed back by the mocked ensureLlama, so no real llama backend is loaded
+      gpu: "metal",
+      supportsGpuOffloading: true,
+      cpuMathCores: 8,
+      getGpuDeviceNames: vi.fn().mockResolvedValue(["Apple GPU"]),
+      getVramState: vi.fn().mockResolvedValue({ total: 1024, used: 256, free: 768 }),
+    };
+
+    llm.ensureLlama = vi.fn().mockResolvedValue(fakeLlama);
+
+    const device = await llm.getDeviceInfo({ allowBuild: false });
+
+    expect(llm.ensureLlama).toHaveBeenCalledWith(false); // allowBuild: false must arrive as ensureLlama's argument
+    expect(device).toEqual({ // the report is expected to mirror the stub's values
+      gpu: "metal",
+      gpuOffloading: true,
+      gpuDevices: ["Apple GPU"],
+      vram: { total: 1024, used: 256, free: 768 },
+      cpuCores: 8,
+    });
+  });
+});
+
 // =============================================================================
 // Integration Tests (require actual models)
 // =============================================================================