2 miesięcy temu · 668c4d06e0
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,21 @@
 
				 
			
 
				 ## [Unreleased]
			
 
				 
			
 
				+### Changes
			
 
				+
			
 
				+- `QMD_DISABLE_LOCAL_LLM=1` env var hard-disables the local node-llama-cpp
			
 
				+  path: any `LlamaCpp.ensureLlama()` call throws with an actionable error
			
 
				+  pointing at `EmbeddingProvider`. Use for remote-only deployments where a
			
 
				+  Vulkan probe / cmake build attempt indicates an unintended fallback
			
 
				+  (e.g. headless cron host without `libvulkan-dev`/`glslc`).
			
 
				+- `QMD_EMBED_ENDPOINT` set without an explicit `QMD_LLAMA_GPU` now defaults
			
 
				+  the local llama instance to **CPU only** instead of probing GPU. The
			
 
				+  embed path runs over HTTP — only rerank/expand still use the local LLM,
			
 
				+  and the prebuilt CPU binary is sufficient. This silences ~30s/run of
			
 
				+  Vulkan probe + cmake-js-llama compile noise on hosts without the Vulkan
			
 
				+  SDK installed. Override with `QMD_LLAMA_GPU=auto` to opt back into GPU
			
 
				+  probing for hybrid local-rerank + remote-embed setups. (i-c28wngnd)
			
 
				+
			
 
				 ## [2.1.0] - 2026-04-05
			
 
				 
			
 
				 Code files now chunk at function and class boundaries via tree-sitter,
			
--- a/dist/llm.d.ts
+++ b/dist/llm.d.ts
@@ -4,6 +4,23 @@
 
				  * Provides embeddings, text generation, and reranking using local GGUF models.
			
 
				  */
			
 
				 import { type Token as LlamaToken } from "node-llama-cpp";
			
 
				+/**
			
 
				+ * `QMD_DISABLE_LOCAL_LLM=1` opt-out: when set, `LlamaCpp.ensureLlama()`
			
 
				+ * throws instead of loading llama. Use for remote-only deployments where any
			
 
				+ * `getLlama()` call indicates an unintended fallback (e.g. cron host
			
 
				+ * without libvulkan-dev/glslc — issue i-c28wngnd).
			
 
				+ */
			
 
				+export declare function isLocalLlmDisabled(env?: NodeJS.ProcessEnv): boolean;
			
 
				+/**
			
 
				+ * Resolve the GPU mode for `getLlama()`:
			
 
				+ *   1. Explicit `QMD_LLAMA_GPU=off|none|0|...`     → "cpu"
			
 
				+ *   2. Any other non-empty `QMD_LLAMA_GPU` value   → "auto"
			
 
				+ *   3. Auto-detect: `QMD_EMBED_ENDPOINT` set        → "cpu"
			
 
				+ *      (remote embed provider — embed never touches local LLM. Rerank/expand
			
 
				+ *       still use prebuilt CPU binary; no Vulkan probe / cmake build.)
			
 
				+ *   4. Otherwise (legacy local-only setup)          → "auto"
			
 
				+ */
			
 
				+export declare function resolveLlamaGpuMode(env?: NodeJS.ProcessEnv): "cpu" | "auto";
			
 
				 /**
			
 
				  * Detect if a model URI uses the Qwen3-Embedding format.
			
 
				  * Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma.
			
@@ -255,6 +272,23 @@ export declare class LlamaCpp implements LLM {
 
				     private ensureModelCacheDir;
			
 
				     /**
			
 
				      * Initialize the llama instance (lazy)
			
 
				+     *
			
 
				+     * Env-var controls (i-c28wngnd):
			
 
				+     *   - QMD_DISABLE_LOCAL_LLM=1    : hard-disable; throws on first ensureLlama()
			
 
				+     *                                  call. Use when the deployment must NEVER
			
 
				+     *                                  load node-llama-cpp (e.g. headless cron
			
 
				+     *                                  on a host without libvulkan-dev/glslc).
			
 
				+     *   - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
			
 
				+     *   - QMD_LLAMA_GPU=auto         : explicit opt-in to GPU probe even when
			
 
				+     *                                  QMD_EMBED_ENDPOINT is set (rare; useful
			
 
				+     *                                  for hybrid local-rerank + remote-embed).
			
 
				+     *
			
 
				+     * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
			
 
				+     * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
			
 
				+     * the embed path runs over HTTP and the only remaining local LLM consumers
			
 
				+     * are rerank/query-expansion, which work fine on the prebuilt CPU binary
			
 
				+     * and never need to invoke cmake-js-llama. This silences ~30s/run of
			
 
				+     * Vulkan probe + cmake noise on headless LXCs.
			
 
				      */
			
 
				     private ensureLlama;
			
 
				     /**
			
--- a/dist/llm.js
+++ b/dist/llm.js
@@ -8,6 +8,61 @@ import { homedir } from "os";
 
				 import { join } from "path";
			
 
				 import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
			
 
				 // =============================================================================
			
 
				+// Local-LLM env-var policy (i-c28wngnd)
			
 
				+// =============================================================================
			
 
				+/**
			
 
				+ * Truthy values for boolean-style env vars. Mirrors the convention used by
			
 
				+ * `QMD_LLAMA_GPU` (false-style) — kept narrow so unrelated values don't flip
			
 
				+ * the disable.
			
 
				+ */
			
 
				+const TRUTHY_ENV_VALUES = new Set(["1", "true", "yes", "on"]);
			
 
				+/**
			
 
				+ * Falsy / off-style values accepted by `QMD_LLAMA_GPU`.
			
 
				+ */
			
 
				+const QMD_LLAMA_GPU_OFF_VALUES = new Set([
			
 
				+    "false", "off", "none", "disable", "disabled", "0",
			
 
				+]);
			
 
				+/**
			
 
				+ * `QMD_DISABLE_LOCAL_LLM=1` opt-out: when set, `LlamaCpp.ensureLlama()`
			
 
				+ * throws instead of loading llama. Use for remote-only deployments where any
			
 
				+ * `getLlama()` call indicates an unintended fallback (e.g. cron host
			
 
				+ * without libvulkan-dev/glslc — issue i-c28wngnd).
			
 
				+ */
			
 
				+export function isLocalLlmDisabled(env = process.env) {
			
 
				+    const raw = env.QMD_DISABLE_LOCAL_LLM?.trim().toLowerCase();
			
 
				+    return raw !== undefined && TRUTHY_ENV_VALUES.has(raw);
			
 
				+}
			
 
				+/**
			
 
				+ * Resolve the GPU mode for `getLlama()`:
			
 
				+ *   1. Explicit `QMD_LLAMA_GPU=off|none|0|...`     → "cpu"
			
 
				+ *   2. Any other non-empty `QMD_LLAMA_GPU` value   → "auto"
			
 
				+ *   3. Auto-detect: `QMD_EMBED_ENDPOINT` set        → "cpu"
			
 
				+ *      (remote embed provider — embed never touches local LLM. Rerank/expand
			
 
				+ *       still use prebuilt CPU binary; no Vulkan probe / cmake build.)
			
 
				+ *   4. Otherwise (legacy local-only setup)          → "auto"
			
 
				+ */
			
 
				+export function resolveLlamaGpuMode(env = process.env) {
			
 
				+    const explicit = env.QMD_LLAMA_GPU?.trim().toLowerCase();
			
 
				+    if (explicit !== undefined && explicit !== "") {
			
 
				+        if (QMD_LLAMA_GPU_OFF_VALUES.has(explicit))
			
 
				+            return "cpu";
			
 
				+        if (explicit === "auto" || explicit === "true" || explicit === "on") {
			
 
				+            return "auto";
			
 
				+        }
			
 
				+        // Unknown value — preserve legacy behavior (probe).
			
 
				+        return "auto";
			
 
				+    }
			
 
				+    // Auto-detect remote-only deployment. When QMD_EMBED_ENDPOINT is set the
			
 
				+    // embed path runs over HTTP (factory.ts resolveProviderKind), so any
			
 
				+    // local LLM access is for rerank/expand only — the prebuilt CPU binary
			
 
				+    // is sufficient and skipping the Vulkan probe avoids the ~30s cmake
			
 
				+    // attempt on hosts without libvulkan-dev/glslc.
			
 
				+    const remoteEmbed = env.QMD_EMBED_ENDPOINT?.trim();
			
 
				+    if (remoteEmbed && remoteEmbed !== "")
			
 
				+        return "cpu";
			
 
				+    return "auto";
			
 
				+}
			
 
				+// =============================================================================
			
 
				 // Embedding Formatting Functions
			
 
				 // =============================================================================
			
 
				 /**
			
@@ -291,19 +346,48 @@ export class LlamaCpp {
 
				     }
			
 
				     /**
			
 
				      * Initialize the llama instance (lazy)
			
 
				+     *
			
 
				+     * Env-var controls (i-c28wngnd):
			
 
				+     *   - QMD_DISABLE_LOCAL_LLM=1    : hard-disable; throws on first ensureLlama()
			
 
				+     *                                  call. Use when the deployment must NEVER
			
 
				+     *                                  load node-llama-cpp (e.g. headless cron
			
 
				+     *                                  on a host without libvulkan-dev/glslc).
			
 
				+     *   - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
			
 
				+     *   - QMD_LLAMA_GPU=auto         : explicit opt-in to GPU probe even when
			
 
				+     *                                  QMD_EMBED_ENDPOINT is set (rare; useful
			
 
				+     *                                  for hybrid local-rerank + remote-embed).
			
 
				+     *
			
 
				+     * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
			
 
				+     * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
			
 
				+     * the embed path runs over HTTP and the only remaining local LLM consumers
			
 
				+     * are rerank/query-expansion, which work fine on the prebuilt CPU binary
			
 
				+     * and never need to invoke cmake-js-llama. This silences ~30s/run of
			
 
				+     * Vulkan probe + cmake noise on headless LXCs.
			
 
				      */
			
 
				     async ensureLlama() {
			
 
				         if (!this.llama) {
			
 
				-            // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
			
 
				-            const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
			
 
				-            const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
			
 
				+            // Hard-disable opt-out — fails fast so the caller knows. Throw early
			
 
				+            // so any path that ignores the documented `EmbeddingProvider` route
			
 
				+            // and reaches for the local LLM gets a loud, actionable error rather
			
 
				+            // than a silent 30s Vulkan compile attempt.
			
 
				+            if (isLocalLlmDisabled(process.env)) {
			
 
				+                throw new Error("QMD_DISABLE_LOCAL_LLM=1 — local node-llama-cpp is disabled. " +
			
 
				+                    "This deployment is configured for remote embeddings only; the " +
			
 
				+                    "code path that reached `ensureLlama()` should route through an " +
			
 
				+                    "EmbeddingProvider (set QMD_EMBED_ENDPOINT) instead. Unset " +
			
 
				+                    "QMD_DISABLE_LOCAL_LLM to re-enable local rerank/expand.");
			
 
				+            }
			
 
				+            // Resolve GPU mode: explicit QMD_LLAMA_GPU wins, else auto-detect
			
 
				+            // remote-only deployment (CPU when QMD_EMBED_ENDPOINT is set), else
			
 
				+            // probe GPU normally for legacy local-only setups.
			
 
				+            const gpuMode = resolveLlamaGpuMode(process.env);
			
 
				             const loadLlama = async (gpu) => await getLlama({
			
 
				                 build: "autoAttempt",
			
 
				                 logLevel: LlamaLogLevel.error,
			
 
				                 gpu,
			
 
				             });
			
 
				             let llama;
			
 
				-            if (forceCpu) {
			
 
				+            if (gpuMode === "cpu") {
			
 
				                 llama = await loadLlama(false);
			
 
				             }
			
 
				             else {
			
@@ -317,7 +401,11 @@ export class LlamaCpp {
 
				                     llama = await loadLlama(false);
			
 
				                 }
			
 
				             }
			
 
				-            if (llama.gpu === false) {
			
 
				+            // Suppress the "running on CPU (slow)" warning when CPU was requested
			
 
				+            // explicitly or auto-selected for a remote-only deployment — there's
			
 
				+            // nothing the operator can do about it and the hint isn't relevant
			
 
				+            // (embed runs via HTTP; only rerank/expand use the local CPU path).
			
 
				+            if (llama.gpu === false && gpuMode === "auto") {
			
 
				                 process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
			
 
				             }
			
 
				             this.llama = llama;
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -18,6 +18,68 @@ import { homedir } from "os";
 
				 import { join } from "path";
			
 
				 import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
			
 
				 
			
 
				+// =============================================================================
			
 
				+// Local-LLM env-var policy (i-c28wngnd)
			
 
				+// =============================================================================
			
 
				+
			
 
				+/**
			
 
				+ * Truthy values for boolean-style env vars. Mirrors the convention used by
			
 
				+ * `QMD_LLAMA_GPU` (false-style) — kept narrow so unrelated values don't flip
			
 
				+ * the disable.
			
 
				+ */
			
 
				+const TRUTHY_ENV_VALUES: ReadonlySet<string> = new Set(["1", "true", "yes", "on"]);
			
 
				+
			
 
				+/**
			
 
				+ * Falsy / off-style values accepted by `QMD_LLAMA_GPU`.
			
 
				+ */
			
 
				+const QMD_LLAMA_GPU_OFF_VALUES: ReadonlySet<string> = new Set([
			
 
				+  "false", "off", "none", "disable", "disabled", "0",
			
 
				+]);
			
 
				+
			
 
				+/**
			
 
				+ * `QMD_DISABLE_LOCAL_LLM=1` opt-out: when set, `LlamaCpp.ensureLlama()`
			
 
				+ * throws instead of loading llama. Use for remote-only deployments where any
			
 
				+ * `getLlama()` call indicates an unintended fallback (e.g. cron host
			
 
				+ * without libvulkan-dev/glslc — issue i-c28wngnd).
			
 
				+ */
			
 
				+export function isLocalLlmDisabled(env: NodeJS.ProcessEnv = process.env): boolean {
			
 
				+  const raw = env.QMD_DISABLE_LOCAL_LLM?.trim().toLowerCase();
			
 
				+  return raw !== undefined && TRUTHY_ENV_VALUES.has(raw);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Resolve the GPU mode for `getLlama()`:
			
 
				+ *   1. Explicit `QMD_LLAMA_GPU=off|none|0|...`     → "cpu"
			
 
				+ *   2. Any other non-empty `QMD_LLAMA_GPU` value   → "auto"
			
 
				+ *   3. Auto-detect: `QMD_EMBED_ENDPOINT` set        → "cpu"
			
 
				+ *      (remote embed provider — embed never touches local LLM. Rerank/expand
			
 
				+ *       still use prebuilt CPU binary; no Vulkan probe / cmake build.)
			
 
				+ *   4. Otherwise (legacy local-only setup)          → "auto"
			
 
				+ */
			
 
				+export function resolveLlamaGpuMode(
			
 
				+  env: NodeJS.ProcessEnv = process.env,
			
 
				+): "cpu" | "auto" {
			
 
				+  const explicit = env.QMD_LLAMA_GPU?.trim().toLowerCase();
			
 
				+  if (explicit !== undefined && explicit !== "") {
			
 
				+    if (QMD_LLAMA_GPU_OFF_VALUES.has(explicit)) return "cpu";
			
 
				+    if (explicit === "auto" || explicit === "true" || explicit === "on") {
			
 
				+      return "auto";
			
 
				+    }
			
 
				+    // Unknown value — preserve legacy behavior (probe).
			
 
				+    return "auto";
			
 
				+  }
			
 
				+
			
 
				+  // Auto-detect remote-only deployment. When QMD_EMBED_ENDPOINT is set the
			
 
				+  // embed path runs over HTTP (factory.ts resolveProviderKind), so any
			
 
				+  // local LLM access is for rerank/expand only — the prebuilt CPU binary
			
 
				+  // is sufficient and skipping the Vulkan probe avoids the ~30s cmake
			
 
				+  // attempt on hosts without libvulkan-dev/glslc.
			
 
				+  const remoteEmbed = env.QMD_EMBED_ENDPOINT?.trim();
			
 
				+  if (remoteEmbed && remoteEmbed !== "") return "cpu";
			
 
				+
			
 
				+  return "auto";
			
 
				+}
			
 
				+
			
 
				 // =============================================================================
			
 
				 // Embedding Formatting Functions
			
 
				 // =============================================================================
			
@@ -549,12 +611,44 @@ export class LlamaCpp implements LLM {
 
				 
			
 
				   /**
			
 
				    * Initialize the llama instance (lazy)
			
 
				+   *
			
 
				+   * Env-var controls (i-c28wngnd):
			
 
				+   *   - QMD_DISABLE_LOCAL_LLM=1    : hard-disable; throws on first ensureLlama()
			
 
				+   *                                  call. Use when the deployment must NEVER
			
 
				+   *                                  load node-llama-cpp (e.g. headless cron
			
 
				+   *                                  on a host without libvulkan-dev/glslc).
			
 
				+   *   - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
			
 
				+   *   - QMD_LLAMA_GPU=auto         : explicit opt-in to GPU probe even when
			
 
				+   *                                  QMD_EMBED_ENDPOINT is set (rare; useful
			
 
				+   *                                  for hybrid local-rerank + remote-embed).
			
 
				+   *
			
 
				+   * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
			
 
				+   * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
			
 
				+   * the embed path runs over HTTP and the only remaining local LLM consumers
			
 
				+   * are rerank/query-expansion, which work fine on the prebuilt CPU binary
			
 
				+   * and never need to invoke cmake-js-llama. This silences ~30s/run of
			
 
				+   * Vulkan probe + cmake noise on headless LXCs.
			
 
				    */
			
 
				   private async ensureLlama(): Promise<Llama> {
			
 
				     if (!this.llama) {
			
 
				-      // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
			
 
				-      const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
			
 
				-      const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
			
 
				+      // Hard-disable opt-out — fails fast so the caller knows. Throw early
			
 
				+      // so any path that ignores the documented `EmbeddingProvider` route
			
 
				+      // and reaches for the local LLM gets a loud, actionable error rather
			
 
				+      // than a silent 30s Vulkan compile attempt.
			
 
				+      if (isLocalLlmDisabled(process.env)) {
			
 
				+        throw new Error(
			
 
				+          "QMD_DISABLE_LOCAL_LLM=1 — local node-llama-cpp is disabled. " +
			
 
				+          "This deployment is configured for remote embeddings only; the " +
			
 
				+          "code path that reached `ensureLlama()` should route through an " +
			
 
				+          "EmbeddingProvider (set QMD_EMBED_ENDPOINT) instead. Unset " +
			
 
				+          "QMD_DISABLE_LOCAL_LLM to re-enable local rerank/expand."
			
 
				+        );
			
 
				+      }
			
 
				+
			
 
				+      // Resolve GPU mode: explicit QMD_LLAMA_GPU wins, else auto-detect
			
 
				+      // remote-only deployment (CPU when QMD_EMBED_ENDPOINT is set), else
			
 
				+      // probe GPU normally for legacy local-only setups.
			
 
				+      const gpuMode = resolveLlamaGpuMode(process.env);
			
 
				 
			
 
				       const loadLlama = async (gpu: "auto" | false) =>
			
 
				         await getLlama({
			
@@ -564,7 +658,7 @@ export class LlamaCpp implements LLM {
 
				         });
			
 
				 
			
 
				       let llama: Llama;
			
 
				-      if (forceCpu) {
			
 
				+      if (gpuMode === "cpu") {
			
 
				         llama = await loadLlama(false);
			
 
				       } else {
			
 
				         try {
			
@@ -579,7 +673,11 @@ export class LlamaCpp implements LLM {
 
				         }
			
 
				       }
			
 
				 
			
 
				-      if (llama.gpu === false) {
			
 
				+      // Suppress the "running on CPU (slow)" warning when CPU was requested
			
 
				+      // explicitly or auto-selected for a remote-only deployment — there's
			
 
				+      // nothing the operator can do about it and the hint isn't relevant
			
 
				+      // (embed runs via HTTP; only rerank/expand use the local CPU path).
			
 
				+      if (llama.gpu === false && gpuMode === "auto") {
			
 
				         process.stderr.write(
			
 
				           "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
			
 
				         );
			
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -15,6 +15,8 @@ import {
 
				   withLLMSession,
			
 
				   canUnloadLLM,
			
 
				   SessionReleasedError,
			
 
				+  isLocalLlmDisabled,
			
 
				+  resolveLlamaGpuMode,
			
 
				   type RerankDocument,
			
 
				   type ILLMSession,
			
 
				 } from "../src/llm.js";
			
@@ -161,6 +163,105 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
 
				   });
			
 
				 });
			
 
				 
			
 
				+// =============================================================================
			
 
				+// QMD_DISABLE_LOCAL_LLM + remote-only auto-CPU (i-c28wngnd)
			
 
				+// =============================================================================
			
 
				+
			
 
				+describe("isLocalLlmDisabled (QMD_DISABLE_LOCAL_LLM)", () => {
			
 
				+  test("returns false when env var is unset", () => {
			
 
				+    expect(isLocalLlmDisabled({})).toBe(false);
			
 
				+    expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: undefined })).toBe(false);
			
 
				+  });
			
 
				+
			
 
				+  test("returns false when env var is empty / whitespace", () => {
			
 
				+    expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: "" })).toBe(false);
			
 
				+    expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: "   " })).toBe(false);
			
 
				+  });
			
 
				+
			
 
				+  test("returns true for canonical truthy values", () => {
			
 
				+    for (const v of ["1", "true", "yes", "on", "TRUE", "Yes", " 1 "]) {
			
 
				+      expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: v })).toBe(true);
			
 
				+    }
			
 
				+  });
			
 
				+
			
 
				+  test("returns false for canonical falsy values", () => {
			
 
				+    for (const v of ["0", "false", "no", "off", "FALSE", "No"]) {
			
 
				+      expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: v })).toBe(false);
			
 
				+    }
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+describe("resolveLlamaGpuMode (QMD_LLAMA_GPU + QMD_EMBED_ENDPOINT)", () => {
			
 
				+  test("returns 'auto' for empty env (legacy local-only setup)", () => {
			
 
				+    expect(resolveLlamaGpuMode({})).toBe("auto");
			
 
				+  });
			
 
				+
			
 
				+  test("explicit QMD_LLAMA_GPU=off|none|0|disabled forces CPU", () => {
			
 
				+    for (const v of ["off", "none", "false", "0", "disabled", "disable", "OFF", "None"]) {
			
 
				+      expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: v })).toBe("cpu");
			
 
				+    }
			
 
				+  });
			
 
				+
			
 
				+  test("explicit QMD_LLAMA_GPU=auto|on|true preserves probe", () => {
			
 
				+    for (const v of ["auto", "on", "true", "Auto"]) {
			
 
				+      expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: v })).toBe("auto");
			
 
				+    }
			
 
				+  });
			
 
				+
			
 
				+  test("auto-detect: QMD_EMBED_ENDPOINT set → CPU (skip Vulkan probe)", () => {
			
 
				+    expect(
			
 
				+      resolveLlamaGpuMode({ QMD_EMBED_ENDPOINT: "http://models:8082" }),
			
 
				+    ).toBe("cpu");
			
 
				+  });
			
 
				+
			
 
				+  test("explicit QMD_LLAMA_GPU=auto OVERRIDES QMD_EMBED_ENDPOINT auto-CPU", () => {
			
 
				+    expect(
			
 
				+      resolveLlamaGpuMode({
			
 
				+        QMD_LLAMA_GPU: "auto",
			
 
				+        QMD_EMBED_ENDPOINT: "http://models:8082",
			
 
				+      }),
			
 
				+    ).toBe("auto");
			
 
				+  });
			
 
				+
			
 
				+  test("empty QMD_EMBED_ENDPOINT does not trigger auto-CPU", () => {
			
 
				+    expect(resolveLlamaGpuMode({ QMD_EMBED_ENDPOINT: "" })).toBe("auto");
			
 
				+    expect(resolveLlamaGpuMode({ QMD_EMBED_ENDPOINT: "   " })).toBe("auto");
			
 
				+  });
			
 
				+
			
 
				+  test("unknown QMD_LLAMA_GPU values fall back to 'auto' (preserve legacy probe)", () => {
			
 
				+    expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: "vulkan" })).toBe("auto");
			
 
				+    expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: "cuda" })).toBe("auto");
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+describe("LlamaCpp.ensureLlama() + QMD_DISABLE_LOCAL_LLM", () => {
			
 
				+  test("throws with actionable error when QMD_DISABLE_LOCAL_LLM=1", async () => {
			
 
				+    const prev = process.env.QMD_DISABLE_LOCAL_LLM;
			
 
				+    process.env.QMD_DISABLE_LOCAL_LLM = "1";
			
 
				+    try {
			
 
				+      const llm = new LlamaCpp({}) as any;
			
 
				+      await expect(llm.ensureLlama()).rejects.toThrow(/QMD_DISABLE_LOCAL_LLM/);
			
 
				+      await expect(llm.ensureLlama()).rejects.toThrow(/EmbeddingProvider/);
			
 
				+    } finally {
			
 
				+      if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_LLM;
			
 
				+      else process.env.QMD_DISABLE_LOCAL_LLM = prev;
			
 
				+    }
			
 
				+  });
			
 
				+
			
 
				+  test("does not throw when QMD_DISABLE_LOCAL_LLM is unset (smoke)", () => {
			
 
				+    // We don't want to actually call getLlama() (slow / loads native), but
			
 
				+    // we verify the guard does NOT fire for an empty/unset env. The full
			
 
				+    // integration path is exercised by the gated CI suite below.
			
 
				+    const prev = process.env.QMD_DISABLE_LOCAL_LLM;
			
 
				+    delete process.env.QMD_DISABLE_LOCAL_LLM;
			
 
				+    try {
			
 
				+      expect(isLocalLlmDisabled(process.env)).toBe(false);
			
 
				+    } finally {
			
 
				+      if (prev !== undefined) process.env.QMD_DISABLE_LOCAL_LLM = prev;
			
 
				+    }
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				 describe("LlamaCpp rerank deduping", () => {
			
 
				   test("deduplicates identical document texts before scoring", async () => {
			
 
				     const llm = new LlamaCpp({}) as any;