2 tháng trước cách đây · 668c4d06e0
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,21 @@
 
															 ## [Unreleased]
														
 
															+### Changes
														
 
															+
														
 
															+- `QMD_DISABLE_LOCAL_LLM=1` env var hard-disables the local node-llama-cpp
														
 
															+  path: any `LlamaCpp.ensureLlama()` call throws with an actionable error
														
 
															+  pointing at `EmbeddingProvider`. Use for remote-only deployments where a
														
 
															+  Vulkan probe / cmake build attempt indicates an unintended fallback
														
 
															+  (e.g. headless cron host without `libvulkan-dev`/`glslc`).
														
 
															+- `QMD_EMBED_ENDPOINT` set without an explicit `QMD_LLAMA_GPU` now defaults
														
 
															+  the local llama instance to **CPU only** instead of probing GPU. The
														
 
															+  embed path runs over HTTP — only rerank/expand still use the local LLM,
														
 
															+  and the prebuilt CPU binary is sufficient. This silences ~30s/run of
														
 
															+  Vulkan probe + cmake-js-llama compile noise on hosts without the Vulkan
														
 
															+  SDK installed. Override with `QMD_LLAMA_GPU=auto` to opt back into GPU
														
 
															+  probing for hybrid local-rerank + remote-embed setups. (i-c28wngnd)
														
 
															+
														
 
															 ## [2.1.0] - 2026-04-05
														
 
															 Code files now chunk at function and class boundaries via tree-sitter,
														
--- a/dist/llm.d.ts
+++ b/dist/llm.d.ts
@@ -4,6 +4,23 @@
 
															  * Provides embeddings, text generation, and reranking using local GGUF models.
														
 
															  */
														
 
															 import { type Token as LlamaToken } from "node-llama-cpp";
														
 
															+/**
														
 
															+ * `QMD_DISABLE_LOCAL_LLM=1` opt-out: when set, `LlamaCpp.ensureLlama()`
														
 
															+ * throws on first invocation. Use for remote-only deployments where any
														
 
															+ * `getLlama()` call indicates an unintended fallback (e.g. cron host
														
 
															+ * without libvulkan-dev/glslc — issue i-c28wngnd).
														
 
															+ */
														
 
															+export declare function isLocalLlmDisabled(env?: NodeJS.ProcessEnv): boolean;
														
 
															+/**
														
 
															+ * Resolve the GPU mode for `getLlama()`:
														
 
															+ *   1. Explicit `QMD_LLAMA_GPU=off|none|0|...`     → "cpu"
														
 
															+ *   2. Any other non-empty `QMD_LLAMA_GPU` (`auto`, `on`, unrecognized values) → "auto"
														
 
															+ *   3. Auto-detect: `QMD_EMBED_ENDPOINT` set        → "cpu"
														
 
															+ *      (remote embed provider — embed never touches local LLM. Rerank/expand
														
 
															+ *       still use prebuilt CPU binary; no Vulkan probe / cmake build.)
														
 
															+ *   4. Otherwise (legacy local-only setup)          → "auto"
														
 
															+ */
														
 
															+export declare function resolveLlamaGpuMode(env?: NodeJS.ProcessEnv): "cpu" | "auto";
														
 
															 /**
														
 
															  * Detect if a model URI uses the Qwen3-Embedding format.
														
 
															  * Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma.
														
@@ -255,6 +272,23 @@ export declare class LlamaCpp implements LLM {
 
															     private ensureModelCacheDir;
														
 
															     /**
														
 
															      * Initialize the llama instance (lazy)
														
 
															+     *
														
 
															+     * Env-var controls (i-c28wngnd):
														
 
															+     *   - QMD_DISABLE_LOCAL_LLM=1    : hard-disable; throws on first ensureLlama()
														
 
															+     *                                  call. Use when the deployment must NEVER
														
 
															+     *                                  load node-llama-cpp (e.g. headless cron
														
 
															+     *                                  on a host without libvulkan-dev/glslc).
														
 
															+     *   - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
														
 
															+     *   - QMD_LLAMA_GPU=auto         : explicit opt-in to GPU probe even when
														
 
															+     *                                  QMD_EMBED_ENDPOINT is set (rare; useful
														
 
															+     *                                  for hybrid local-rerank + remote-embed).
														
 
															+     *
														
 
															+     * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
														
 
															+     * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
														
 
															+     * the embed path runs over HTTP and the only remaining local LLM consumers
														
 
															+     * are rerank/query-expansion, which work fine on the prebuilt CPU binary
														
 
															+     * and never need to invoke cmake-js-llama. This silences ~30s/run of
														
 
															+     * Vulkan probe + cmake noise on headless LXCs.
														
 
															      */
														
 
															     private ensureLlama;
														
 
															     /**
														
--- a/dist/llm.js
+++ b/dist/llm.js
@@ -8,6 +8,61 @@ import { homedir } from "os";
 
															 import { join } from "path";
														
 
															 import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
														
 
															 // =============================================================================
														
 
															+// Local-LLM env-var policy (i-c28wngnd)
														
 
															+// =============================================================================
														
 
															+/**
														
 
															+ * Truthy values for boolean-style env vars. Mirrors the convention used by
														
 
															+ * `QMD_LLAMA_GPU` (false-style) — kept narrow so unrelated values don't flip
														
 
															+ * the disable.
														
 
															+ */
														
 
															+const TRUTHY_ENV_VALUES = new Set(["1", "true", "yes", "on"]);
														
 
															+/**
														
 
															+ * Falsy / off-style values accepted by `QMD_LLAMA_GPU`.
														
 
															+ */
														
 
															+const QMD_LLAMA_GPU_OFF_VALUES = new Set([
														
 
															+    "false", "off", "none", "disable", "disabled", "0",
														
 
															+]);
														
 
															+/**
														
 
															+ * `QMD_DISABLE_LOCAL_LLM=1` opt-out: when set, `LlamaCpp.ensureLlama()`
														
 
															+ * throws on first invocation. Use for remote-only deployments where any
														
 
															+ * `getLlama()` call indicates an unintended fallback (e.g. cron host
														
 
															+ * without libvulkan-dev/glslc — issue i-c28wngnd).
														
 
															+ */
														
 
															+export function isLocalLlmDisabled(env = process.env) {
														
 
															+    const raw = env.QMD_DISABLE_LOCAL_LLM?.trim().toLowerCase();
														
 
															+    return raw !== undefined && TRUTHY_ENV_VALUES.has(raw);
														
 
															+}
														
 
															+/**
														
 
															+ * Resolve the GPU mode for `getLlama()`:
														
 
															+ *   1. Explicit `QMD_LLAMA_GPU=off|none|0|...`     → "cpu"
														
 
															+ *   2. Any other non-empty `QMD_LLAMA_GPU` (`auto`, `on`, unrecognized values) → "auto"
														
 
															+ *   3. Auto-detect: `QMD_EMBED_ENDPOINT` set        → "cpu"
														
 
															+ *      (remote embed provider — embed never touches local LLM. Rerank/expand
														
 
															+ *       still use prebuilt CPU binary; no Vulkan probe / cmake build.)
														
 
															+ *   4. Otherwise (legacy local-only setup)          → "auto"
														
 
															+ */
														
 
															+export function resolveLlamaGpuMode(env = process.env) {
														
 
															+    const explicit = env.QMD_LLAMA_GPU?.trim().toLowerCase();
														
 
															+    if (explicit !== undefined && explicit !== "") {
														
 
															+        if (QMD_LLAMA_GPU_OFF_VALUES.has(explicit))
														
 
															+            return "cpu";
														
 
															+        if (explicit === "auto" || explicit === "true" || explicit === "on") {
														
 
															+            return "auto";
														
 
															+        }
														
 
															+        // Unknown value — preserve legacy behavior (probe).
														
 
															+        return "auto";
														
 
															+    }
														
 
															+    // Auto-detect remote-only deployment. When QMD_EMBED_ENDPOINT is set the
														
 
															+    // embed path runs over HTTP (factory.ts resolveProviderKind), so any
														
 
															+    // local LLM access is for rerank/expand only — the prebuilt CPU binary
														
 
															+    // is sufficient and skipping the Vulkan probe avoids the ~30s cmake
														
 
															+    // attempt on hosts without libvulkan-dev/glslc.
														
 
															+    const remoteEmbed = env.QMD_EMBED_ENDPOINT?.trim();
														
 
															+    if (remoteEmbed && remoteEmbed !== "")
														
 
															+        return "cpu";
														
 
															+    return "auto";
														
 
															+}
														
 
															+// =============================================================================
														
 
															 // Embedding Formatting Functions
														
 
															 // =============================================================================
														
 
															 /**
														
@@ -291,19 +346,48 @@ export class LlamaCpp {
 
															     }
														
 
															     /**
														
 
															      * Initialize the llama instance (lazy)
														
 
															+     *
														
 
															+     * Env-var controls (i-c28wngnd):
														
 
															+     *   - QMD_DISABLE_LOCAL_LLM=1    : hard-disable; throws on first ensureLlama()
														
 
															+     *                                  call. Use when the deployment must NEVER
														
 
															+     *                                  load node-llama-cpp (e.g. headless cron
														
 
															+     *                                  on a host without libvulkan-dev/glslc).
														
 
															+     *   - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
														
 
															+     *   - QMD_LLAMA_GPU=auto         : explicit opt-in to GPU probe even when
														
 
															+     *                                  QMD_EMBED_ENDPOINT is set (rare; useful
														
 
															+     *                                  for hybrid local-rerank + remote-embed).
														
 
															+     *
														
 
															+     * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
														
 
															+     * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
														
 
															+     * the embed path runs over HTTP and the only remaining local LLM consumers
														
 
															+     * are rerank/query-expansion, which work fine on the prebuilt CPU binary
														
 
															+     * and never need to invoke cmake-js-llama. This silences ~30s/run of
														
 
															+     * Vulkan probe + cmake noise on headless LXCs.
														
 
															      */
														
 
															     async ensureLlama() {
														
 
															         if (!this.llama) {
														
 
															-            // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
														
 
															-            const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
														
 
															-            const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
														
 
															+            // Hard-disable opt-out — fails fast so the caller knows. Throw early
														
 
															+            // so any path that ignores the documented `EmbeddingProvider` route
														
 
															+            // and reaches for the local LLM gets a loud, actionable error rather
														
 
															+            // than a silent 30s Vulkan compile attempt.
														
 
															+            if (isLocalLlmDisabled(process.env)) {
														
 
															+                throw new Error("QMD_DISABLE_LOCAL_LLM=1 — local node-llama-cpp is disabled. " +
														
 
															+                    "This deployment is configured for remote embeddings only; the " +
														
 
															+                    "code path that reached `ensureLlama()` should route through an " +
														
 
															+                    "EmbeddingProvider (set QMD_EMBED_ENDPOINT) instead. Unset " +
														
 
															+                    "QMD_DISABLE_LOCAL_LLM to re-enable local rerank/expand.");
														
 
															+            }
														
 
															+            // Resolve GPU mode: explicit QMD_LLAMA_GPU wins, else auto-detect
														
 
															+            // remote-only deployment (CPU when QMD_EMBED_ENDPOINT is set), else
														
 
															+            // probe GPU normally for legacy local-only setups.
														
 
															+            const gpuMode = resolveLlamaGpuMode(process.env);
														
 
															             const loadLlama = async (gpu) => await getLlama({
														
 
															                 build: "autoAttempt",
														
 
															                 logLevel: LlamaLogLevel.error,
														
 
															                 gpu,
														
 
															             });
														
 
															             let llama;
														
 
															-            if (forceCpu) {
														
 
															+            if (gpuMode === "cpu") {
														
 
															                 llama = await loadLlama(false);
														
 
															             }
														
 
															             else {
														
@@ -317,7 +401,11 @@ export class LlamaCpp {
 
															                     llama = await loadLlama(false);
														
 
															                 }
														
 
															             }
														
 
															-            if (llama.gpu === false) {
														
 
															+            // Suppress the "running on CPU (slow)" warning when CPU was requested
														
 
															+            // explicitly or auto-selected for a remote-only deployment — there's
														
 
															+            // nothing the operator can do about it and the hint isn't relevant
														
 
															+            // (embed runs via HTTP; only rerank/expand use the local CPU path).
														
 
															+            if (llama.gpu === false && gpuMode === "auto") {
														
 
															                 process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
														
 
															             }
														
 
															             this.llama = llama;
														
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -18,6 +18,68 @@ import { homedir } from "os";
 
															 import { join } from "path";
														
 
															 import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
														
 
															+// =============================================================================
														
 
															+// Local-LLM env-var policy (i-c28wngnd)
														
 
															+// =============================================================================
														
 
															+
														
 
															+/**
														
 
															+ * Truthy values for boolean-style env vars. Mirrors the convention used by
														
 
															+ * `QMD_LLAMA_GPU` (false-style) — kept narrow so unrelated values don't flip
														
 
															+ * the disable.
														
 
															+ */
														
 
															+const TRUTHY_ENV_VALUES: ReadonlySet<string> = new Set(["1", "true", "yes", "on"]);
														
 
															+
														
 
															+/**
														
 
															+ * Falsy / off-style values accepted by `QMD_LLAMA_GPU`.
														
 
															+ */
														
 
															+const QMD_LLAMA_GPU_OFF_VALUES: ReadonlySet<string> = new Set([
														
 
															+  "false", "off", "none", "disable", "disabled", "0",
														
 
															+]);
														
 
															+
														
 
															+/**
														
 
															+ * `QMD_DISABLE_LOCAL_LLM=1` opt-out: when set, `LlamaCpp.ensureLlama()`
														
 
															+ * throws on first invocation. Use for remote-only deployments where any
														
 
															+ * `getLlama()` call indicates an unintended fallback (e.g. cron host
														
 
															+ * without libvulkan-dev/glslc — issue i-c28wngnd).
														
 
															+ */
														
 
															+export function isLocalLlmDisabled(env: NodeJS.ProcessEnv = process.env): boolean {
														
 
															+  const raw = env.QMD_DISABLE_LOCAL_LLM?.trim().toLowerCase();
														
 
															+  return raw !== undefined && TRUTHY_ENV_VALUES.has(raw);
														
 
															+}
														
 
															+
														
 
															+/**
														
 
															+ * Resolve the GPU mode for `getLlama()`:
														
 
															+ *   1. Explicit `QMD_LLAMA_GPU=off|none|0|...`     → "cpu"
														
 
															+ *   2. Any other non-empty `QMD_LLAMA_GPU` (`auto`, `on`, unrecognized values) → "auto"
														
 
															+ *   3. Auto-detect: `QMD_EMBED_ENDPOINT` set        → "cpu"
														
 
															+ *      (remote embed provider — embed never touches local LLM. Rerank/expand
														
 
															+ *       still use prebuilt CPU binary; no Vulkan probe / cmake build.)
														
 
															+ *   4. Otherwise (legacy local-only setup)          → "auto"
														
 
															+ */
														
 
															+export function resolveLlamaGpuMode(
														
 
															+  env: NodeJS.ProcessEnv = process.env,
														
 
															+): "cpu" | "auto" {
														
 
															+  const explicit = env.QMD_LLAMA_GPU?.trim().toLowerCase();
														
 
															+  if (explicit !== undefined && explicit !== "") {
														
 
															+    if (QMD_LLAMA_GPU_OFF_VALUES.has(explicit)) return "cpu";
														
 
															+    if (explicit === "auto" || explicit === "true" || explicit === "on") {
														
 
															+      return "auto";
														
 
															+    }
														
 
															+    // Unknown value — preserve legacy behavior (probe).
														
 
															+    return "auto";
														
 
															+  }
														
 
															+
														
 
															+  // Auto-detect remote-only deployment. When QMD_EMBED_ENDPOINT is set the
														
 
															+  // embed path runs over HTTP (factory.ts resolveProviderKind), so any
														
 
															+  // local LLM access is for rerank/expand only — the prebuilt CPU binary
														
 
															+  // is sufficient and skipping the Vulkan probe avoids the ~30s cmake
														
 
															+  // attempt on hosts without libvulkan-dev/glslc.
														
 
															+  const remoteEmbed = env.QMD_EMBED_ENDPOINT?.trim();
														
 
															+  if (remoteEmbed && remoteEmbed !== "") return "cpu";
														
 
															+
														
 
															+  return "auto";
														
 
															+}
														
 
															+
														
 
															 // =============================================================================
														
 
															 // Embedding Formatting Functions
														
 
															 // =============================================================================
														
@@ -549,12 +611,44 @@ export class LlamaCpp implements LLM {
 
															   /**
														
 
															    * Initialize the llama instance (lazy)
														
 
															+   *
														
 
															+   * Env-var controls (i-c28wngnd):
														
 
															+   *   - QMD_DISABLE_LOCAL_LLM=1    : hard-disable; throws on first ensureLlama()
														
 
															+   *                                  call. Use when the deployment must NEVER
														
 
															+   *                                  load node-llama-cpp (e.g. headless cron
														
 
															+   *                                  on a host without libvulkan-dev/glslc).
														
 
															+   *   - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
														
 
															+   *   - QMD_LLAMA_GPU=auto         : explicit opt-in to GPU probe even when
														
 
															+   *                                  QMD_EMBED_ENDPOINT is set (rare; useful
														
 
															+   *                                  for hybrid local-rerank + remote-embed).
														
 
															+   *
														
 
															+   * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
														
 
															+   * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
														
 
															+   * the embed path runs over HTTP and the only remaining local LLM consumers
														
 
															+   * are rerank/query-expansion, which work fine on the prebuilt CPU binary
														
 
															+   * and never need to invoke cmake-js-llama. This silences ~30s/run of
														
 
															+   * Vulkan probe + cmake noise on headless LXCs.
														
 
															    */
														
 
															   private async ensureLlama(): Promise<Llama> {
														
 
															     if (!this.llama) {
														
 
															-      // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
														
 
															-      const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
														
 
															-      const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
														
 
															+      // Hard-disable opt-out — fails fast so the caller knows. Throw early
														
 
															+      // so any path that ignores the documented `EmbeddingProvider` route
														
 
															+      // and reaches for the local LLM gets a loud, actionable error rather
														
 
															+      // than a silent 30s Vulkan compile attempt.
														
 
															+      if (isLocalLlmDisabled(process.env)) {
														
 
															+        throw new Error(
														
 
															+          "QMD_DISABLE_LOCAL_LLM=1 — local node-llama-cpp is disabled. " +
														
 
															+          "This deployment is configured for remote embeddings only; the " +
														
 
															+          "code path that reached `ensureLlama()` should route through an " +
														
 
															+          "EmbeddingProvider (set QMD_EMBED_ENDPOINT) instead. Unset " +
														
 
															+          "QMD_DISABLE_LOCAL_LLM to re-enable local rerank/expand."
														
 
															+        );
														
 
															+      }
														
 
															+
														
 
															+      // Resolve GPU mode: explicit QMD_LLAMA_GPU wins, else auto-detect
														
 
															+      // remote-only deployment (CPU when QMD_EMBED_ENDPOINT is set), else
														
 
															+      // probe GPU normally for legacy local-only setups.
														
 
															+      const gpuMode = resolveLlamaGpuMode(process.env);
														
 
															       const loadLlama = async (gpu: "auto" | false) =>
														
 
															         await getLlama({
														
@@ -564,7 +658,7 @@ export class LlamaCpp implements LLM {
 
															         });
														
 
															       let llama: Llama;
														
 
															-      if (forceCpu) {
														
 
															+      if (gpuMode === "cpu") {
														
 
															         llama = await loadLlama(false);
														
 
															       } else {
														
 
															         try {
														
@@ -579,7 +673,11 @@ export class LlamaCpp implements LLM {
 
															         }
														
 
															       }
														
 
															-      if (llama.gpu === false) {
														
 
															+      // Suppress the "running on CPU (slow)" warning when CPU was requested
														
 
															+      // explicitly or auto-selected for a remote-only deployment — there's
														
 
															+      // nothing the operator can do about it and the hint isn't relevant
														
 
															+      // (embed runs via HTTP; only rerank/expand use the local CPU path).
														
 
															+      if (llama.gpu === false && gpuMode === "auto") {
														
 
															         process.stderr.write(
														
 
															           "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
														
 
															         );
														
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -15,6 +15,8 @@ import {
 
															   withLLMSession,
														
 
															   canUnloadLLM,
														
 
															   SessionReleasedError,
														
 
															+  isLocalLlmDisabled,
														
 
															+  resolveLlamaGpuMode,
														
 
															   type RerankDocument,
														
 
															   type ILLMSession,
														
 
															 } from "../src/llm.js";
														
@@ -161,6 +163,105 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
 
															   });
														
 
															 });
														
 
															+// =============================================================================
														
 
															+// QMD_DISABLE_LOCAL_LLM + remote-only auto-CPU (i-c28wngnd)
														
 
															+// =============================================================================
														
 
															+
														
 
															+describe("isLocalLlmDisabled (QMD_DISABLE_LOCAL_LLM)", () => {
														
 
															+  test("returns false when env var is unset", () => {
														
 
															+    expect(isLocalLlmDisabled({})).toBe(false);
														
 
															+    expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: undefined })).toBe(false);
														
 
															+  });
														
 
															+
														
 
															+  test("returns false when env var is empty / whitespace", () => {
														
 
															+    expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: "" })).toBe(false);
														
 
															+    expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: "   " })).toBe(false);
														
 
															+  });
														
 
															+
														
 
															+  test("returns true for canonical truthy values", () => {
														
 
															+    for (const v of ["1", "true", "yes", "on", "TRUE", "Yes", " 1 "]) {
														
 
															+      expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: v })).toBe(true);
														
 
															+    }
														
 
															+  });
														
 
															+
														
 
															+  test("returns false for canonical falsy values", () => {
														
 
															+    for (const v of ["0", "false", "no", "off", "FALSE", "No"]) {
														
 
															+      expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: v })).toBe(false);
														
 
															+    }
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+describe("resolveLlamaGpuMode (QMD_LLAMA_GPU + QMD_EMBED_ENDPOINT)", () => {
														
 
															+  test("returns 'auto' for empty env (legacy local-only setup)", () => {
														
 
															+    expect(resolveLlamaGpuMode({})).toBe("auto");
														
 
															+  });
														
 
															+
														
 
															+  test("explicit QMD_LLAMA_GPU=off|none|0|disabled forces CPU", () => {
														
 
															+    for (const v of ["off", "none", "false", "0", "disabled", "disable", "OFF", "None"]) {
														
 
															+      expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: v })).toBe("cpu");
														
 
															+    }
														
 
															+  });
														
 
															+
														
 
															+  test("explicit QMD_LLAMA_GPU=auto|on|true preserves probe", () => {
														
 
															+    for (const v of ["auto", "on", "true", "Auto"]) {
														
 
															+      expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: v })).toBe("auto");
														
 
															+    }
														
 
															+  });
														
 
															+
														
 
															+  test("auto-detect: QMD_EMBED_ENDPOINT set → CPU (skip Vulkan probe)", () => {
														
 
															+    expect(
														
 
															+      resolveLlamaGpuMode({ QMD_EMBED_ENDPOINT: "http://models:8082" }),
														
 
															+    ).toBe("cpu");
														
 
															+  });
														
 
															+
														
 
															+  test("explicit QMD_LLAMA_GPU=auto OVERRIDES QMD_EMBED_ENDPOINT auto-CPU", () => {
														
 
															+    expect(
														
 
															+      resolveLlamaGpuMode({
														
 
															+        QMD_LLAMA_GPU: "auto",
														
 
															+        QMD_EMBED_ENDPOINT: "http://models:8082",
														
 
															+      }),
														
 
															+    ).toBe("auto");
														
 
															+  });
														
 
															+
														
 
															+  test("empty QMD_EMBED_ENDPOINT does not trigger auto-CPU", () => {
														
 
															+    expect(resolveLlamaGpuMode({ QMD_EMBED_ENDPOINT: "" })).toBe("auto");
														
 
															+    expect(resolveLlamaGpuMode({ QMD_EMBED_ENDPOINT: "   " })).toBe("auto");
														
 
															+  });
														
 
															+
														
 
															+  test("unknown QMD_LLAMA_GPU values fall back to 'auto' (preserve legacy probe)", () => {
														
 
															+    expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: "vulkan" })).toBe("auto");
														
 
															+    expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: "cuda" })).toBe("auto");
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+describe("LlamaCpp.ensureLlama() + QMD_DISABLE_LOCAL_LLM", () => {
														
 
															+  test("throws with actionable error when QMD_DISABLE_LOCAL_LLM=1", async () => {
														
 
															+    const prev = process.env.QMD_DISABLE_LOCAL_LLM;
														
 
															+    process.env.QMD_DISABLE_LOCAL_LLM = "1";
														
 
															+    try {
														
 
															+      const llm = new LlamaCpp({}) as any;
														
 
															+      await expect(llm.ensureLlama()).rejects.toThrow(/QMD_DISABLE_LOCAL_LLM/);
														
 
															+      await expect(llm.ensureLlama()).rejects.toThrow(/EmbeddingProvider/);
														
 
															+    } finally {
														
 
															+      if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_LLM;
														
 
															+      else process.env.QMD_DISABLE_LOCAL_LLM = prev;
														
 
															+    }
														
 
															+  });
														
 
															+
														
 
															+  test("does not throw when QMD_DISABLE_LOCAL_LLM is unset (smoke)", () => {
														
 
															+    // We don't want to actually call getLlama() (slow / loads native), but
														
 
															+    // we verify the guard does NOT fire for an empty/unset env. The full
														
 
															+    // integration path is exercised by the gated CI suite below.
														
 
															+    const prev = process.env.QMD_DISABLE_LOCAL_LLM;
														
 
															+    delete process.env.QMD_DISABLE_LOCAL_LLM;
														
 
															+    try {
														
 
															+      expect(isLocalLlmDisabled(process.env)).toBe(false);
														
 
															+    } finally {
														
 
															+      if (prev !== undefined) process.env.QMD_DISABLE_LOCAL_LLM = prev;
														
 
															+    }
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															 describe("LlamaCpp rerank deduping", () => {
														
 
															   test("deduplicates identical document texts before scoring", async () => {
														
 
															     const llm = new LlamaCpp({}) as any;