|
@@ -18,6 +18,68 @@ import { homedir } from "os";
|
|
|
import { join } from "path";
|
|
import { join } from "path";
|
|
|
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
|
|
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
|
|
|
|
|
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+// Local-LLM env-var policy (i-c28wngnd)
|
|
|
|
|
+// =============================================================================
|
|
|
|
|
+
|
|
|
|
|
/**
 * Normalized (trimmed, lower-cased) values that count as "enabled" for
 * boolean-style env vars such as `QMD_DISABLE_LOCAL_LLM`. The list is kept
 * deliberately short so that unrelated or mistyped values never switch the
 * flag on.
 */
const TRUTHY_ENV_VALUES: ReadonlySet<string> = new Set<string>([
  "1",
  "true",
  "yes",
  "on",
]);

/**
 * Normalized values of `QMD_LLAMA_GPU` that turn GPU usage off. Any of them
 * makes `resolveLlamaGpuMode()` return "cpu".
 */
const QMD_LLAMA_GPU_OFF_VALUES: ReadonlySet<string> = new Set<string>([
  "false",
  "off",
  "none",
  "disable",
  "disabled",
  "0",
]);
|
|
|
|
|
+
|
|
|
|
|
/**
 * Reports whether the `QMD_DISABLE_LOCAL_LLM` opt-out is active.
 *
 * The variable is trimmed and lower-cased before being compared against
 * `TRUTHY_ENV_VALUES`; an unset variable, or any value outside that set,
 * leaves the local LLM enabled. When this returns `true`,
 * `LlamaCpp.ensureLlama()` throws instead of loading node-llama-cpp, which
 * is intended for remote-only deployments where reaching the local LLM
 * signals an unintended fallback (e.g. a cron host without
 * libvulkan-dev/glslc — issue i-c28wngnd).
 *
 * @param env - Environment to inspect; defaults to `process.env`.
 * @returns `true` when the opt-out is set to a truthy value.
 */
export function isLocalLlmDisabled(env: NodeJS.ProcessEnv = process.env): boolean {
  const flag = env.QMD_DISABLE_LOCAL_LLM;
  if (flag === undefined) {
    return false;
  }
  return TRUTHY_ENV_VALUES.has(flag.trim().toLowerCase());
}
|
|
|
|
|
+
|
|
|
|
|
/**
 * Decides which GPU mode `getLlama()` should be started with.
 *
 * Precedence:
 *   1. A non-blank `QMD_LLAMA_GPU` always wins. Off-style values
 *      (`QMD_LLAMA_GPU_OFF_VALUES`) select "cpu"; every other value —
 *      "auto", "true", "on", or anything unrecognised — selects "auto",
 *      which preserves the legacy probing behaviour.
 *   2. Otherwise, a non-blank `QMD_EMBED_ENDPOINT` selects "cpu": embeddings
 *      go over HTTP, so the local LLM only serves rerank/expand and the
 *      prebuilt CPU binary suffices (no Vulkan probe / cmake build).
 *   3. Otherwise (legacy local-only setup) the result is "auto".
 *
 * @param env - Environment to inspect; defaults to `process.env`.
 * @returns "cpu" to force CPU-only, or "auto" to let the GPU probe run.
 */
export function resolveLlamaGpuMode(
  env: NodeJS.ProcessEnv = process.env,
): "cpu" | "auto" {
  const override = (env.QMD_LLAMA_GPU ?? "").trim().toLowerCase();
  if (override.length > 0) {
    // An explicit value short-circuits auto-detection. Only the off-style
    // tokens force CPU; recognised opt-ins and unknown tokens alike keep the
    // GPU probe.
    return QMD_LLAMA_GPU_OFF_VALUES.has(override) ? "cpu" : "auto";
  }

  // Remote-only deployment: with QMD_EMBED_ENDPOINT set the embed path runs
  // over HTTP (factory.ts resolveProviderKind), so skipping the Vulkan probe
  // avoids the ~30s cmake attempt on hosts without libvulkan-dev/glslc.
  const embedEndpoint = (env.QMD_EMBED_ENDPOINT ?? "").trim();
  return embedEndpoint.length > 0 ? "cpu" : "auto";
}
|
|
|
|
|
+
|
|
|
// =============================================================================
|
|
// =============================================================================
|
|
|
// Embedding Formatting Functions
|
|
// Embedding Formatting Functions
|
|
|
// =============================================================================
|
|
// =============================================================================
|
|
@@ -549,12 +611,44 @@ export class LlamaCpp implements LLM {
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* Initialize the llama instance (lazy)
|
|
* Initialize the llama instance (lazy)
|
|
|
|
|
+ *
|
|
|
|
|
+ * Env-var controls (i-c28wngnd):
|
|
|
|
|
+ * - QMD_DISABLE_LOCAL_LLM=1 : hard-disable; throws on first ensureLlama()
|
|
|
|
|
+ * call. Use when the deployment must NEVER
|
|
|
|
|
+ * load node-llama-cpp (e.g. headless cron
|
|
|
|
|
+ * on a host without libvulkan-dev/glslc).
|
|
|
|
|
+ * - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
|
|
|
|
|
+ * - QMD_LLAMA_GPU=auto : explicit opt-in to GPU probe even when
|
|
|
|
|
+ * QMD_EMBED_ENDPOINT is set (rare; useful
|
|
|
|
|
+ * for hybrid local-rerank + remote-embed).
|
|
|
|
|
+ *
|
|
|
|
|
+ * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
|
|
|
|
|
+ * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
|
|
|
|
|
+ * the embed path runs over HTTP and the only remaining local LLM consumers
|
|
|
|
|
+ * are rerank/query-expansion, which work fine on the prebuilt CPU binary
|
|
|
|
|
+ * and never need to invoke cmake-js-llama. This silences ~30s/run of
|
|
|
|
|
+ * Vulkan probe + cmake noise on headless LXCs.
|
|
|
*/
|
|
*/
|
|
|
private async ensureLlama(): Promise<Llama> {
|
|
private async ensureLlama(): Promise<Llama> {
|
|
|
if (!this.llama) {
|
|
if (!this.llama) {
|
|
|
- // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
|
|
|
|
|
- const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
|
|
|
|
|
- const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
|
|
|
|
|
|
|
+ // Hard-disable opt-out — fails fast so the caller knows. Throw early
|
|
|
|
|
+ // so any path that ignores the documented `EmbeddingProvider` route
|
|
|
|
|
+ // and reaches for the local LLM gets a loud, actionable error rather
|
|
|
|
|
+ // than a silent 30s Vulkan compile attempt.
|
|
|
|
|
+ if (isLocalLlmDisabled(process.env)) {
|
|
|
|
|
+ throw new Error(
|
|
|
|
|
+ "QMD_DISABLE_LOCAL_LLM=1 — local node-llama-cpp is disabled. " +
|
|
|
|
|
+ "This deployment is configured for remote embeddings only; the " +
|
|
|
|
|
+ "code path that reached `ensureLlama()` should route through an " +
|
|
|
|
|
+ "EmbeddingProvider (set QMD_EMBED_ENDPOINT) instead. Unset " +
|
|
|
|
|
+ "QMD_DISABLE_LOCAL_LLM to re-enable local rerank/expand."
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // Resolve GPU mode: explicit QMD_LLAMA_GPU wins, else auto-detect
|
|
|
|
|
+ // remote-only deployment (CPU when QMD_EMBED_ENDPOINT is set), else
|
|
|
|
|
+ // probe GPU normally for legacy local-only setups.
|
|
|
|
|
+ const gpuMode = resolveLlamaGpuMode(process.env);
|
|
|
|
|
|
|
|
const loadLlama = async (gpu: "auto" | false) =>
|
|
const loadLlama = async (gpu: "auto" | false) =>
|
|
|
await getLlama({
|
|
await getLlama({
|
|
@@ -564,7 +658,7 @@ export class LlamaCpp implements LLM {
|
|
|
});
|
|
});
|
|
|
|
|
|
|
|
let llama: Llama;
|
|
let llama: Llama;
|
|
|
- if (forceCpu) {
|
|
|
|
|
|
|
+ if (gpuMode === "cpu") {
|
|
|
llama = await loadLlama(false);
|
|
llama = await loadLlama(false);
|
|
|
} else {
|
|
} else {
|
|
|
try {
|
|
try {
|
|
@@ -579,7 +673,11 @@ export class LlamaCpp implements LLM {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- if (llama.gpu === false) {
|
|
|
|
|
|
|
+ // Suppress the "running on CPU (slow)" warning when CPU was requested
|
|
|
|
|
+ // explicitly or auto-selected for a remote-only deployment — there's
|
|
|
|
|
+ // nothing the operator can do about it and the hint isn't relevant
|
|
|
|
|
+ // (embed runs via HTTP; only rerank/expand use the local CPU path).
|
|
|
|
|
+ if (llama.gpu === false && gpuMode === "auto") {
|
|
|
process.stderr.write(
|
|
process.stderr.write(
|
|
|
"QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
|
|
"QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
|
|
|
);
|
|
);
|