hace 2 meses · 66cbadc06c
--- a/dist/cli/qmd.js
+++ b/dist/cli/qmd.js
@@ -12,6 +12,7 @@ import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCp
 
				 import { formatSearchResults, formatDocuments, escapeXml, escapeCSV, } from "./formatter.js";
			
 
				 import { getCollection as getCollectionFromYaml, listCollections as yamlListCollections, getDefaultCollectionNames, addContext as yamlAddContext, removeContext as yamlRemoveContext, removeCollection as yamlRemoveCollectionFn, renameCollection as yamlRenameCollectionFn, setGlobalContext, listAllContexts, setConfigIndexName, loadConfig, } from "../collections.js";
			
 
				 import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedded-skills.js";
			
 
				+import { createEmbeddingProvider, resolveProviderKind, ModelMismatchError, } from "../embedding/index.js";
			
 
				 // Enable production mode - allows using default database path
			
 
				 // Tests must set INDEX_PATH or use createStore() with explicit path
			
 
				 enableProductionMode();
			
@@ -95,6 +96,7 @@ const c = {
 
				     green: useColor ? "\x1b[32m" : "",
			
 
				     magenta: useColor ? "\x1b[35m" : "",
			
 
				     blue: useColor ? "\x1b[34m" : "",
			
 
				+    red: useColor ? "\x1b[31m" : "",
			
 
				 };
			
 
				 // Terminal cursor control
			
 
				 const cursor = {
			
@@ -1419,6 +1421,59 @@ function parseChunkStrategy(value) {
 
				         return s;
			
 
				     throw new Error(`--chunk-strategy must be "auto", "regex", or "function" (got "${s}")`);
			
 
				 }
			
 
				/**
				 * Validate the value of the `--provider` CLI flag.
				 *
				 * @param {unknown} value - Raw flag value; `undefined` means the flag was not given.
				 * @returns {"local" | "openai" | undefined} The lowercased kind, or `undefined` when unset.
				 * @throws {Error} When the lowercased value is neither "local" nor "openai".
				 */
				function parseProviderKind(value) {
				    if (value === undefined) {
				        return undefined;
				    }
				    // Matching is case-insensitive; the error echoes the lowercased form.
				    const kind = String(value).toLowerCase();
				    switch (kind) {
				        case "local":
				        case "openai":
				            return kind;
				        default:
				            throw new Error(`--provider must be "local" or "openai" (got "${kind}")`);
				    }
				}
			
 
				/**
				 * Parse an optional CLI value that must be an integer >= 1.
				 *
				 * @param {string} name - Flag name used as the prefix of the error message.
				 * @param {unknown} value - Raw flag value; `undefined` means the flag was not given.
				 * @returns {number | undefined} The parsed integer, or `undefined` when unset.
				 * @throws {Error} When `Number(value)` is not an integer or is below 1.
				 */
				function parseOptionalPositiveInt(name, value) {
				    if (value === undefined) {
				        return undefined;
				    }
				    const candidate = Number(value);
				    const isPositiveInteger = Number.isInteger(candidate) && candidate >= 1;
				    if (isPositiveInteger) {
				        return candidate;
				    }
				    throw new Error(`${name} must be a positive integer`);
				}
			
 
				/**
				 * Convert parsed CLI values into the options object handed to the
				 * embedding-provider factory. Only settings the user actually supplied are
				 * included, so anything left out can be resolved by the factory itself.
				 *
				 * @param {Record<string, unknown>} values - Parsed CLI flag values (`cli.values`).
				 * @param {string | undefined} providerCliKind - Already-validated `--provider` value.
				 * @returns {{ kind?: string, openai?: object, autoFallback?: true }} Factory options.
				 * @throws {Error} When `--embed-batch-size` or `--embed-timeout-ms` is not a positive integer.
				 */
				function buildProviderOpts(values, providerCliKind) {
				    // Property order here fixes the key order of the resulting `openai` object.
				    const overrides = {
				        endpoint: optionalString(values["embed-endpoint"]),
				        apiKey: optionalString(values["embed-api-key"]),
				        modelId: optionalString(values["embed-model-id"]),
				        upstreamModel: optionalString(values["embed-upstream-model"]),
				        batchSize: parseOptionalPositiveInt("--embed-batch-size", values["embed-batch-size"]),
				        timeoutMs: parseOptionalPositiveInt("--embed-timeout-ms", values["embed-timeout-ms"]),
				    };
				    // optionalString never yields "", so "defined" is the same as "supplied".
				    const supplied = Object.entries(overrides).filter(([, setting]) => setting !== undefined);

				    const opts = {};
				    if (providerCliKind) {
				        opts.kind = providerCliKind;
				    }
				    if (supplied.length > 0) {
				        opts.openai = Object.fromEntries(supplied);
				    }
				    // The flag can only switch auto-fallback on; when absent the key is left out.
				    if (values["embed-auto-fallback"] === true) {
				        opts.autoFallback = true;
				    }
				    return opts;
				}
			
 
				/**
				 * Normalise an optional value to a non-empty string.
				 *
				 * @param {unknown} v - Value to normalise.
				 * @returns {string | undefined} `String(v)`, or `undefined` when `v` is
				 *   null/undefined or stringifies to the empty string.
				 */
				function optionalString(v) {
				    // `== null` matches both null and undefined.
				    if (v == null) {
				        return undefined;
				    }
				    const text = String(v);
				    return text.length > 0 ? text : undefined;
				}
			
 
				 async function vectorIndex(model = DEFAULT_EMBED_MODEL_URI, force = false, batchOptions) {
			
 
				     const storeInstance = getStore();
			
 
				     const db = storeInstance.db;
			
@@ -1433,6 +1488,14 @@ async function vectorIndex(model = DEFAULT_EMBED_MODEL_URI, force = false, batch
 
				         return;
			
 
				     }
			
 
				     console.log(`${c.dim}Model: ${model}${c.reset}\n`);
			
 
				+    if (batchOptions?.embedProvider) {
			
 
				+        const kind = batchOptions.embedProvider.kind;
			
 
				+        const providerModel = batchOptions.embedProvider.getModelId();
			
 
				+        console.log(`${c.dim}Provider: ${kind} (model id "${providerModel}")${c.reset}\n`);
			
 
				+    }
			
 
				+    else if (batchOptions?.providerKind) {
			
 
				+        console.log(`${c.dim}Provider: ${batchOptions.providerKind}${c.reset}\n`);
			
 
				+    }
			
 
				     if (batchOptions?.maxDocsPerBatch !== undefined || batchOptions?.maxBatchBytes !== undefined) {
			
 
				         const maxDocsPerBatch = batchOptions.maxDocsPerBatch ?? DEFAULT_EMBED_MAX_DOCS_PER_BATCH;
			
 
				         const maxBatchBytes = batchOptions.maxBatchBytes ?? DEFAULT_EMBED_MAX_BATCH_BYTES;
			
@@ -1447,6 +1510,7 @@ async function vectorIndex(model = DEFAULT_EMBED_MODEL_URI, force = false, batch
 
				         maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
			
 
				         maxBatchBytes: batchOptions?.maxBatchBytes,
			
 
				         chunkStrategy: batchOptions?.chunkStrategy,
			
 
				+        embedProvider: batchOptions?.embedProvider,
			
 
				         onProgress: (info) => {
			
 
				             if (info.totalBytes === 0)
			
 
				                 return;
			
@@ -2109,6 +2173,14 @@ function parseCLI() {
 
				             force: { type: "boolean", short: "f" },
			
 
				             "max-docs-per-batch": { type: "string" },
			
 
				             "max-batch-mb": { type: "string" },
			
 
				+            provider: { type: "string" }, // "local" | "openai"
			
 
				+            "embed-endpoint": { type: "string" }, // OpenAI-compatible endpoint URL
			
 
				+            "embed-api-key": { type: "string" }, // Bearer token
			
 
				+            "embed-model-id": { type: "string" }, // Stable model id (default: embeddinggemma)
			
 
				+            "embed-upstream-model": { type: "string" }, // Upstream model name in HTTP body
			
 
				+            "embed-batch-size": { type: "string" }, // Batch size for HTTP provider
			
 
				+            "embed-timeout-ms": { type: "string" }, // Per-request timeout
			
 
				+            "embed-auto-fallback": { type: "boolean" }, // Wrap openai in AutoFallback (local fallback)
			
 
				             // Update options
			
 
				             pull: { type: "boolean" }, // git pull before update
			
 
				             refresh: { type: "boolean" },
			
@@ -2314,6 +2386,14 @@ function showHelp() {
 
				     console.log("  qmd embed [-f]                - Generate/refresh vector embeddings");
			
 
				     console.log("    --max-docs-per-batch <n>    - Cap docs loaded into memory per embedding batch");
			
 
				     console.log("    --max-batch-mb <n>          - Cap UTF-8 MB loaded into memory per embedding batch");
			
 
				+    console.log("    --provider {local,openai}   - Embedding backend (default: local llama.cpp)");
			
 
				+    console.log("    --embed-endpoint <url>      - OpenAI-compatible endpoint (or QMD_EMBED_ENDPOINT)");
			
 
				+    console.log("    --embed-api-key <key>       - Bearer token (or QMD_EMBED_API_KEY)");
			
 
				+    console.log("    --embed-model-id <id>       - Stable model id stored in DB (default: embeddinggemma)");
			
 
				+    console.log("    --embed-upstream-model <m>  - Model name sent in HTTP body (default: same as model-id)");
			
 
				+    console.log("    --embed-batch-size <n>      - Batch size for HTTP provider (default: 64)");
			
 
				+    console.log("    --embed-timeout-ms <n>      - Per-request timeout in ms (default: 30000)");
			
 
				+    console.log("    --embed-auto-fallback       - Wrap openai provider in local fallback (or QMD_EMBED_AUTO_FALLBACK)");
			
 
				     console.log("  qmd cleanup                   - Clear caches, vacuum DB");
			
 
				     console.log("");
			
 
				     console.log("Query syntax (qmd query):");
			
@@ -2669,14 +2749,29 @@ if (isMain) {
 
				                 const maxDocsPerBatch = parseEmbedBatchOption("maxDocsPerBatch", cli.values["max-docs-per-batch"]);
			
 
				                 const maxBatchMb = parseEmbedBatchOption("maxBatchBytes", cli.values["max-batch-mb"]);
			
 
				                 const embedChunkStrategy = parseChunkStrategy(cli.values["chunk-strategy"]);
			
 
				+                // Build embedding provider from CLI flags + env + config file.
			
 
				+                // Backward compat: with no flags / env vars, the factory returns
			
 
				+                // a LocalLlamaCppProvider that delegates to the default LlamaCpp
			
 
				+                // singleton — identical to pre-patch behavior.
			
 
				+                const providerCliKind = parseProviderKind(cli.values["provider"]);
			
 
				+                const providerOpts = buildProviderOpts(cli.values, providerCliKind);
			
 
				+                const embedProvider = createEmbeddingProvider(providerOpts);
			
 
				                 await vectorIndex(DEFAULT_EMBED_MODEL_URI, !!cli.values.force, {
			
 
				                     maxDocsPerBatch,
			
 
				                     maxBatchBytes: maxBatchMb === undefined ? undefined : maxBatchMb * 1024 * 1024,
			
 
				                     chunkStrategy: embedChunkStrategy,
			
 
				+                    embedProvider,
			
 
				+                    providerKind: embedProvider.kind,
			
 
				                 });
			
 
				             }
			
 
				             catch (error) {
			
 
				-                console.error(error instanceof Error ? error.message : String(error));
			
 
				+                if (error instanceof ModelMismatchError) {
			
 
				+                    // Friendlier output for the migration-safety guard
			
 
				+                    console.error(`${c.red}Model mismatch:${c.reset} ${error.message}`);
			
 
				+                }
			
 
				+                else {
			
 
				+                    console.error(error instanceof Error ? error.message : String(error));
			
 
				+                }
			
 
				                 process.exit(1);
			
 
				             }
			
 
				             break;
			
--- a/dist/embedding/autofallback.d.ts
+++ b/dist/embedding/autofallback.d.ts
@@ -0,0 +1,84 @@
 
				+/**
			
 
				+ * autofallback.ts - AutoFallbackEmbeddingProvider.
			
 
				+ *
			
 
				+ * Composes a primary `EmbeddingProvider` (typically `OpenAIEmbeddingsProvider`)
			
 
				+ * and a fallback (typically `LocalLlamaCppProvider`). When the primary trips
			
 
				+ * its circuit breaker — or when persistent failures cross a threshold — calls
			
 
				+ * are routed to the fallback. After a recovery cooldown, the primary is
			
 
				+ * probed again; success closes the breaker and routing returns.
			
 
				+ *
			
 
				+ * Acceptance criterion 4 from i-qkarfffa: "Endpoint down → fallback local + WARN".
			
 
				+ *
			
 
				+ * Behavior summary:
			
 
				+ *   - Primary call succeeds → return; record success.
			
 
				+ *   - Primary throws CircuitOpenError → fall back, log WARN once per transition.
			
 
				+ *   - Primary throws any other error → fall back for THIS call only;
			
 
				+ *     count toward the failure-streak threshold.
			
 
				+ *   - When failure streak crosses threshold (default 3) → set our own
			
 
				+ *     "open until" timestamp; until expiry, route directly to fallback
			
 
				+ *     (skip primary entirely).
			
 
				+ *   - On expiry, retry primary opportunistically.
			
 
				+ *   - getModelId is always answered by the primary; getDimensions asks the
			
 
				+ *     primary first, then the fallback; dispose releases both providers.
			
 
				+ */
			
 
				+import type { EmbeddingProvider, ProviderEmbedOptions, ProviderEmbedding, ProviderHealth, ProviderKind } from "./provider.js";
			
 
				+export type AutoFallbackProviderConfig = {
			
 
				+    primary: EmbeddingProvider; // required; tried first whenever no cooldown is active
			
 
				+    fallback: EmbeddingProvider; // required; must be a different object than `primary`
			
 
				+    /**
			
 
				+     * Number of consecutive non-CircuitOpenError failures before we suppress
			
 
				+     * primary calls and route directly to fallback. Default: 3.
			
 
				+     */
			
 
				+    failureStreakThreshold?: number;
			
 
				+    /**
			
 
				+     * Time in ms to keep routing through fallback after the breaker opens.
			
 
				+     * Default: 5 minutes (matches `OpenAIEmbeddingsProvider`'s circuit duration).
			
 
				+     */
			
 
				+    cooldownMs?: number;
			
 
				+    /**
			
 
				+     * Optional WARN sink. Defaults to writing to `process.stderr` once per
			
 
				+     * routing transition (closed→open and open→closed).
			
 
				+     */
			
 
				+    warn?: (msg: string) => void;
			
 
				+    /** Custom clock for tests */
			
 
				+    now?: () => number; // defaults to Date.now; compared against `now() + cooldownMs`
			
 
				+};
			
 
				+export type FallbackState = "primary" | "fallback";
			
 
				+export declare class AutoFallbackEmbeddingProvider implements EmbeddingProvider {
			
 
				+    readonly kind: ProviderKind; // copied from the primary's kind in the constructor
			
 
				+    readonly primary: EmbeddingProvider; // tried first whenever no cooldown is active
			
 
				+    readonly fallback: EmbeddingProvider; // used when the primary call fails or a cooldown is active
			
 
				+    private readonly failureStreakThreshold; // streak length that opens the cooldown (default 3)
			
 
				+    private readonly cooldownMs; // how long the primary is skipped once the cooldown opens
			
 
				+    private readonly warn; // WARN sink; defaults to a process.stderr writer
			
 
				+    private readonly now; // clock; defaults to Date.now
			
 
				+    private failureStreak; // consecutive non-CircuitOpenError primary failures; zeroed on primary success
			
 
				+    private fallbackUntil; // timestamp (per now()) until which the primary is skipped; null when no cooldown
			
 
				+    private lastTransitionState; // last state passed to transition(); de-duplicates the recovery WARN
			
 
				+    constructor(config: AutoFallbackProviderConfig); // throws if primary/fallback is missing or both are the same object
			
 
				+    /**
			
 
				+     * Stable model id reported by the primary. The model-id guard runs against
			
 
				+     * the primary's id because that's what callers actually want when the
			
 
				+     * remote endpoint is online; on fallback-only operation, the local
			
 
				+     * provider should report a compatible id (in the default config, both
			
 
				+     * report "embeddinggemma" so this is moot).
			
 
				+     */
			
 
				+    getModelId(): string;
			
 
				+    getDimensions(): number | undefined; // the primary's dimensions, else the fallback's
			
 
				+    /** Current routing state (mostly for tests + observability) */
			
 
				+    getRoutingState(): FallbackState;
			
 
				+    /** Reset failure-streak + cooldown (mostly for tests / admin) */
			
 
				+    reset(): void;
			
 
				+    healthcheck(signal?: AbortSignal): Promise<ProviderHealth>; // primary's health if ok; otherwise a combined report whose `ok` is the fallback's
			
 
				+    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>; // routed through run(); rejects with the fallback's error if both providers fail
			
 
				+    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>; // [] for empty input; one null per text if both providers fail
			
 
				+    dispose(): Promise<void>; // disposes both providers via Promise.allSettled, so rejections are not propagated
			
 
				+    /**
			
 
				+     * Generic dispatcher: try primary if not in cooldown, fall back on
			
 
				+     * `CircuitOpenError`, count other errors against the failure streak.
			
 
				+     * `op` is invoked with whichever provider is selected.
			
 
				+     */
			
 
				+    private run;
			
 
				+    private openCooldown; // arms fallbackUntil and emits the fall-back WARN unless a cooldown is already running
			
 
				+    private transition; // records the routing state; emits the recovery WARN when it changes to "primary"
			
 
				+}
			
--- a/dist/embedding/autofallback.js
+++ b/dist/embedding/autofallback.js
@@ -0,0 +1,180 @@
 
				+/**
			
 
				+ * autofallback.ts - AutoFallbackEmbeddingProvider.
			
 
				+ *
			
 
				+ * Composes a primary `EmbeddingProvider` (typically `OpenAIEmbeddingsProvider`)
			
 
				+ * and a fallback (typically `LocalLlamaCppProvider`). When the primary trips
			
 
				+ * its circuit breaker — or when persistent failures cross a threshold — calls
			
 
				+ * are routed to the fallback. After a recovery cooldown, the primary is
			
 
				+ * probed again; success closes the breaker and routing returns.
			
 
				+ *
			
 
				+ * Acceptance criterion 4 from i-qkarfffa: "Endpoint down → fallback local + WARN".
			
 
				+ *
			
 
				+ * Behavior summary:
			
 
				+ *   - Primary call succeeds → return; record success.
			
 
				+ *   - Primary throws CircuitOpenError → fall back, log WARN once per transition.
			
 
				+ *   - Primary throws any other error → fall back for THIS call only;
			
 
				+ *     count toward the failure-streak threshold.
			
 
				+ *   - When failure streak crosses threshold (default 3) → set our own
			
 
				+ *     "open until" timestamp; until expiry, route directly to fallback
			
 
				+ *     (skip primary entirely).
			
 
				+ *   - On expiry, retry primary opportunistically.
			
 
				+ *   - getModelId is always answered by the primary; getDimensions asks the
			
 
				+ *     primary first, then the fallback; dispose releases both providers.
			
 
				+ */
			
 
				+import { CircuitOpenError } from "./openai.js";
			
 
				+const DEFAULT_FAILURE_STREAK = 3;
			
 
				+const DEFAULT_COOLDOWN_MS = 5 * 60_000;
			
 
				/**
				 * Default WARN sink: writes the message followed by a newline to stderr.
				 *
				 * @param {string} msg - Message to emit.
				 */
				function defaultWarn(msg) {
				    const line = `${msg}\n`;
				    process.stderr.write(line);
				}
			
 
				/**
				 * Embedding provider that wraps a primary provider and a fallback provider.
				 *
				 * Calls go to the primary unless a cooldown is active. A primary failure is
				 * served by the fallback for that call. A `CircuitOpenError`, or a streak of
				 * other failures reaching `failureStreakThreshold`, opens a cooldown during
				 * which the primary is skipped. The first primary success afterwards closes it.
				 *
				 * WARN messages are paired: "recovered" is only emitted after a "falling
				 * back" WARN. A single transient primary failure that never opened a
				 * cooldown produces no WARN at all.
				 */
				export class AutoFallbackEmbeddingProvider {
				    kind;
				    primary;
				    fallback;
				    failureStreakThreshold;
				    cooldownMs;
				    warn;
				    now;
				    /** Consecutive non-CircuitOpenError primary failures since the last primary success. */
				    failureStreak = 0;
				    /** Timestamp (per `now()`) until which the primary is skipped; `null` when no cooldown is armed. */
				    fallbackUntil = null;
				    /** Last announced routing state; only changes to "fallback" when a cooldown opens. */
				    lastTransitionState = "primary";

				    /**
				     * @param {object} config
				     * @param {object} config.primary - Provider tried first.
				     * @param {object} config.fallback - Provider used when the primary fails; must be a different object.
				     * @param {number} [config.failureStreakThreshold] - Failure streak that opens the cooldown.
				     * @param {number} [config.cooldownMs] - Cooldown length in ms.
				     * @param {(msg: string) => void} [config.warn] - WARN sink.
				     * @param {() => number} [config.now] - Clock; defaults to `Date.now`.
				     * @throws {Error} When primary or fallback is missing, or both are the same object.
				     */
				    constructor(config) {
				        if (!config.primary)
				            throw new Error("AutoFallbackEmbeddingProvider: primary is required");
				        if (!config.fallback)
				            throw new Error("AutoFallbackEmbeddingProvider: fallback is required");
				        if (config.primary === config.fallback) {
				            throw new Error("AutoFallbackEmbeddingProvider: primary and fallback must differ");
				        }
				        this.primary = config.primary;
				        this.fallback = config.fallback;
				        // Inherit the primary's kind for callers introspecting `provider.kind`.
				        this.kind = config.primary.kind;
				        this.failureStreakThreshold = config.failureStreakThreshold ?? DEFAULT_FAILURE_STREAK;
				        this.cooldownMs = config.cooldownMs ?? DEFAULT_COOLDOWN_MS;
				        this.warn = config.warn ?? defaultWarn;
				        this.now = config.now ?? Date.now;
				    }

				    /**
				     * Model id reported by the primary, regardless of the current routing state.
				     * @returns {string}
				     */
				    getModelId() {
				        return this.primary.getModelId();
				    }

				    /**
				     * Dimensions reported by the primary, or by the fallback when the primary
				     * reports null/undefined.
				     * @returns {number | undefined}
				     */
				    getDimensions() {
				        return this.primary.getDimensions() ?? this.fallback.getDimensions();
				    }

				    /**
				     * Current routing state (mostly for tests + observability).
				     * @returns {"primary" | "fallback"} "fallback" only while a cooldown is running.
				     */
				    getRoutingState() {
				        if (this.fallbackUntil !== null && this.now() < this.fallbackUntil) {
				            return "fallback";
				        }
				        return "primary";
				    }

				    /** Reset failure streak and cooldown; emits the recovery WARN if a cooldown had been announced. */
				    reset() {
				        this.failureStreak = 0;
				        this.fallbackUntil = null;
				        this.transition("primary");
				    }

				    /**
				     * Check the primary; if it is not ok, check the fallback so callers can
				     * tell whether any backend works.
				     * @param {AbortSignal} [signal] - Forwarded to both providers' healthchecks.
				     * @returns {Promise<object>} The primary's health when ok, otherwise a combined report.
				     */
				    async healthcheck(signal) {
				        const primaryHealth = await this.primary.healthcheck(signal);
				        if (primaryHealth.ok)
				            return primaryHealth;
				        const fallbackHealth = await this.fallback.healthcheck(signal);
				        return {
				            ok: fallbackHealth.ok,
				            model: this.primary.getModelId(),
				            dimensions: primaryHealth.dimensions ?? fallbackHealth.dimensions,
				            detail: `primary: ${primaryHealth.detail ?? "fail"} | fallback: ${fallbackHealth.detail ?? (fallbackHealth.ok ? "ok" : "fail")}`,
				        };
				    }

				    /**
				     * Embed one text. Rejects with the fallback's error when both providers fail.
				     * @param {string} text
				     * @param {object} [options] - Passed through to the selected provider.
				     */
				    async embed(text, options = {}) {
				        return this.run((p, opts) => p.embed(text, opts), options);
				    }

				    /**
				     * Embed several texts. Returns `[]` for empty input and one `null` per
				     * text when both providers fail.
				     * @param {string[]} texts
				     * @param {object} [options] - Passed through to the selected provider.
				     */
				    async embedBatch(texts, options = {}) {
				        if (texts.length === 0)
				            return [];
				        return this.run((p, opts) => p.embedBatch(texts, opts), options, () => texts.map(() => null));
				    }

				    /** Dispose both providers; rejections from either are not propagated. */
				    async dispose() {
				        await Promise.allSettled([this.primary.dispose(), this.fallback.dispose()]);
				    }

				    // ────────────────────── Internals ──────────────────────

				    /**
				     * Generic dispatcher: try primary if not in cooldown, fall back on
				     * `CircuitOpenError`, count other errors against the failure streak.
				     *
				     * @param {(provider: object, options: object) => Promise<any>} op - Invoked with the selected provider.
				     * @param {object} options - Forwarded to `op`.
				     * @param {() => any} [onTotalFail] - Produces the result when the fallback fails too;
				     *   without it the fallback's error is rethrown.
				     */
				    async run(op, options, onTotalFail) {
				        const inCooldown = this.fallbackUntil !== null && this.now() < this.fallbackUntil;
				        if (inCooldown) {
				            // Skip primary entirely. The "fallback" state was already recorded
				            // (and announced) by openCooldown when this cooldown was armed.
				            try {
				                return await op(this.fallback, options);
				            }
				            catch (err) {
				                if (onTotalFail)
				                    return onTotalFail();
				                throw err;
				            }
				        }
				        // Try primary first
				        try {
				            const result = await op(this.primary, options);
				            // Success — clear streak and cooldown; announce recovery only if a
				            // fall-back had been announced before.
				            this.failureStreak = 0;
				            this.fallbackUntil = null;
				            this.transition("primary");
				            return result;
				        }
				        catch (err) {
				            if (err instanceof CircuitOpenError) {
				                // Primary circuit is open — open our own cooldown so
				                // subsequent calls skip the primary.
				                this.openCooldown(`primary CircuitOpenError`);
				            }
				            else {
				                this.failureStreak++;
				                if (this.failureStreak >= this.failureStreakThreshold) {
				                    this.openCooldown(`primary failure streak ${this.failureStreak} ≥ ${this.failureStreakThreshold}`);
				                }
				            }
				            // Serve THIS call from the fallback regardless. Deliberately no
				            // state transition here: a one-off fallback that did not open a
				            // cooldown was never announced, so the next primary success must
				            // not log a "recovered" WARN.
				            try {
				                return await op(this.fallback, options);
				            }
				            catch (fbErr) {
				                if (onTotalFail)
				                    return onTotalFail();
				                // Both providers failed — surface the fallback error (the primary
				                // failure already informed the breaker).
				                throw fbErr;
				            }
				        }
				    }

				    /**
				     * Arm the cooldown and emit the fall-back WARN, unless a cooldown is
				     * already running.
				     * @param {string} reason - Included in the WARN message.
				     */
				    openCooldown(reason) {
				        if (this.fallbackUntil === null || this.now() >= this.fallbackUntil) {
				            this.fallbackUntil = this.now() + this.cooldownMs;
				            this.warn(`[AutoFallbackEmbeddingProvider] WARN — falling back to "${this.fallback.kind}" provider for ${Math.round(this.cooldownMs / 1000)}s (reason: ${reason})`);
				            // Record the state exactly where the fall-back WARN is emitted so
				            // that recovery WARNs stay paired with it.
				            this.transition("fallback");
				        }
				    }

				    /**
				     * Record a routing-state change; emits the recovery WARN when the state
				     * changes to "primary". No-op when the state is unchanged.
				     * @param {"primary" | "fallback"} to
				     */
				    transition(to) {
				        if (this.lastTransitionState === to)
				            return;
				        this.lastTransitionState = to;
				        if (to === "primary") {
				            this.warn(`[AutoFallbackEmbeddingProvider] WARN — primary "${this.primary.kind}" recovered, routing restored`);
				        }
				        // The "fallback" transition WARN is emitted by openCooldown
				        // (with a richer message). No second WARN here.
				    }
				}
			
--- a/dist/embedding/factory.d.ts
+++ b/dist/embedding/factory.d.ts
@@ -0,0 +1,82 @@
 
				+/**
			
 
				+ * factory.ts - EmbeddingProvider factory with config precedence.
			
 
				+ *
			
 
				+ * Resolution order (first match wins):
			
 
				+ *   1. Explicit `kind` argument or `--provider` CLI flag → forces a kind
			
 
				+ *   2. `QMD_EMBED_ENDPOINT` env var present and non-empty → "openai"
			
 
				+ *   3. Config file (`~/.config/qmd/config.json`) `embedProvider.kind` → that kind
			
 
				+ *   4. Otherwise → "local" (legacy / backward-compat)
			
 
				+ *
			
 
				+ * Backward compat invariant: when neither `QMD_EMBED_ENDPOINT` nor
			
 
				+ * `~/.config/qmd/config.json` mentions a provider, callers get the same
			
 
				+ * `LocalLlamaCppProvider` they had before this change.
			
 
				+ */
			
 
				+import { type LocalLlamaCppProviderConfig } from "./local.js";
			
 
				+import { type OpenAIProviderConfig } from "./openai.js";
			
 
				+import { type AutoFallbackProviderConfig } from "./autofallback.js";
			
 
				+import type { EmbeddingProvider, ProviderKind } from "./provider.js";
			
 
				+export type EmbedProviderConfigFile = {
			
 
				+    embedProvider?: {
			
 
				+        kind?: ProviderKind;
			
 
				+        endpoint?: string;
			
 
				+        apiKey?: string;
			
 
				+        modelId?: string;
			
 
				+        upstreamModel?: string;
			
 
				+        batchSize?: number;
			
 
				+        timeoutMs?: number;
			
 
				+        /** When true, wrap the openai provider in AutoFallback (local fallback). */
			
 
				+        autoFallback?: boolean;
			
 
				+    };
			
 
				+};
			
 
				+export declare function defaultConfigPath(): string;
			
 
				+/**
			
 
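				+ * Load the config file (by default `$XDG_CONFIG_HOME/qmd/config.json`, or
			
 
				+ * `~/.config/qmd/config.json` when `XDG_CONFIG_HOME` is unset) if present.
			
 
				+ * Returns an empty object on any read/parse error so we silently fall back
			
 
				+ * to env/local.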
			
 
				+ */
			
 
				+export declare function loadConfigFile(path?: string): EmbedProviderConfigFile;
			
 
				+export type CreateEmbeddingProviderOptions = {
			
 
				+    /** Force a specific provider kind. Overrides env + config. */
			
 
				+    kind?: ProviderKind;
			
 
				+    /** Override config file path (mostly for tests) */
			
 
				+    configPath?: string;
			
 
				+    /** Local-provider overrides */
			
 
				+    local?: LocalLlamaCppProviderConfig;
			
 
				+    /** OpenAI-provider overrides — merged on top of env/config */
			
 
				+    openai?: Partial<OpenAIProviderConfig>;
			
 
				+    /**
			
 
				+     * Wrap the chosen provider in `AutoFallbackEmbeddingProvider` so that a
			
 
				+     * remote outage transparently falls back to local llama.cpp. Default:
			
 
				+     * `false` — opt-in, since the wrapper requires both backends to be
			
 
				+     * available and the local one will warm node-llama-cpp on first call.
			
 
				+     *
			
 
				+     * Resolution: explicit `autoFallback` wins → env `QMD_EMBED_AUTO_FALLBACK`
			
 
				+     * (`1`/`true`) → config-file `embedProvider.autoFallback` → false.
			
 
				+     *
			
 
				+     * Only applies when the resolved kind is `openai` (no fallback wrap when
			
 
				+     * the primary IS local already).
			
 
				+     */
			
 
				+    autoFallback?: boolean;
			
 
				+    /**
			
 
				+     * Override config for `AutoFallbackEmbeddingProvider` (failureStreak,
			
 
				+     * cooldownMs, etc.). Only used when `autoFallback` resolves true.
			
 
				+     * Primary + fallback are constructed automatically.
			
 
				+     */
			
 
				+    autoFallbackOverrides?: Omit<AutoFallbackProviderConfig, "primary" | "fallback">;
			
 
				+    /**
			
 
				+     * Custom env source (mostly for tests). Defaults to `process.env`.
			
 
				+     * Read keys: QMD_EMBED_PROVIDER, QMD_EMBED_ENDPOINT, QMD_EMBED_API_KEY,
			
 
				+     * QMD_EMBED_MODEL_ID, QMD_EMBED_UPSTREAM_MODEL, QMD_EMBED_BATCH_SIZE,
			
 
				+     * QMD_EMBED_TIMEOUT_MS, QMD_EMBED_AUTO_FALLBACK.
			
 
				+     */
			
 
				+    env?: Record<string, string | undefined>;
			
 
				+};
			
 
				+/**
			
 
				+ * Resolve the provider kind without instantiating anything. Useful for
			
 
				+ * logging and tests.
			
 
				+ */
			
 
				+export declare function resolveProviderKind(opts?: CreateEmbeddingProviderOptions): ProviderKind;
			
 
				+/**
			
 
				+ * Factory entry point — returns the appropriate `EmbeddingProvider`.
			
 
				+ * Throws if `openai` kind is requested but no endpoint is configured.
			
 
				+ */
			
 
				+export declare function createEmbeddingProvider(opts?: CreateEmbeddingProviderOptions): EmbeddingProvider;
			
--- a/dist/embedding/factory.js
+++ b/dist/embedding/factory.js
@@ -0,0 +1,150 @@
 
				+/**
			
 
				+ * factory.ts - EmbeddingProvider factory with config precedence.
			
 
				+ *
			
 
				+ * Resolution order (first match wins):
			
 
				+ *   1. Explicit `kind` argument or `--provider` CLI flag → forces a kind
			
 
				+ *   2. `QMD_EMBED_PROVIDER` env var set to "local" or "openai" → that kind
			
 
				+ *   3. `QMD_EMBED_ENDPOINT` env var present and non-empty → "openai"
			
 
				+ *   4. Config file (`~/.config/qmd/config.json`) `embedProvider.kind` → that kind;
			
 
				+ *      otherwise a non-empty `embedProvider.endpoint` in the config file → "openai"
			
 
				+ *   5. Otherwise → "local" (legacy / backward-compat)
			
 
				+ *
			
 
				+ * Backward compat invariant: when neither `QMD_EMBED_ENDPOINT` nor
			
 
				+ * `~/.config/qmd/config.json` mentions a provider, callers get the same
			
 
				+ * `LocalLlamaCppProvider` they had before this change.
			
 
				+ */
			
 
				+import { existsSync, readFileSync } from "node:fs";
			
 
				+import { homedir } from "node:os";
			
 
				+import { join } from "node:path";
			
 
				+import { LocalLlamaCppProvider } from "./local.js";
			
 
				+import { OpenAIEmbeddingsProvider, } from "./openai.js";
			
 
				+import { AutoFallbackEmbeddingProvider, } from "./autofallback.js";
			
 
				/**
				 * Compute the default location of qmd's JSON config file.
				 *
				 * The file is `qmd/config.json` under `$XDG_CONFIG_HOME` when that variable
				 * is set and non-empty, otherwise under `.config` in the user's home
				 * directory.
				 *
				 * @returns {string} Path to the config file.
				 */
				export function defaultConfigPath() {
				    // An empty XDG_CONFIG_HOME is falsy, so it falls through to ~/.config.
				    const configRoot = process.env.XDG_CONFIG_HOME || join(homedir(), ".config");
				    return join(configRoot, "qmd", "config.json");
				}
			
 
				/**
				 * Load the qmd JSON config file if present.
				 *
				 * Returns an empty object when the file is missing, unreadable, not valid
				 * JSON, or when its top-level value is not a plain JSON object (e.g. `null`,
				 * a number, a string, or an array), so callers silently fall back to
				 * env/local resolution.
				 *
				 * @param {string} [path] Config file path; defaults to `defaultConfigPath()`.
				 * @returns {object} The parsed config object, or `{}`.
				 */
				export function loadConfigFile(path = defaultConfigPath()) {
				    if (!existsSync(path))
				        return {};
				    try {
				        const parsed = JSON.parse(readFileSync(path, "utf-8"));
				        // Arrays also report typeof "object"; they are not a valid config shape.
				        const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
				        if (isPlainObject)
				            return parsed;
				    }
				    catch {
				        // Deliberate best-effort: invalid JSON, missing read permission, a
				        // directory at `path`, etc. all fall through to the empty config.
				    }
				    return {};
				}
			
 
				/**
				 * Resolve the provider kind without instantiating anything. Useful for
				 * logging and tests.
				 *
				 * Precedence (first match wins):
				 *   1. `opts.kind`
				 *   2. env `QMD_EMBED_PROVIDER` equal to "local" or "openai" (trimmed,
				 *      case-insensitive)
				 *   3. env `QMD_EMBED_ENDPOINT` non-blank → "openai"
				 *   4. config file `embedProvider.kind` equal to "local" or "openai"
				 *   5. config file `embedProvider.endpoint` non-blank string → "openai"
				 *   6. "local"
				 *
				 * The config file is read only when steps 1–3 did not decide, so an explicit
				 * kind or env override never touches the filesystem.
				 *
				 * @param {object} [opts] Factory options; reads `kind`, `env`, `configPath`.
				 * @returns {string} The resolved provider kind.
				 */
				export function resolveProviderKind(opts = {}) {
				    // 1. Explicit kind argument
				    if (opts.kind)
				        return opts.kind;
				    const env = opts.env ?? process.env;
				    // 2. Explicit env override
				    const envKind = env.QMD_EMBED_PROVIDER?.trim().toLowerCase();
				    if (envKind === "local" || envKind === "openai")
				        return envKind;
				    // 3. Endpoint env present → openai
				    if (env.QMD_EMBED_ENDPOINT && env.QMD_EMBED_ENDPOINT.trim() !== "") {
				        return "openai";
				    }
				    // 4./5. Config file — loaded lazily, only once the cheaper sources are exhausted.
				    const fileProvider = loadConfigFile(opts.configPath).embedProvider;
				    if (fileProvider?.kind === "local" || fileProvider?.kind === "openai") {
				        return fileProvider.kind;
				    }
				    // The config is hand-edited JSON, so `endpoint` may not be a string;
				    // guard before calling `.trim()` instead of throwing a TypeError.
				    const fileEndpoint = fileProvider?.endpoint;
				    if (typeof fileEndpoint === "string" && fileEndpoint.trim() !== "") {
				        return "openai";
				    }
				    // 6. Default
				    return "local";
				}
			
 
				/**
				 * Factory entry point — returns the appropriate `EmbeddingProvider`.
				 *
				 * For kind "local" a `LocalLlamaCppProvider` built from `opts.local` is
				 * returned. For kind "openai" every setting is resolved as
				 * explicit `opts.openai.*` → environment variable → config file, and an
				 * `OpenAIEmbeddingsProvider` is built. When auto-fallback resolves to true
				 * the OpenAI provider is wrapped in an `AutoFallbackEmbeddingProvider` whose
				 * fallback is a `LocalLlamaCppProvider`.
				 *
				 * A blank environment variable is treated as unset, so it cannot mask a
				 * config-file value. This matches `resolveProviderKind`, which also ignores a
				 * blank `QMD_EMBED_ENDPOINT`.
				 *
				 * @param {object} [opts] Factory options (`kind`, `env`, `configPath`,
				 *   `local`, `openai`, `autoFallback`, `autoFallbackOverrides`).
				 * @returns {object} The constructed embedding provider.
				 * @throws {Error} If the resolved kind is "openai" but no non-blank string
				 *   endpoint is configured.
				 */
				export function createEmbeddingProvider(opts = {}) {
				    const env = opts.env ?? process.env;
				    const cfg = loadConfigFile(opts.configPath);
				    const kind = resolveProviderKind(opts);
				    if (kind === "local") {
				        return new LocalLlamaCppProvider(opts.local ?? {});
				    }
				    // OpenAI
				    const endpoint = opts.openai?.endpoint ??
				        readNonBlankEnv(env, "QMD_EMBED_ENDPOINT") ??
				        cfg.embedProvider?.endpoint;
				    // The config file is untyped JSON: reject non-string endpoints with the
				    // same actionable error rather than crashing on `.trim()`.
				    if (typeof endpoint !== "string" || endpoint.trim() === "") {
				        throw new Error('createEmbeddingProvider: kind="openai" requires an endpoint. ' +
				            "Set QMD_EMBED_ENDPOINT env var, or `embedProvider.endpoint` in " +
				            "~/.config/qmd/config.json, or pass `openai.endpoint`.");
				    }
				    const apiKey = opts.openai?.apiKey ??
				        readNonBlankEnv(env, "QMD_EMBED_API_KEY") ??
				        cfg.embedProvider?.apiKey;
				    const modelId = opts.openai?.modelId ??
				        readNonBlankEnv(env, "QMD_EMBED_MODEL_ID") ??
				        cfg.embedProvider?.modelId ??
				        "embeddinggemma";
				    const upstreamModel = opts.openai?.upstreamModel ??
				        readNonBlankEnv(env, "QMD_EMBED_UPSTREAM_MODEL") ??
				        cfg.embedProvider?.upstreamModel;
				    const batchSizeRaw = opts.openai?.batchSize ??
				        parsePositiveInt(env.QMD_EMBED_BATCH_SIZE) ??
				        cfg.embedProvider?.batchSize;
				    const timeoutMsRaw = opts.openai?.timeoutMs ??
				        parsePositiveInt(env.QMD_EMBED_TIMEOUT_MS) ??
				        cfg.embedProvider?.timeoutMs;
				    const openaiProvider = new OpenAIEmbeddingsProvider({
				        endpoint,
				        apiKey,
				        modelId,
				        upstreamModel,
				        batchSize: batchSizeRaw,
				        timeoutMs: timeoutMsRaw,
				        fetchImpl: opts.openai?.fetchImpl,
				        retryBackoffsMs: opts.openai?.retryBackoffsMs,
				        sleep: opts.openai?.sleep,
				        now: opts.openai?.now,
				    });
				    // Should we wrap with AutoFallback? Resolution: arg → env → config → false.
				    const autoFallback = resolveAutoFallback(opts, env, cfg);
				    if (!autoFallback)
				        return openaiProvider;
				    return new AutoFallbackEmbeddingProvider({
				        primary: openaiProvider,
				        // Without explicit local options the fallback reports the same
				        // model id as the primary.
				        fallback: new LocalLlamaCppProvider(opts.local ?? { modelId }),
				        ...(opts.autoFallbackOverrides ?? {}),
				    });
				}
				/** Read `env[name]`, returning `undefined` when it is unset or only whitespace. */
				function readNonBlankEnv(env, name) {
				    const raw = env[name];
				    return typeof raw === "string" && raw.trim() !== "" ? raw : undefined;
				}
			
 
				/**
				 * Decide whether the OpenAI provider should be wrapped with auto-fallback.
				 *
				 * Precedence: boolean `opts.autoFallback` → env `QMD_EMBED_AUTO_FALLBACK`
				 * (`1`/`true`/`yes` → true, `0`/`false`/`no` → false, anything else ignored)
				 * → boolean `cfg.embedProvider.autoFallback` → false.
				 *
				 * @param {object} opts Factory options.
				 * @param {object} env Environment variable map.
				 * @param {object} cfg Parsed config file.
				 * @returns {boolean} True when the fallback wrapper should be used.
				 */
				function resolveAutoFallback(opts, env, cfg) {
				    const explicit = opts.autoFallback;
				    if (typeof explicit === "boolean") {
				        return explicit;
				    }
				    const flag = env.QMD_EMBED_AUTO_FALLBACK?.trim().toLowerCase();
				    switch (flag) {
				        case "1":
				        case "true":
				        case "yes":
				            return true;
				        case "0":
				        case "false":
				        case "no":
				            return false;
				        default:
				            // Unset or unrecognised value: defer to the config file.
				            break;
				    }
				    const fromFile = cfg.embedProvider?.autoFallback;
				    return typeof fromFile === "boolean" ? fromFile : false;
				}
			
 
				+// ─────────────────────────── Helpers ────────────────────────────────────────
			
 
				/**
				 * Parse a strictly positive whole number from an environment-style value.
				 *
				 * Only an optional leading `+` followed by decimal digits is accepted
				 * (surrounding whitespace is ignored). Values with trailing garbage or a
				 * fractional part — "30s", "1.5", "1e3" — are rejected rather than being
				 * truncated to their numeric prefix, so a mistyped setting falls back to the
				 * next source instead of silently becoming a tiny number.
				 *
				 * @param {string|undefined} v Raw value.
				 * @returns {number|undefined} The parsed integer, or `undefined` when the
				 *   value is missing, malformed, zero, negative, or not a safe integer.
				 */
				function parsePositiveInt(v) {
				    if (!v)
				        return undefined;
				    const text = String(v).trim();
				    if (!/^\+?\d+$/.test(text))
				        return undefined;
				    const parsed = Number.parseInt(text, 10);
				    if (!Number.isSafeInteger(parsed) || parsed <= 0)
				        return undefined;
				    return parsed;
				}
			
--- a/dist/embedding/index.d.ts
+++ b/dist/embedding/index.d.ts
@@ -0,0 +1,8 @@
 
				+/**
			
 
				+ * embedding/index.ts - re-exports for the embedding provider abstraction.
			
 
				+ */
			
 
				+export { type EmbeddingProvider, type ProviderKind, type ProviderEmbedding, type ProviderEmbedOptions, type ProviderHealth, ModelMismatchError, assertModelCompatible, } from "./provider.js";
			
 
				+export { LocalLlamaCppProvider, type LocalLlamaCppProviderConfig, } from "./local.js";
			
 
				+export { OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, isRetryableStatus, chunkArray, type OpenAIProviderConfig, type CircuitState, DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_MS, RETRY_BACKOFFS_MS, } from "./openai.js";
			
 
				+export { createEmbeddingProvider, resolveProviderKind, loadConfigFile, defaultConfigPath, type CreateEmbeddingProviderOptions, type EmbedProviderConfigFile, } from "./factory.js";
			
 
				+export { AutoFallbackEmbeddingProvider, type AutoFallbackProviderConfig, type FallbackState, } from "./autofallback.js";
			
--- a/dist/embedding/index.js
+++ b/dist/embedding/index.js
@@ -0,0 +1,8 @@
 
				+/**
			
 
				+ * embedding/index.ts - re-exports for the embedding provider abstraction.
			
 
				+ */
			
 
				+export { ModelMismatchError, assertModelCompatible, } from "./provider.js";
			
 
				+export { LocalLlamaCppProvider, } from "./local.js";
			
 
				+export { OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, isRetryableStatus, chunkArray, DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_MS, RETRY_BACKOFFS_MS, } from "./openai.js";
			
 
				+export { createEmbeddingProvider, resolveProviderKind, loadConfigFile, defaultConfigPath, } from "./factory.js";
			
 
				+export { AutoFallbackEmbeddingProvider, } from "./autofallback.js";
			
--- a/dist/embedding/local.d.ts
+++ b/dist/embedding/local.d.ts
@@ -0,0 +1,31 @@
 
				+/**
			
 
				+ * local.ts - Local llama.cpp adapter implementing EmbeddingProvider.
			
 
				+ *
			
 
				+ * Wraps an existing `LlamaCpp` instance so the legacy GGUF path looks like
			
 
				+ * any other EmbeddingProvider to upstream callers. Used as the default and
			
 
				+ * as the fallback target when `OpenAIEmbeddingsProvider` trips its breaker.
			
 
				+ */
			
 
				+import { type LlamaCpp } from "../llm.js";
			
 
				+import type { EmbeddingProvider, ProviderEmbedOptions, ProviderEmbedding, ProviderHealth, ProviderKind } from "./provider.js";
			
 
				+export type LocalLlamaCppProviderConfig = {
			
 
				+    /** Pre-built LlamaCpp instance (optional — falls back to global singleton). */
			
 
				+    llm?: LlamaCpp;
			
 
				+    /**
			
 
				+     * Stable model id reported via `getModelId()`. Defaults to "embeddinggemma"
			
 
				+     * to match the value in `content_vectors.model` for existing qmd installs.
			
 
				+     */
			
 
				+    modelId?: string;
			
 
				+};
			
 
				+export declare class LocalLlamaCppProvider implements EmbeddingProvider {
			
 
				+    readonly kind: ProviderKind;
			
 
				+    private readonly llm;
			
 
				+    private readonly modelId;
			
 
				+    private dimensions;
			
 
				+    constructor(config?: LocalLlamaCppProviderConfig);
			
 
				+    getModelId(): string;
			
 
				+    getDimensions(): number | undefined;
			
 
				+    healthcheck(_signal?: AbortSignal): Promise<ProviderHealth>;
			
 
				+    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
			
 
				+    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
			
 
				+    dispose(): Promise<void>;
			
 
				+}
			
--- a/dist/embedding/local.js
+++ b/dist/embedding/local.js
@@ -0,0 +1,91 @@
 
				+/**
			
 
				+ * local.ts - Local llama.cpp adapter implementing EmbeddingProvider.
			
 
				+ *
			
 
				+ * Wraps an existing `LlamaCpp` instance so the legacy GGUF path looks like
			
 
				+ * any other EmbeddingProvider to upstream callers. Used as the default and
			
 
				+ * as the fallback target when `OpenAIEmbeddingsProvider` trips its breaker.
			
 
				+ */
			
 
				+import { getDefaultLlamaCpp, } from "../llm.js";
			
 
				/**
				 * EmbeddingProvider adapter around a `LlamaCpp` instance.
				 *
				 * Embedding calls are delegated to `llm.embed` / `llm.embedBatch`. The
				 * embedding dimension is learned lazily from the first non-empty embedding
				 * seen (or from a successful healthcheck probe) and reported through
				 * `getDimensions()`.
				 */
				export class LocalLlamaCppProvider {
				    kind = "local";
				    llm;
				    modelId;
				    dimensions = undefined;
				    /**
				     * @param {object} [config]
				     * @param {object} [config.llm] LlamaCpp instance; defaults to `getDefaultLlamaCpp()`.
				     * @param {string} [config.modelId] Model id to report; defaults to "embeddinggemma".
				     */
				    constructor(config = {}) {
				        this.llm = config.llm ?? getDefaultLlamaCpp();
				        this.modelId = config.modelId ?? "embeddinggemma";
				    }
				    /** @returns {string} The model id this provider reports. */
				    getModelId() {
				        return this.modelId;
				    }
				    /** @returns {number|undefined} Embedding length, or `undefined` until one has been observed. */
				    getDimensions() {
				        return this.dimensions;
				    }
				    /**
				     * Probe the backend with a single embed call.
				     *
				     * Never rejects: a thrown error is reported as `{ ok: false, detail }`.
				     * The signal argument is accepted but not used by this implementation.
				     *
				     * @returns {Promise<object>} Health report with `ok`, `model`, `detail`
				     *   and, on success, `dimensions`.
				     */
				    async healthcheck(_signal) {
				        // For the local provider, "healthy" means the embed model loads.
				        try {
				            const result = await this.llm.embed("healthcheck", { model: this.modelId });
				            if (!result) {
				                return {
				                    ok: false,
				                    model: this.modelId,
				                    detail: "embed probe returned null",
				                };
				            }
				            this.dimensions = result.embedding.length;
				            return {
				                ok: true,
				                model: this.modelId,
				                dimensions: this.dimensions,
				                detail: `local llama.cpp ready, ${this.dimensions}-d`,
				            };
				        }
				        catch (err) {
				            return {
				                ok: false,
				                model: this.modelId,
				                detail: err instanceof Error ? err.message : String(err),
				            };
				        }
				    }
				    /**
				     * Embed one text.
				     *
				     * @param {string} text Input text.
				     * @param {object} [options] `signal` (checked once, before the call) and
				     *   `model` (forwarded to the LlamaCpp instance in place of the provider's model id).
				     * @returns {Promise<object|null>} `{ embedding, model }`, or `null` when the
				     *   signal is already aborted or the backend returns nothing. `model`
				     *   is always this provider's model id.
				     */
				    async embed(text, options = {}) {
				        if (options.signal?.aborted)
				            return null;
				        const result = await this.llm.embed(text, { model: options.model ?? this.modelId });
				        if (!result)
				            return null;
				        // Same guard as embedBatch: never cache 0 as the dimension, otherwise
				        // an empty embedding would permanently block learning the real size.
				        if (this.dimensions === undefined && result.embedding.length > 0) {
				            this.dimensions = result.embedding.length;
				        }
				        return {
				            embedding: result.embedding,
				            model: this.modelId,
				        };
				    }
				    /**
				     * Embed several texts in one backend call.
				     *
				     * @param {string[]} texts Input texts.
				     * @param {object} [options] Same as for `embed`.
				     * @returns {Promise<Array<object|null>>} One entry per backend result;
				     *   all `null` when the signal is already aborted, `[]` for empty input.
				     */
				    async embedBatch(texts, options = {}) {
				        if (texts.length === 0)
				            return [];
				        if (options.signal?.aborted)
				            return texts.map(() => null);
				        const raw = await this.llm.embedBatch(texts, {
				            model: options.model ?? this.modelId,
				        });
				        return raw.map((r) => {
				            if (!r)
				                return null;
				            if (this.dimensions === undefined && r.embedding.length > 0) {
				                this.dimensions = r.embedding.length;
				            }
				            return {
				                embedding: r.embedding,
				                model: this.modelId,
				            };
				        });
				    }
				    /** No-op: the wrapped LlamaCpp instance is intentionally left alive. */
				    async dispose() {
				        // We do NOT dispose the underlying LlamaCpp here because the singleton
				        // is shared with rerank/generate/expansion paths. Disposal is handled
				        // by the existing `disposeDefaultLlamaCpp()` global hook.
				    }
				}
			
--- a/dist/embedding/openai.d.ts
+++ b/dist/embedding/openai.d.ts
@@ -0,0 +1,184 @@
 
				+/**
			
 
				+ * openai.ts - OpenAI-compatible HTTP embedding provider
			
 
				+ *
			
 
				+ * Talks to any endpoint that implements `POST /v1/embeddings` with the OpenAI
			
 
				+ * shape: request `{model, input: string|string[]}`, response
			
 
				+ * `{data: [{embedding: number[], index: number}, ...]}`.
			
 
				+ *
			
 
				+ * Used by qmd to delegate embeddings to a GPU worker (e.g. ai.mm.mk →
			
 
				+ * qmd-embed-worker on `models` LXC, RTX 4090) instead of running
			
 
				+ * node-llama-cpp locally.
			
 
				+ *
			
 
				+ * Features:
			
 
				+ *   - Batches input in groups of ≤64 (configurable via QMD_EMBED_BATCH_SIZE)
			
 
				+ *   - Retries 429 / 503 with exponential backoff (1s, 4s, 16s)
			
 
				+ *   - 4xx (non-429) → no retry, count as failure
			
 
				+ *   - Circuit breaker: >50% failures in a 60s window → OPEN for 5 min,
			
 
				+ *     callers can use this to fall back to a local provider
			
 
				+ *   - Per-call timeout via AbortSignal (default QMD_EMBED_TIMEOUT_MS=30000)
			
 
				+ *   - Healthcheck via `GET /health` if available, else a probe embed call
			
 
				+ */
			
 
				+import type { EmbeddingProvider, ProviderEmbedOptions, ProviderEmbedding, ProviderHealth, ProviderKind } from "./provider.js";
			
 
				+/**
			
 
				+ * Default batch size — most OpenAI-compatible embedding endpoints accept up to
			
 
				+ * 2048 inputs per call but for memory and latency we cap at 64.
			
 
				+ */
			
 
				+export declare const DEFAULT_BATCH_SIZE = 64;
			
 
				+/**
			
 
				+ * Default per-request timeout (30 s). embeddinggemma-300M on RTX 4090 takes
			
 
				+ * <500ms per batch of 64 in practice; 30s is a safe upper bound.
			
 
				+ */
			
 
				+export declare const DEFAULT_TIMEOUT_MS = 30000;
			
 
				+/**
			
 
				+ * Retry backoff schedule (ms) for 429/503 responses. 3 attempts total
			
 
				+ * (initial + 2 retries) — aligns with issue spec "1s/4s/16s".
			
 
				+ */
			
 
				+export declare const RETRY_BACKOFFS_MS: readonly number[];
			
 
				+/**
			
 
				+ * Circuit breaker — flips OPEN when error rate exceeds threshold within
			
 
				+ * window. While OPEN, every call fails fast so the caller can fall back.
			
 
				+ */
			
 
				+export declare const CIRCUIT_WINDOW_MS = 60000;
			
 
				+export declare const CIRCUIT_OPEN_DURATION_MS: number;
			
 
				+export declare const CIRCUIT_FAILURE_RATE_THRESHOLD = 0.5;
			
 
				+export declare const CIRCUIT_MIN_SAMPLES = 4;
			
 
				+export type OpenAIProviderConfig = {
			
 
				+    /** Endpoint base URL — e.g. "https://ai.mm.mk" (no trailing slash) */
			
 
				+    endpoint: string;
			
 
				+    /** Optional bearer token sent as `Authorization: Bearer ...` */
			
 
				+    apiKey?: string;
			
 
				+    /**
			
 
				+     * Stable model identifier to report up via `getModelId()`.
			
 
				+     * Defaults to "embeddinggemma" to match qmd's existing DB rows.
			
 
				+     */
			
 
				+    modelId?: string;
			
 
				+    /**
			
 
				+     * Upstream model name sent in the HTTP request body. Often differs from
			
 
				+     * `modelId` (e.g. modelId="embeddinggemma" but upstream model="embeddinggemma:300m").
			
 
				+     */
			
 
				+    upstreamModel?: string;
			
 
				+    /** Batch size cap (default DEFAULT_BATCH_SIZE = 64) */
			
 
				+    batchSize?: number;
			
 
				+    /** Per-request timeout in ms (default DEFAULT_TIMEOUT_MS = 30_000) */
			
 
				+    timeoutMs?: number;
			
 
				+    /** Custom fetch (for testing). Defaults to global `fetch`. */
			
 
				+    fetchImpl?: typeof fetch;
			
 
				+    /** Custom retry schedule (for testing). Defaults to RETRY_BACKOFFS_MS. */
			
 
				+    retryBackoffsMs?: readonly number[];
			
 
				+    /** Custom sleep impl (for testing). Defaults to setTimeout. */
			
 
				+    sleep?: (ms: number) => Promise<void>;
			
 
				+    /** Custom clock (for testing). Defaults to Date.now. */
			
 
				+    now?: () => number;
			
 
				+};
			
 
				+export type OpenAIEmbeddingsResponse = {
			
 
				+    object?: string;
			
 
				+    model?: string;
			
 
				+    data: Array<{
			
 
				+        object?: string;
			
 
				+        index: number;
			
 
				+        embedding: number[];
			
 
				+    }>;
			
 
				+    usage?: {
			
 
				+        prompt_tokens?: number;
			
 
				+        total_tokens?: number;
			
 
				+    };
			
 
				+};
			
 
				+/**
			
 
				+ * Circuit breaker state — exported for tests
			
 
				+ */
			
 
				+export type CircuitState = "closed" | "open" | "half-open";
			
 
				+/**
			
 
				+ * Determine whether an HTTP status is retryable. 429 (Too Many Requests)
			
 
				+ * and 503 (Service Unavailable) are retried; 4xx (other than 429) are not.
			
 
				+ */
			
 
				+export declare function isRetryableStatus(status: number): boolean;
			
 
				+/**
			
 
				+ * Chunk an array into pieces of ≤ size each. `size` MUST be ≥ 1.
			
 
				+ */
			
 
				+export declare function chunkArray<T>(items: T[], size: number): T[][];
			
 
				+/**
			
 
				+ * Sliding-window circuit breaker. Tracks the last N samples (min 4) over a
			
 
				+ * 60-second window; flips OPEN when failure rate exceeds 50%, then auto-
			
 
				+ * resets to HALF-OPEN after 5 minutes — at which point the next probe
			
 
				+ * decides whether to close (success) or re-open (failure).
			
 
				+ */
			
 
				+export declare class CircuitBreaker {
			
 
				+    private samples;
			
 
				+    private state;
			
 
				+    private openedAt;
			
 
				+    private readonly windowMs;
			
 
				+    private readonly openDurationMs;
			
 
				+    private readonly threshold;
			
 
				+    private readonly minSamples;
			
 
				+    private readonly now;
			
 
				+    constructor(opts?: {
			
 
				+        windowMs?: number;
			
 
				+        openDurationMs?: number;
			
 
				+        threshold?: number;
			
 
				+        minSamples?: number;
			
 
				+        now?: () => number;
			
 
				+    });
			
 
				+    getState(): CircuitState;
			
 
				+    /**
			
 
				+     * Returns true when calls should be short-circuited (skip HTTP, fall back).
			
 
				+     * Side-effects: may transition OPEN → HALF-OPEN if the open window expired.
			
 
				+     */
			
 
				+    shouldFailFast(): boolean;
			
 
				+    /** Record a successful call. */
			
 
				+    recordSuccess(): void;
			
 
				+    /** Record a failed call. May trigger OPEN. */
			
 
				+    recordFailure(): void;
			
 
				+    /** Force-reset the breaker (used by tests / admin) */
			
 
				+    reset(): void;
			
 
				+    private pushSample;
			
 
				+    private evaluate;
			
 
				+    private tickAutoReset;
			
 
				+}
			
 
				+/**
			
 
				+ * Raised when the circuit breaker is OPEN and a call is short-circuited.
			
 
				+ * Callers (e.g. fallback wrapper) can catch this to switch to local provider.
			
 
				+ */
			
 
				+export declare class CircuitOpenError extends Error {
			
 
				+    constructor(message?: string);
			
 
				+}
			
 
				+/**
			
 
				+ * Persistent (non-retryable) HTTP error from upstream. Includes status code.
			
 
				+ */
			
 
				+export declare class HttpError extends Error {
			
 
				+    readonly status: number;
			
 
				+    readonly bodyPreview: string;
			
 
				+    constructor(status: number, bodyPreview: string);
			
 
				+}
			
 
				+export declare class OpenAIEmbeddingsProvider implements EmbeddingProvider {
			
 
				+    readonly kind: ProviderKind;
			
 
				+    private readonly endpoint;
			
 
				+    private readonly apiKey?;
			
 
				+    private readonly modelId;
			
 
				+    private readonly upstreamModel;
			
 
				+    private readonly batchSize;
			
 
				+    private readonly timeoutMs;
			
 
				+    private readonly fetchImpl;
			
 
				+    private readonly retryBackoffsMs;
			
 
				+    private readonly sleep;
			
 
				+    private readonly now;
			
 
				+    private dimensions;
			
 
				+    readonly breaker: CircuitBreaker;
			
 
				+    constructor(config: OpenAIProviderConfig);
			
 
				+    getModelId(): string;
			
 
				+    getDimensions(): number | undefined;
			
 
				+    healthcheck(signal?: AbortSignal): Promise<ProviderHealth>;
			
 
				+    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
			
 
				+    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
			
 
				+    dispose(): Promise<void>;
			
 
				+    private buildHeaders;
			
 
				+    /**
			
 
				+     * Single HTTP request with retry on 429/503. Returns embeddings indexed
			
 
				+     * the same as `texts`. Throws on non-retryable failure or all attempts
			
 
				+     * exhausted.
			
 
				+     */
			
 
				+    private requestWithRetry;
			
 
				+    /**
			
 
				+     * Issue one HTTP attempt to `POST /v1/embeddings`. Does NOT retry.
			
 
				+     */
			
 
				+    private requestOnce;
			
 
				+}
			
--- a/dist/embedding/openai.js
+++ b/dist/embedding/openai.js
@@ -0,0 +1,477 @@
 
				+/**
			
 
				+ * openai.ts - OpenAI-compatible HTTP embedding provider
			
 
				+ *
			
 
				+ * Talks to any endpoint that implements `POST /v1/embeddings` with the OpenAI
			
 
				+ * shape: request `{model, input: string|string[]}`, response
			
 
				+ * `{data: [{embedding: number[], index: number}, ...]}`.
			
 
				+ *
			
 
				+ * Used by qmd to delegate embeddings to a GPU worker (e.g. ai.mm.mk →
			
 
				+ * qmd-embed-worker on `models` LXC, RTX 4090) instead of running
			
 
				+ * node-llama-cpp locally.
			
 
				+ *
			
 
				+ * Features:
			
 
				+ *   - Batches input in groups of ≤64 (configurable via QMD_EMBED_BATCH_SIZE)
			
 
				+ *   - Retries 429 / 503 with exponential backoff (1s, 4s, 16s)
			
 
				+ *   - 4xx (non-429) → no retry, count as failure
			
 
				+ *   - Circuit breaker: >50% failures in a 60s window → OPEN for 5 min,
			
 
				+ *     callers can use this to fall back to a local provider
			
 
				+ *   - Per-call timeout via AbortSignal (default QMD_EMBED_TIMEOUT_MS=30000)
			
 
				+ *   - Healthcheck via `GET /health` if available, else a probe embed call
			
 
				+ */
			
 
				+// ─────────────────────────── Configuration ───────────────────────────────────
			
 
				+/**
			
 
				+ * Default batch size — most OpenAI-compatible embedding endpoints accept up to
			
 
				+ * 2048 inputs per call but for memory and latency we cap at 64.
			
 
				+ */
			
 
				+export const DEFAULT_BATCH_SIZE = 64;
			
 
				+/**
			
 
				+ * Default per-request timeout (30 s). embeddinggemma-300M on RTX 4090 takes
			
 
				+ * <500ms per batch of 64 in practice; 30s is a safe upper bound.
			
 
				+ */
			
 
				+export const DEFAULT_TIMEOUT_MS = 30_000;
			
 
				+/**
			
 
				+ * Retry backoff schedule (ms) for 429/503 responses. 3 attempts total
			
 
				+ * (initial + 2 retries) — aligns with issue spec "1s/4s/16s".
			
 
				+ */
			
 
				+export const RETRY_BACKOFFS_MS = [1_000, 4_000, 16_000];
			
 
				+/**
			
 
				+ * Circuit breaker — flips OPEN when error rate exceeds threshold within
			
 
				+ * window. While OPEN, every call fails fast so the caller can fall back.
			
 
				+ */
			
 
				+export const CIRCUIT_WINDOW_MS = 60_000;
			
 
				+export const CIRCUIT_OPEN_DURATION_MS = 5 * 60_000;
			
 
				+export const CIRCUIT_FAILURE_RATE_THRESHOLD = 0.5;
			
 
				+export const CIRCUIT_MIN_SAMPLES = 4;
			
 
				+// ─────────────────────────── Helpers ─────────────────────────────────────────
			
 
				/**
				 * Promise-based delay built on `setTimeout`.
				 *
				 * @param {number} ms Delay in milliseconds.
				 * @returns {Promise<void>} Resolves once the timer fires.
				 */
				function defaultSleep(ms) {
				    return new Promise((done) => {
				        setTimeout(done, ms);
				    });
				}
			
 
				/**
				 * Create the AbortSignal used for one HTTP attempt.
				 *
				 * The returned signal aborts when either the per-attempt timeout elapses or
				 * the optional caller-supplied `userSignal` aborts (its reason is forwarded).
				 * Call `cleanup()` once the attempt has finished to clear the timer and
				 * detach the listener from `userSignal`.
				 *
				 * @param {AbortSignal|undefined} userSignal External cancellation signal, if any.
				 * @param {number} timeoutMs Per-attempt timeout in milliseconds.
				 * @returns {{signal: AbortSignal, cleanup: Function}} Merged signal and its cleanup hook.
				 */
				function buildAttemptSignal(userSignal, timeoutMs) {
				    const controller = new AbortController();
				    const timer = setTimeout(() => {
				        controller.abort(new Error(`Request timed out after ${timeoutMs}ms`));
				    }, timeoutMs);
				    // Don't keep the process alive just for this timer (Node timers expose unref).
				    const canUnref = typeof timer === "object" && timer !== null && "unref" in timer;
				    if (canUnref) {
				        timer.unref();
				    }
				    const forwardUserAbort = () => controller.abort(userSignal?.reason);
				    if (userSignal && userSignal.aborted) {
				        // Already cancelled before the attempt started.
				        controller.abort(userSignal.reason);
				    }
				    else if (userSignal) {
				        userSignal.addEventListener("abort", forwardUserAbort, { once: true });
				    }
				    const cleanup = () => {
				        clearTimeout(timer);
				        if (userSignal) {
				            userSignal.removeEventListener("abort", forwardUserAbort);
				        }
				    };
				    return { signal: controller.signal, cleanup };
				}
			
 
				/**
				 * Tell whether an HTTP status code should be retried.
				 *
				 * Only 429 (Too Many Requests) and 503 (Service Unavailable) qualify; every
				 * other status, including the remaining 4xx codes, is not retried.
				 *
				 * @param {number} status HTTP status code.
				 * @returns {boolean} True for 429 and 503.
				 */
				export function isRetryableStatus(status) {
				    switch (status) {
				        case 429:
				        case 503:
				            return true;
				        default:
				            return false;
				    }
				}
			
 
				/**
				 * Chunk an array into pieces of ≤ size each. `size` MUST be ≥ 1.
				 *
				 * A fractional `size` is rounded down so no chunk ever exceeds it. When
				 * everything fits in a single chunk, that chunk is the input array itself
				 * (not a copy).
				 *
				 * @param {Array} items Items to split.
				 * @param {number} size Maximum chunk length.
				 * @returns {Array<Array>} Chunks in input order; `[]` for empty input.
				 * @throws {Error} If `size` is below 1 or not a number.
				 */
				export function chunkArray(items, size) {
				    // `!(size >= 1)` also rejects NaN, which `size < 1` lets through — a NaN
				    // size used to yield `[[]]` and silently drop every item.
				    if (!(size >= 1))
				        throw new Error(`chunkArray: size must be ≥ 1, got ${size}`);
				    const step = Math.floor(size);
				    if (items.length <= step)
				        return items.length === 0 ? [] : [items];
				    const out = [];
				    for (let i = 0; i < items.length; i += step) {
				        out.push(items.slice(i, i + step));
				    }
				    return out;
				}
			
 
				+// ─────────────────────────── Circuit Breaker ─────────────────────────────────
			
 
				/**
				 * Sliding-window circuit breaker.
				 *
				 * Outcomes are recorded as timestamped samples and pruned to the configured
				 * window. While CLOSED, a failure that brings the windowed failure rate
				 * strictly above the threshold (with at least `minSamples` samples) flips the
				 * breaker OPEN. Once the open duration has elapsed the breaker is lazily
				 * moved to HALF-OPEN; the next recorded outcome then closes it (success) or
				 * re-opens it (failure).
				 */
				export class CircuitBreaker {
				    samples = [];
				    state = "closed";
				    openedAt = null;
				    windowMs;
				    openDurationMs;
				    threshold;
				    minSamples;
				    now;
				    /**
				     * @param {object} [opts] Optional overrides: `windowMs`, `openDurationMs`,
				     *   `threshold`, `minSamples`, and a `now` clock function; each falls
				     *   back to the module default (`Date.now` for the clock).
				     */
				    constructor(opts = {}) {
				        this.now = opts.now ?? Date.now;
				        this.minSamples = opts.minSamples ?? CIRCUIT_MIN_SAMPLES;
				        this.threshold = opts.threshold ?? CIRCUIT_FAILURE_RATE_THRESHOLD;
				        this.openDurationMs = opts.openDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
				        this.windowMs = opts.windowMs ?? CIRCUIT_WINDOW_MS;
				    }
				    /** Current state, after applying any pending OPEN → HALF-OPEN transition. */
				    getState() {
				        this.tickAutoReset();
				        return this.state;
				    }
				    /**
				     * Returns true when calls should be short-circuited (skip HTTP, fall back).
				     * Side-effects: may transition OPEN → HALF-OPEN if the open window expired.
				     */
				    shouldFailFast() {
				        const current = this.getState();
				        return current === "open";
				    }
				    /** Record a successful call. */
				    recordSuccess() {
				        // Apply the time-based OPEN → HALF-OPEN transition first; otherwise a
				        // success arriving after the open window expired would still observe
				        // "open" and could never close the breaker.
				        this.tickAutoReset();
				        this.pushSample(true);
				        if (this.state !== "half-open") {
				            return;
				        }
				        this.openedAt = null;
				        this.state = "closed";
				    }
				    /** Record a failed call. May trigger OPEN. */
				    recordFailure() {
				        // Lazy auto-reset before classifying the sample, as in recordSuccess.
				        this.tickAutoReset();
				        this.pushSample(false);
				        switch (this.state) {
				            case "half-open":
				                // Probe failed — re-open
				                this.state = "open";
				                this.openedAt = this.now();
				                break;
				            case "closed":
				                this.evaluate();
				                break;
				            default:
				                // Already open: the sample is recorded, nothing else changes.
				                break;
				        }
				    }
				    /** Force-reset the breaker (used by tests / admin) */
				    reset() {
				        this.openedAt = null;
				        this.state = "closed";
				        this.samples = [];
				    }
				    /** Append a sample stamped with the current time, then drop leading samples older than the window. */
				    pushSample(ok) {
				        const ts = this.now();
				        this.samples.push({ ts, ok });
				        const cutoff = ts - this.windowMs;
				        let staleCount = 0;
				        while (staleCount < this.samples.length && this.samples[staleCount].ts < cutoff) {
				            staleCount += 1;
				        }
				        if (staleCount > 0) {
				            this.samples.splice(0, staleCount);
				        }
				    }
				    /** Open the breaker when enough samples exist and the failure rate exceeds the threshold. */
				    evaluate() {
				        const total = this.samples.length;
				        if (total < this.minSamples) {
				            return;
				        }
				        let failed = 0;
				        for (const sample of this.samples) {
				            if (!sample.ok) {
				                failed += 1;
				            }
				        }
				        if (failed / total > this.threshold) {
				            this.state = "open";
				            this.openedAt = this.now();
				        }
				    }
				    /** Move OPEN → HALF-OPEN once the open duration has elapsed. */
				    tickAutoReset() {
				        if (this.state !== "open" || this.openedAt === null) {
				            return;
				        }
				        const elapsed = this.now() - this.openedAt;
				        if (elapsed >= this.openDurationMs) {
				            this.state = "half-open";
				        }
				    }
				}
			
 
				+// ─────────────────────────── Errors ──────────────────────────────────────────
			
 
				/**
				 * Thrown when a call is short-circuited because the circuit breaker is OPEN.
				 * Callers (for example a fallback wrapper) can catch it to switch to the
				 * local provider.
				 */
				export class CircuitOpenError extends Error {
				    // Own `name` property so the error identifies itself in logs.
				    name = "CircuitOpenError";
				    /**
				     * @param {string} [message] - defaults to the standard "circuit is OPEN" text
				     */
				    constructor(message = "OpenAIEmbeddingsProvider circuit is OPEN") {
				        super(message);
				    }
				}
			
 
				/**
				 * Persistent (non-retryable) HTTP error from upstream. Carries the status
				 * code and a truncated copy of the response body.
				 */
				export class HttpError extends Error {
				    /** HTTP status code of the failed response. */
				    status;
				    /** Response body, truncated to at most 1024 characters. */
				    bodyPreview;
				    /**
				     * @param {number} status - HTTP status code
				     * @param {string} [bodyPreview] - response body text; a missing or
				     *   non-string value is coerced to a string instead of throwing, so the
				     *   error can always be constructed
				     */
				    constructor(status, bodyPreview) {
				        // Normalise first: calling `.slice` on undefined/null would replace the
				        // HTTP failure we want to report with an unrelated TypeError.
				        const preview = typeof bodyPreview === "string" ? bodyPreview : String(bodyPreview ?? "");
				        // The message keeps only the first 200 characters to stay log-friendly.
				        super(`HTTP ${status}: ${preview.slice(0, 200)}`);
				        this.name = "HttpError";
				        this.status = status;
				        this.bodyPreview = preview.slice(0, 1024);
				    }
				}
			
 
				+// ─────────────────────────── Provider ────────────────────────────────────────
			
 
				/**
				 * Embedding provider backed by an OpenAI-compatible HTTP endpoint
				 * (`POST {endpoint}/v1/embeddings`).
				 *
				 * Requests are chunked by `batchSize`, retried through `requestWithRetry`,
				 * and guarded by a `CircuitBreaker`: while the breaker is OPEN, `embedBatch`
				 * throws `CircuitOpenError` instead of issuing HTTP calls.
				 */
				export class OpenAIEmbeddingsProvider {
				    kind = "openai";
				    endpoint;
				    apiKey;
				    modelId;
				    upstreamModel;
				    batchSize;
				    timeoutMs;
				    fetchImpl;
				    retryBackoffsMs;
				    sleep;
				    now;
				    dimensions = undefined;
				    breaker;
				    /**
				     * @param {object} config
				     * @param {string} config.endpoint - base URL; trailing slashes are stripped
				     * @param {string} [config.apiKey] - sent as a Bearer token when set
				     * @param {string} [config.modelId] - id reported to callers (default "embeddinggemma")
				     * @param {string} [config.upstreamModel] - model name sent upstream (default: modelId)
				     * @param {number} [config.batchSize] - texts per HTTP request
				     * @param {number} [config.timeoutMs] - per-attempt timeout
				     * @param {Function} [config.fetchImpl] - fetch replacement (default: global fetch)
				     * @param {number[]} [config.retryBackoffsMs] - delay before each retry
				     * @param {Function} [config.sleep] - delay function used between retries
				     * @param {() => number} [config.now] - clock shared with the circuit breaker
				     * @throws {Error} when endpoint is missing, no fetch is available, or
				     *   batchSize is not a number ≥ 1
				     */
				    constructor(config) {
				        if (!config.endpoint) {
				            throw new Error("OpenAIEmbeddingsProvider: endpoint is required");
				        }
				        this.endpoint = config.endpoint.replace(/\/+$/, "");
				        this.apiKey = config.apiKey;
				        this.modelId = config.modelId ?? "embeddinggemma";
				        this.upstreamModel = config.upstreamModel ?? this.modelId;
				        this.batchSize = config.batchSize ?? DEFAULT_BATCH_SIZE;
				        this.timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
				        this.fetchImpl = config.fetchImpl ?? globalThis.fetch;
				        this.retryBackoffsMs = config.retryBackoffsMs ?? RETRY_BACKOFFS_MS;
				        this.sleep = config.sleep ?? defaultSleep;
				        this.now = config.now ?? Date.now;
				        this.breaker = new CircuitBreaker({ now: this.now });
				        if (!this.fetchImpl) {
				            throw new Error("OpenAIEmbeddingsProvider: global fetch is unavailable. " +
				                "Provide a `fetchImpl` config option (Node ≥18 ships fetch by default).");
				        }
				        // `!(x >= 1)` also rejects NaN, which a plain `x < 1` comparison lets through.
				        if (!(this.batchSize >= 1)) {
				            throw new Error(`OpenAIEmbeddingsProvider: batchSize must be ≥ 1, got ${this.batchSize}`);
				        }
				    }
				    /** @returns {string} the model id reported to callers */
				    getModelId() {
				        return this.modelId;
				    }
				    /** @returns {number|undefined} vector length, undefined until the first successful embed */
				    getDimensions() {
				        return this.dimensions;
				    }
				    /**
				     * Check that the endpoint is usable. Never throws.
				     *
				     * Tries `GET {endpoint}/health` first. When that request throws, or when
				     * it answers 404/405/501 (the endpoint does not implement `/health`), a
				     * single embed probe is used instead. Any other non-OK status is reported
				     * as unhealthy without probing.
				     *
				     * @param {AbortSignal} [signal] - caller cancellation
				     * @returns {Promise<{ok: boolean, model: string, dimensions?: number, detail?: string}>}
				     */
				    async healthcheck(signal) {
				        let healthDetail;
				        try {
				            const { signal: attemptSig, cleanup } = buildAttemptSignal(signal, this.timeoutMs);
				            try {
				                const resp = await this.fetchImpl(`${this.endpoint}/health`, {
				                    method: "GET",
				                    headers: this.buildHeaders(),
				                    signal: attemptSig,
				                });
				                if (resp.ok) {
				                    return {
				                        ok: true,
				                        model: this.modelId,
				                        dimensions: this.dimensions,
				                        detail: `GET /health → ${resp.status}`,
				                    };
				                }
				                healthDetail = `GET /health → HTTP ${resp.status}`;
				                // Only "route not implemented" statuses justify falling back to a
				                // probe; anything else is a real answer from the server.
				                const routeMissing = resp.status === 404 || resp.status === 405 || resp.status === 501;
				                if (!routeMissing) {
				                    return {
				                        ok: false,
				                        model: this.modelId,
				                        detail: healthDetail,
				                    };
				                }
				            }
				            finally {
				                cleanup();
				            }
				        }
				        catch (err) {
				            healthDetail = err instanceof Error ? err.message : String(err);
				        }
				        return this.healthcheckViaEmbedProbe(signal, healthDetail);
				    }
				    /**
				     * Fallback health check: embed one short text and report the outcome.
				     * Never throws.
				     *
				     * @param {AbortSignal} [signal] - caller cancellation
				     * @param {string} healthDetail - why `/health` was not conclusive; prefixed
				     *   to the detail message if the probe itself throws
				     */
				    async healthcheckViaEmbedProbe(signal, healthDetail) {
				        try {
				            const probe = await this.embed("healthcheck", { signal });
				            if (probe) {
				                return {
				                    ok: true,
				                    model: this.modelId,
				                    dimensions: probe.embedding.length,
				                    detail: "embed probe ok",
				                };
				            }
				            return {
				                ok: false,
				                model: this.modelId,
				                detail: "embed probe returned null",
				            };
				        }
				        catch (probeErr) {
				            return {
				                ok: false,
				                model: this.modelId,
				                detail: healthDetail +
				                    " | probe: " +
				                    (probeErr instanceof Error ? probeErr.message : String(probeErr)),
				            };
				        }
				    }
				    /**
				     * Embed a single text.
				     *
				     * @returns {Promise<{embedding: number[], model: string}|null>} null when the text failed
				     * @throws {CircuitOpenError} when the breaker is OPEN
				     */
				    async embed(text, options = {}) {
				        const batch = await this.embedBatch([text], options);
				        return batch[0] ?? null;
				    }
				    /**
				     * Embed many texts. The result has one slot per input, in input order;
				     * slots of chunks that failed (or were skipped after an abort) are null.
				     *
				     * @param {string[]} texts
				     * @param {{model?: string, signal?: AbortSignal}} [options]
				     * @throws {CircuitOpenError} when the breaker is OPEN before or between chunks
				     */
				    async embedBatch(texts, options = {}) {
				        if (texts.length === 0)
				            return [];
				        if (this.breaker.shouldFailFast()) {
				            throw new CircuitOpenError();
				        }
				        const chunks = chunkArray(texts, this.batchSize);
				        const results = new Array(texts.length).fill(null);
				        let cursor = 0;
				        for (const chunk of chunks) {
				            const start = cursor;
				            cursor += chunk.length;
				            // Abort early if signal already fired; remaining slots stay null
				            // (caller treats them as errors).
				            if (options.signal?.aborted) {
				                return results;
				            }
				            // Fail-fast if breaker tripped mid-loop
				            if (this.breaker.shouldFailFast()) {
				                throw new CircuitOpenError();
				            }
				            try {
				                const embeddings = await this.requestWithRetry(chunk, options);
				                for (let i = 0; i < chunk.length; i++) {
				                    const embedding = embeddings[i];
				                    if (embedding) {
				                        results[start + i] = {
				                            embedding,
				                            model: this.modelId,
				                        };
				                        // Record dimensions on first success
				                        if (this.dimensions === undefined) {
				                            this.dimensions = embedding.length;
				                        }
				                    }
				                }
				                this.breaker.recordSuccess();
				            }
				            catch (err) {
				                // CircuitOpenError must propagate so the caller can fall back,
				                // and it must not be counted as an upstream failure.
				                if (err instanceof CircuitOpenError)
				                    throw err;
				                // A request cancelled by the caller says nothing about upstream
				                // health, so it must not push the breaker towards OPEN.
				                if (options.signal?.aborted) {
				                    return results;
				                }
				                this.breaker.recordFailure();
				                // Other errors mark the chunk as null and continue with next chunk.
				                // (The store layer already handles per-text nulls as errors.)
				                if (process.env.QMD_EMBED_DEBUG) {
				                    process.stderr.write(`OpenAIEmbeddingsProvider: chunk failed (${err instanceof Error ? err.message : String(err)})\n`);
				                }
				            }
				        }
				        return results;
				    }
				    /**
				     * Nothing to release for fetch; the breaker is reset so the instance
				     * starts from a clean state if it is used again.
				     */
				    async dispose() {
				        this.breaker.reset();
				    }
				    // ────────────────────── Internals ──────────────────────
				    /** Build request headers; adds a Bearer Authorization header when an apiKey is set. */
				    buildHeaders() {
				        const headers = {
				            "Content-Type": "application/json",
				            "Accept": "application/json",
				        };
				        if (this.apiKey) {
				            headers["Authorization"] = `Bearer ${this.apiKey}`;
				        }
				        return headers;
				    }
				    /**
				     * Issue one logical request, retrying when the failure is an `HttpError`
				     * whose status `isRetryableStatus` accepts. Returns embeddings indexed the
				     * same as `texts`.
				     *
				     * @throws the original error on a non-retryable failure, the last error
				     *   once all attempts are used, or Error("aborted by caller") when the
				     *   caller's signal is already aborted before an attempt
				     */
				    async requestWithRetry(texts, options) {
				        let lastErr = null;
				        const maxAttempts = this.retryBackoffsMs.length + 1;
				        for (let attempt = 0; attempt < maxAttempts; attempt++) {
				            // Honor user abort BEFORE issuing the call (avoids wasted network)
				            if (options.signal?.aborted) {
				                throw new Error("aborted by caller");
				            }
				            try {
				                return await this.requestOnce(texts, options);
				            }
				            catch (err) {
				                lastErr = err;
				                const retryable = err instanceof HttpError ? isRetryableStatus(err.status) : false;
				                if (!retryable)
				                    throw err;
				                // No backoff after the final attempt.
				                if (attempt < this.retryBackoffsMs.length) {
				                    await this.sleep(this.retryBackoffsMs[attempt]);
				                }
				            }
				        }
				        // Exhausted retries → throw the last error so caller marks the chunk null
				        throw lastErr ?? new Error("requestWithRetry exhausted");
				    }
				    /**
				     * Issue one HTTP attempt to `POST /v1/embeddings`. Does NOT retry.
				     *
				     * @returns {Promise<number[][]>} one embedding per input text, in input order
				     * @throws {HttpError} on a non-OK response
				     * @throws {Error} on malformed JSON, a missing "data" array, an invalid
				     *   item index, a non-array embedding, or a missing slot
				     */
				    async requestOnce(texts, options) {
				        const { signal: attemptSig, cleanup } = buildAttemptSignal(options.signal, this.timeoutMs);
				        try {
				            const body = JSON.stringify({
				                model: options.model ?? this.upstreamModel,
				                input: texts,
				            });
				            const resp = await this.fetchImpl(`${this.endpoint}/v1/embeddings`, {
				                method: "POST",
				                headers: this.buildHeaders(),
				                body,
				                signal: attemptSig,
				            });
				            if (!resp.ok) {
				                const text = await resp.text().catch(() => "");
				                throw new HttpError(resp.status, text);
				            }
				            let parsed;
				            try {
				                parsed = (await resp.json());
				            }
				            catch (err) {
				                throw new Error(`OpenAIEmbeddingsProvider: malformed JSON from ${this.endpoint}/v1/embeddings: ${err instanceof Error ? err.message : String(err)}`);
				            }
				            if (!parsed || !Array.isArray(parsed.data)) {
				                throw new Error(`OpenAIEmbeddingsProvider: response missing "data" array (got ${typeof parsed})`);
				            }
				            // Place each item by its index to match input order (in case the
				            // server returns out-of-order).
				            const out = new Array(texts.length);
				            for (const item of parsed.data) {
				                // `item?.index` keeps a null/primitive item from raising an
				                // unrelated TypeError; the integer check rejects NaN and
				                // fractional indices with the accurate message.
				                const index = item?.index;
				                if (!Number.isInteger(index) ||
				                    index < 0 ||
				                    index >= texts.length) {
				                    throw new Error(`OpenAIEmbeddingsProvider: data item index out of range (${index}, expected 0..${texts.length - 1})`);
				                }
				                if (!Array.isArray(item.embedding)) {
				                    throw new Error(`OpenAIEmbeddingsProvider: data[${index}].embedding is not an array`);
				                }
				                out[index] = item.embedding;
				            }
				            // Sanity check — every slot must be filled
				            for (let i = 0; i < texts.length; i++) {
				                if (!out[i]) {
				                    throw new Error(`OpenAIEmbeddingsProvider: response missing embedding for index ${i}`);
				                }
				            }
				            return out;
				        }
				        finally {
				            cleanup();
				        }
				    }
				}
			
--- a/dist/embedding/provider.d.ts
+++ b/dist/embedding/provider.d.ts
@@ -0,0 +1,109 @@
 
				+/**
			
 
				+ * provider.ts - Embedding provider abstraction
			
 
				+ *
			
 
				+ * Defines the EmbeddingProvider interface that allows qmd to use either:
			
 
				+ *   - LocalLlamaCppProvider (legacy, GGUF via node-llama-cpp)
			
 
				+ *   - OpenAIEmbeddingsProvider (HTTP, OpenAI-compatible endpoint like ai.mm.mk)
			
 
				+ *
			
 
				+ * The factory in `./factory.ts` selects an implementation based on env vars,
			
 
				+ * a CLI flag, or `~/.config/qmd/config.json`.
			
 
				+ */
			
 
				+/**
			
 
				+ * Single embedding result
			
 
				+ */
			
 
				+export type ProviderEmbedding = {
			
 
				+    embedding: number[];
			
 
				+    /** Model identifier used to produce this embedding (matches content_vectors.model in DB) */
			
 
				+    model: string;
			
 
				+};
			
 
				+/**
			
 
				+ * Supported provider kinds
			
 
				+ */
			
 
				+export type ProviderKind = "local" | "openai";
			
 
				+/**
			
 
				+ * Healthcheck result for provider startup verification
			
 
				+ */
			
 
				+export type ProviderHealth = {
			
 
				+    ok: boolean;
			
 
				+    /** Model identifier reported by the provider */
			
 
				+    model: string;
			
 
				+    /** Embedding dimensions (e.g. 768 for embeddinggemma-300M) */
			
 
				+    dimensions?: number;
			
 
				+    /** Detail message (error reason on failure, status on success) */
			
 
				+    detail?: string;
			
 
				+};
			
 
				+/**
			
 
				+ * Per-call options for provider embedding
			
 
				+ */
			
 
				+export type ProviderEmbedOptions = {
			
 
				+    /** Optional model id override (rare; usually provider has a fixed model) */
			
 
				+    model?: string;
			
 
				+    /** Abort signal for cancellation / timeout */
			
 
				+    signal?: AbortSignal;
			
 
				+};
			
 
				+/**
			
 
				+ * Provider interface — both LocalLlamaCppProvider and OpenAIEmbeddingsProvider implement this.
			
 
				+ *
			
 
				+ * Implementations MUST:
			
 
				+ *   - Return `null` (not throw) for individual texts that fail to embed;
			
 
				+ *     the caller will count it as an error and continue.
			
 
				+ *   - Honor `options.signal` for cancellation.
			
 
				+ *   - Be safe to call concurrently for `embedBatch`.
			
 
				+ */
			
 
				+export interface EmbeddingProvider {
			
 
				+    /** Provider kind tag — useful for logging and factory introspection */
			
 
				+    readonly kind: ProviderKind;
			
 
				+    /**
			
 
				+     * Stable model identifier reported to the caller.
			
 
				+     *
			
 
				+     * MUST match what's stored in `content_vectors.model` for the existing
			
 
				+     * index — otherwise the model-id guard refuses to embed.
			
 
				+     */
			
 
				+    getModelId(): string;
			
 
				+    /**
			
 
				+     * Embedding vector dimensions. May return `undefined` before the first call
			
 
				+     * (some providers probe lazily). Once known, MUST stay stable.
			
 
				+     */
			
 
				+    getDimensions(): number | undefined;
			
 
				+    /**
			
 
				+     * Healthcheck — verifies the provider is reachable and the model is loaded.
			
 
				+     * Should NOT throw — return `{ ok: false, detail: ... }` on failure.
			
 
				+     *
			
 
				+     * For HTTP providers: ping `/health` endpoint.
			
 
				+     * For local provider: ensure model loads.
			
 
				+     */
			
 
				+    healthcheck(signal?: AbortSignal): Promise<ProviderHealth>;
			
 
				+    /**
			
 
				+     * Embed a single text. Returns `null` on per-call failure.
			
 
				+     */
			
 
				+    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
			
 
				+    /**
			
 
				+     * Embed multiple texts in a batch (more efficient than calling `embed` N times).
			
 
				+     *
			
 
				+     * Output array length MUST equal input array length. Failed entries are `null`.
			
 
				+     * Implementations are responsible for chunking large batches per their
			
 
				+     * upstream limits (e.g. OpenAI provider chunks to 64).
			
 
				+     */
			
 
				+    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
			
 
				+    /** Release any held resources (HTTP keep-alive sockets, model handles, …) */
			
 
				+    dispose(): Promise<void>;
			
 
				+}
			
 
				+/**
			
 
				+ * Error thrown when the provider's reported model id does not match the
			
 
				+ * model id baked into existing `content_vectors` rows. Forces user to
			
 
				+ * re-embed (`qmd embed -f`) or pin the matching model id.
			
 
				+ */
			
 
				+export declare class ModelMismatchError extends Error {
			
 
				+    readonly providerModel: string;
			
 
				+    readonly existingModels: string[];
			
 
				+    constructor(providerModel: string, existingModels: string[]);
			
 
				+}
			
 
				+/**
			
 
				+ * Verify that the provider's model id is compatible with the existing
			
 
				+ * `content_vectors` entries. Pass-through (no-op) if the table is empty
			
 
				+ * (fresh DB) or if the model id appears in the distinct set.
			
 
				+ *
			
 
				+ * Caller passes `existingModels` (typically result of
			
 
				+ * `SELECT DISTINCT model FROM content_vectors`).
			
 
				+ */
			
 
				+export declare function assertModelCompatible(providerModel: string, existingModels: string[]): void;
			
--- a/dist/embedding/provider.js
+++ b/dist/embedding/provider.js
@@ -0,0 +1,46 @@
 
				+/**
			
 
				+ * provider.ts - Embedding provider abstraction
			
 
				+ *
			
 
				+ * Defines the EmbeddingProvider interface that allows qmd to use either:
			
 
				+ *   - LocalLlamaCppProvider (legacy, GGUF via node-llama-cpp)
			
 
				+ *   - OpenAIEmbeddingsProvider (HTTP, OpenAI-compatible endpoint like ai.mm.mk)
			
 
				+ *
			
 
				+ * The factory in `./factory.ts` selects an implementation based on env vars,
			
 
				+ * a CLI flag, or `~/.config/qmd/config.json`.
			
 
				+ */
			
 
				/**
				 * Raised when the provider's reported model id is not among the model ids
				 * already stored in `content_vectors`. The message tells the user to
				 * re-embed (`qmd embed -f`) or to pin the matching model id.
				 */
				export class ModelMismatchError extends Error {
				    /** Model id reported by the configured provider. */
				    providerModel;
				    /** Model ids found in the existing vectors. */
				    existingModels;
				    name = "ModelMismatchError";
				    /**
				     * @param {string} providerModel - model id reported by the provider
				     * @param {string[]} existingModels - distinct model ids already stored
				     */
				    constructor(providerModel, existingModels) {
				        const known = existingModels.join(", ");
				        // The first stored model is the one suggested for pinning; empty when none.
				        const suggested = existingModels[0] ?? "";
				        const sentences = [
				            `Embedding model mismatch: existing vectors use model(s) [${known}]`,
				            `but the configured provider reports "${providerModel}".`,
				            "Run `qmd embed -f` (or `--rebuild`) to re-embed everything with",
				            `the new model, or set QMD_EMBED_MODEL_ID="${suggested}"`,
				            "to keep the existing vectors.",
				        ];
				        super(sentences.join(" "));
				        this.providerModel = providerModel;
				        this.existingModels = existingModels;
				    }
				}
			
 
				/**
				 * Guard that the provider's model id can be used with the vectors already
				 * stored. Passes silently when nothing is stored yet (fresh DB) or when
				 * the id is one of the stored ids.
				 *
				 * @param {string} providerModel - model id reported by the provider
				 * @param {string[]} existingModels - distinct ids, typically the result of
				 *   `SELECT DISTINCT model FROM content_vectors`
				 * @throws {ModelMismatchError} when vectors exist and none use `providerModel`
				 */
				export function assertModelCompatible(providerModel, existingModels) {
				    const hasStoredVectors = existingModels.length !== 0;
				    const mismatch = hasStoredVectors && !existingModels.includes(providerModel);
				    if (mismatch) {
				        throw new ModelMismatchError(providerModel, existingModels);
				    }
				}
			
--- a/dist/index.d.ts
+++ b/dist/index.d.ts
@@ -24,6 +24,8 @@ export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES };
 
				 export type { ChunkStrategy } from "./store.js";
			
 
				 export { getDefaultDbPath } from "./store.js";
			
 
				 export { Maintenance } from "./maintenance.js";
			
 
				+export { createEmbeddingProvider, resolveProviderKind, LocalLlamaCppProvider, OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, ModelMismatchError, assertModelCompatible, type EmbeddingProvider, type ProviderKind, type ProviderEmbedding, type ProviderEmbedOptions, type ProviderHealth, type CreateEmbeddingProviderOptions, type OpenAIProviderConfig, type LocalLlamaCppProviderConfig, type EmbedProviderConfigFile, DEFAULT_BATCH_SIZE as DEFAULT_PROVIDER_BATCH_SIZE, DEFAULT_TIMEOUT_MS as DEFAULT_PROVIDER_TIMEOUT_MS, RETRY_BACKOFFS_MS as PROVIDER_RETRY_BACKOFFS_MS, } from "./embedding/index.js";
			
 
				+export { getDistinctEmbeddingModels } from "./store.js";
			
 
				 /**
			
 
				  * Progress info emitted during update() for each file processed.
			
 
				  */
			
--- a/dist/index.js
+++ b/dist/index.js
@@ -25,6 +25,12 @@ export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES };
 
				 export { getDefaultDbPath } from "./store.js";
			
 
				 // Re-export Maintenance class for CLI housekeeping operations
			
 
				 export { Maintenance } from "./maintenance.js";
			
 
				+// Re-export embedding provider abstraction for SDK consumers (i-qkarfffa).
			
 
				+// `createEmbeddingProvider` honors QMD_EMBED_ENDPOINT / config-file / kind
			
 
				+// arg precedence; default fallback is the legacy LocalLlamaCppProvider so
			
 
				+// SDK code that doesn't pass `embedProvider` keeps the prior behavior.
			
 
				+export { createEmbeddingProvider, resolveProviderKind, LocalLlamaCppProvider, OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, ModelMismatchError, assertModelCompatible, DEFAULT_BATCH_SIZE as DEFAULT_PROVIDER_BATCH_SIZE, DEFAULT_TIMEOUT_MS as DEFAULT_PROVIDER_TIMEOUT_MS, RETRY_BACKOFFS_MS as PROVIDER_RETRY_BACKOFFS_MS, } from "./embedding/index.js";
			
 
				+export { getDistinctEmbeddingModels } from "./store.js";
			
 
				 /**
			
 
				  * Create a QMD store for programmatic access to search and indexing.
			
 
				  *
			
--- a/dist/store.d.ts
+++ b/dist/store.d.ts
@@ -13,6 +13,7 @@
 
				 import type { Database } from "./db.js";
			
 
				 import { LlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, type ILLMSession } from "./llm.js";
			
 
				 import type { NamedCollection, Collection, CollectionConfig } from "./collections.js";
			
 
				+import { type EmbeddingProvider } from "./embedding/provider.js";
			
 
				 export declare const DEFAULT_EMBED_MODEL = "embeddinggemma";
			
 
				 export declare const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
			
 
				 export declare const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
			
@@ -336,6 +337,16 @@ export type EmbedOptions = {
 
				     maxBatchBytes?: number;
			
 
				     chunkStrategy?: ChunkStrategy;
			
 
				     onProgress?: (info: EmbedProgress) => void;
			
 
				+    /**
			
 
				+     * Optional embedding provider. When supplied, embeddings are routed through
			
 
				+     * this provider (HTTP, GPU worker, etc.) instead of the local llama.cpp
			
 
				+     * session path. The provider's `getModelId()` is verified against existing
			
 
				+     * `content_vectors.model` rows; mismatch throws unless `force` is set.
			
 
				+     *
			
 
				+     * When omitted, behavior is identical to pre-patch: embeddings come from
			
 
				+     * the store's `LlamaCpp` (or the global singleton).
			
 
				+     */
			
 
				+    embedProvider?: EmbeddingProvider;
			
 
				 };
			
 
				 /**
			
 
				  * Generate vector embeddings for documents that need them.
			
@@ -713,6 +724,17 @@ export declare function getHashesForEmbedding(db: Database): {
 
				  * Deletes all rows from content_vectors and drops the vectors_vec table.
			
 
				  */
			
 
				 export declare function clearAllEmbeddings(db: Database): void;
			
 
				+/**
			
 
				+ * Get the distinct set of model identifiers present in `content_vectors`.
			
 
				+ *
			
 
				+ * Used by the embedding migration-safety guard: if a configured provider's
			
 
				+ * `getModelId()` does not appear in this list (and the table is non-empty),
			
 
				+ * we refuse to embed and ask the user to run `qmd embed -f` to rebuild.
			
 
				+ *
			
 
				+ * Returns `[]` when the table is empty (fresh DB) — in which case any
			
 
				+ * provider is allowed.
			
 
				+ */
			
 
				+export declare function getDistinctEmbeddingModels(db: Database): string[];
			
 
				 /**
			
 
				  * Insert a single embedding into both content_vectors and vectors_vec tables.
			
 
				  * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
			
--- a/dist/store.js
+++ b/dist/store.js
@@ -17,6 +17,7 @@ import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
 
				 // Note: node:path resolve is not imported — we export our own cross-platform resolve()
			
 
				 import fastGlob from "fast-glob";
			
 
				 import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, } from "./llm.js";
			
 
				+import { assertModelCompatible, } from "./embedding/provider.js";
			
 
				 // =============================================================================
			
 
				 // Configuration
			
 
				 // =============================================================================
			
@@ -1007,6 +1008,49 @@ function getEmbeddingDocsForBatch(db, batch) {
 
				         body: bodyByHash.get(doc.hash) ?? "",
			
 
				     }));
			
 
				 }
			
 
				/**
				 * Run `body` with a session-shaped argument exposing `signal` and `isValid`.
				 *
				 * Without a `provider`, the call is delegated unchanged to
				 * `withLLMSessionForLlm(getLlm(store), body, options)`, so `body` receives
				 * a real LLM session for the local path.
				 *
				 * With a `provider`, neither `getLlm` nor `withLLMSessionForLlm` is called.
				 * `body` receives a lightweight stub backed by an AbortController; the
				 * controller is aborted once `body` settles, which flips `isValid` to
				 * false. The stub's embed/embedBatch/expandQuery/rerank reject if called,
				 * because embedding is expected to go through the provider instead.
				 *
				 * @param store - store passed to `getLlm` on the local path
				 * @param provider - embedding provider, or a falsy value for the local path
				 * @param body - async callback receiving the session
				 * @param options - forwarded to `withLLMSessionForLlm` on the local path only
				 * @returns whatever `body` resolves to
				 */
				async function withEmbedSession(store, provider, body, options) {
				    if (!provider) {
				        return withLLMSessionForLlm(getLlm(store), body, options);
				    }
				    const controller = new AbortController();
				    // Shared failure path for the LLM-only methods of the stub session.
				    const refuse = (method) => {
				        throw new Error(`withEmbedSession: provider supplied — session.${method} must not be called`);
				    };
				    const stubSession = {
				        get signal() {
				            return controller.signal;
				        },
				        get isValid() {
				            return !controller.signal.aborted;
				        },
				        embed: async () => refuse("embed"),
				        embedBatch: async () => refuse("embedBatch"),
				        expandQuery: async () => refuse("expandQuery"),
				        rerank: async () => refuse("rerank"),
				    };
				    try {
				        return await body(stubSession);
				    }
				    finally {
				        // Invalidate the stub whether `body` resolved or rejected.
				        controller.abort();
				    }
				}
			
 
				 /**
			
 
				  * Generate vector embeddings for documents that need them.
			
 
				  * Pure function — no console output, no db lifecycle management.
			
@@ -1018,6 +1062,14 @@ export async function generateEmbeddings(store, options) {
 
				     const now = new Date().toISOString();
			
 
				     const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
			
 
				     const encoder = new TextEncoder();
			
 
				+    // Migration safety: if an embedProvider is supplied, verify its model id
			
 
				+    // matches the models already recorded in content_vectors. The check is
			
 
				+    // skipped when `force` is set: the existing vectors are cleared just
			
 
				+    // below, so a model switch cannot leave mixed-model rows behind.
			
 
				+    if (options?.embedProvider && !options.force) {
			
 
				+        const existing = getDistinctEmbeddingModels(db);
			
 
				+        assertModelCompatible(options.embedProvider.getModelId(), existing);
			
 
				+    }
			
 
				     if (options?.force) {
			
 
				         clearAllEmbeddings(db);
			
 
				     }
			
@@ -1046,11 +1098,23 @@ export async function generateEmbeddings(store, options) {
 
				         // global strategy — no collection overrides. Keeps SDK/inline
			
 
				         // callers that never touch ~/.config/qmd working.
			
 
				     }
			
 
				-    // Use store's LlamaCpp or global singleton, wrapped in a session
			
 
				-    const llm = getLlm(store);
			
 
				-    const embedModelUri = llm.embedModelName;
			
 
				-    // Create a session manager for this llm instance
			
 
				-    const result = await withLLMSessionForLlm(llm, async (session) => {
			
 
				+    // Provider routing — when an EmbeddingProvider is supplied, embed calls go
			
 
				+    // through it (HTTP, GPU worker, etc.). Otherwise, use the LLM session path.
			
 
				+    // The outer session is still created for its abort signal (chunking uses
			
 
				+    // `session.signal` for cooperative cancellation).
			
 
				+    const provider = options?.embedProvider;
			
 
				+    const providerModel = provider?.getModelId() ?? model;
			
 
				+    // Resolve `embedModelUri` (used for formatting prefixes etc.) lazily —
			
 
				+    // when `provider` is set, take it from the provider; otherwise fall back
			
 
				+    // to the local LlamaCpp's embed model name. Accessing `getLlm(store)` is
			
 
				+    // deferred to the non-provider branch so remote-only deployments do not
			
 
				+    // construct a `LlamaCpp` instance just to read its embedModelName.
			
 
				+    const embedModelUri = provider
			
 
				+        ? provider.getModelId()
			
 
				+        : getLlm(store).embedModelName;
			
 
				+    // Run the embedding loop inside a session-scoped wrapper. When `provider`
			
 
				+    // is set, this short-circuits the local LLM warm-up entirely (i-08ovbvtb).
			
 
				+    const result = await withEmbedSession(store, provider, async (session) => {
			
 
				         let chunksEmbedded = 0;
			
 
				         let errors = 0;
			
 
				         let bytesProcessed = 0;
			
@@ -1058,6 +1122,25 @@ export async function generateEmbeddings(store, options) {
 
				         let vectorTableInitialized = false;
			
 
				         const BATCH_SIZE = 32;
			
 
				         const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
			
 
				+        // Embedding helpers — single point of provider/session selection.
			
 
				+        // Both return the same shape as ILLMSession.embed/embedBatch so the
			
 
				+        // rest of the loop is unchanged.
			
 
				+        const embedOne = async (text, modelArg) => {
			
 
				+            if (provider) {
			
 
				+                const sig = provider.kind === 'local' ? session.signal : undefined;
			
 
				+                const r = await provider.embed(text, { model: modelArg, signal: sig });
			
 
				+                return r ? { embedding: r.embedding, model: r.model } : null;
			
 
				+            }
			
 
				+            return session.embed(text, { model: modelArg });
			
 
				+        };
			
 
				+        const embedMany = async (texts, modelArg) => {
			
 
				+            if (provider) {
			
 
				+                const sig = provider.kind === 'local' ? session.signal : undefined;
			
 
				+                const r = await provider.embedBatch(texts, { model: modelArg, signal: sig });
			
 
				+                return r.map((x) => (x ? { embedding: x.embedding, model: x.model } : null));
			
 
				+            }
			
 
				+            return session.embedBatch(texts, { model: modelArg });
			
 
				+        };
			
 
				         for (const batchMeta of batches) {
			
 
				             // Abort early if session has been invalidated
			
 
				             if (!session.isValid) {
			
@@ -1095,7 +1178,7 @@ export async function generateEmbeddings(store, options) {
 
				             if (!vectorTableInitialized) {
			
 
				                 const firstChunk = batchChunks[0];
			
 
				                 const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
			
 
				-                const firstResult = await session.embed(firstText, { model });
			
 
				+                const firstResult = await embedOne(firstText, providerModel);
			
 
				                 if (!firstResult) {
			
 
				                     throw new Error("Failed to get embedding dimensions from first chunk");
			
 
				                 }
			
@@ -1124,12 +1207,12 @@ export async function generateEmbeddings(store, options) {
 
				                 const chunkBatch = batchChunks.slice(batchStart, batchEnd);
			
 
				                 const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
			
 
				                 try {
			
 
				-                    const embeddings = await session.embedBatch(texts, { model });
			
 
				+                    const embeddings = await embedMany(texts, providerModel);
			
 
				                     for (let i = 0; i < chunkBatch.length; i++) {
			
 
				                         const chunk = chunkBatch[i];
			
 
				                         const embedding = embeddings[i];
			
 
				                         if (embedding) {
			
 
				-                            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
			
 
				+                            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), providerModel, now);
			
 
				                             chunksEmbedded++;
			
 
				                         }
			
 
				                         else {
			
@@ -1149,9 +1232,9 @@ export async function generateEmbeddings(store, options) {
 
				                         for (const chunk of chunkBatch) {
			
 
				                             try {
			
 
				                                 const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
			
 
				-                                const result = await session.embed(text, { model });
			
 
				+                                const result = await embedOne(text, providerModel);
			
 
				                                 if (result) {
			
 
				-                                    insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
			
 
				+                                    insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), providerModel, now);
			
 
				                                     chunksEmbedded++;
			
 
				                                 }
			
 
				                                 else {
			
@@ -2518,6 +2601,20 @@ export function clearAllEmbeddings(db) {
 
				     db.exec(`DELETE FROM content_vectors`);
			
 
				     db.exec(`DROP TABLE IF EXISTS vectors_vec`);
			
 
				 }
			
 
				+/**
			
 
				+ * Get the distinct set of model identifiers present in `content_vectors`.
			
 
				+ *
			
 
				+ * Used by the embedding migration-safety guard: if a configured provider's
			
 
				+ * `getModelId()` does not appear in this list (and the list is non-empty),
			
 
				+ * we refuse to embed and ask the user to run `qmd embed -f` to rebuild.
			
 
				+ *
			
 
				+ * Returns `[]` when no row carries a non-empty model id (e.g. a fresh
			
 
				+ * DB) — in which case any provider is allowed.
			
 
				+ */
			
 
				+export function getDistinctEmbeddingModels(db) {
			
 
				+    const rows = db.prepare(`SELECT DISTINCT model FROM content_vectors WHERE model IS NOT NULL`).all();
			
 
				+    return rows.map((r) => r.model).filter((m) => typeof m === "string" && m.length > 0);
			
 
				+}
			
 
				 /**
			
 
				  * Insert a single embedding into both content_vectors and vectors_vec tables.
			
 
				  * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
			
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -199,6 +199,7 @@ const c = {
 
				   green: useColor ? "\x1b[32m" : "",
			
 
				   magenta: useColor ? "\x1b[35m" : "",
			
 
				   blue: useColor ? "\x1b[34m" : "",
			
 
				+  red: useColor ? "\x1b[31m" : "",
			
 
				 };
			
 
				 
			
 
				 // Terminal cursor control