Explorar el Código

fix(cli): add missing `red` field to color palette + ship dist (i-08ovbvtb)

The `c.red` palette entry was referenced at `src/cli/qmd.ts:3199` (in the
ModelMismatchError friendly-output branch added by the i-qkarfffa Stage-3
work) but never added to the palette object's literal definition at line
193. TypeScript flagged this on every typecheck since 0463dd5.

Also commits the previously-untracked `dist/embedding/` directory. The
i-qkarfffa Stage-3 commit added `src/embedding/{provider,openai,local,
factory,autofallback,index}.ts` and `dist/store.js` imports
`./embedding/provider.js` at runtime — but `dist/embedding/` itself was
never committed, so consumers of `@oivo/qmd/dist/index.js` would have
hit a `MODULE_NOT_FOUND` at import time. This commit ships the compiled
artefacts alongside the i-08ovbvtb refactor of `dist/store.js`.

Changes:
  * src/cli/qmd.ts: +1 line (`red: useColor ? "\x1b[31m" : ""`)
  * dist/cli/qmd.js, dist/index.{d.ts,js}, dist/store.{d.ts,js}: regenerated
    by the build script (run via `npm run`) to reflect i-08ovbvtb's
    `withEmbedSession` helper + palette fix (regen, not hand-edit)
  * dist/embedding/{provider,openai,local,factory,autofallback,index}
    .{d.ts,js}: newly committed (existed in src/ since 0463dd5 but
    weren't tracked in dist/)

Verification:
  * Typecheck inside vendor/qmd (`npm run` with the build script) exits
    0 — all pre-existing TS errors resolved
  * dist/cli/qmd.js: `red: useColor ? "\x1b[31m" : ""` present at line 99
  * dist/store.js: `withEmbedSession` helper present at line 1026

Generated with [Claude Code](https://claude.ai/code)
via [Oivo](https://oivo.com)

Co-Authored-By: Claude <noreply@anthropic.com>
Session-Id: 6309e407
root hace 3 semanas
padre
commit
66cbadc06c

+ 96 - 1
dist/cli/qmd.js

@@ -12,6 +12,7 @@ import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCp
 import { formatSearchResults, formatDocuments, escapeXml, escapeCSV, } from "./formatter.js";
 import { getCollection as getCollectionFromYaml, listCollections as yamlListCollections, getDefaultCollectionNames, addContext as yamlAddContext, removeContext as yamlRemoveContext, removeCollection as yamlRemoveCollectionFn, renameCollection as yamlRenameCollectionFn, setGlobalContext, listAllContexts, setConfigIndexName, loadConfig, } from "../collections.js";
 import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedded-skills.js";
+import { createEmbeddingProvider, resolveProviderKind, ModelMismatchError, } from "../embedding/index.js";
 // Enable production mode - allows using default database path
 // Tests must set INDEX_PATH or use createStore() with explicit path
 enableProductionMode();
@@ -95,6 +96,7 @@ const c = {
     green: useColor ? "\x1b[32m" : "",
     magenta: useColor ? "\x1b[35m" : "",
     blue: useColor ? "\x1b[34m" : "",
+    red: useColor ? "\x1b[31m" : "",
 };
 // Terminal cursor control
 const cursor = {
@@ -1419,6 +1421,59 @@ function parseChunkStrategy(value) {
         return s;
     throw new Error(`--chunk-strategy must be "auto", "regex", or "function" (got "${s}")`);
 }
+function parseProviderKind(value) {
+    if (value === undefined)
+        return undefined;
+    const s = String(value).toLowerCase();
+    if (s === "local" || s === "openai")
+        return s;
+    throw new Error(`--provider must be "local" or "openai" (got "${s}")`);
+}
+function parseOptionalPositiveInt(name, value) {
+    if (value === undefined)
+        return undefined;
+    const parsed = Number(value);
+    if (!Number.isInteger(parsed) || parsed < 1) {
+        throw new Error(`${name} must be a positive integer`);
+    }
+    return parsed;
+}
+/**
+ * Translate `cli.values` into `CreateEmbeddingProviderOptions`. CLI flags
+ * win over env vars (the factory itself reads env when these are unset).
+ */
+function buildProviderOpts(values, providerCliKind) {
+    const endpoint = optionalString(values["embed-endpoint"]);
+    const apiKey = optionalString(values["embed-api-key"]);
+    const modelId = optionalString(values["embed-model-id"]);
+    const upstreamModel = optionalString(values["embed-upstream-model"]);
+    const batchSize = parseOptionalPositiveInt("--embed-batch-size", values["embed-batch-size"]);
+    const timeoutMs = parseOptionalPositiveInt("--embed-timeout-ms", values["embed-timeout-ms"]);
+    // Only build the openai overrides object if the user supplied flags
+    const openai = endpoint || apiKey || modelId || upstreamModel || batchSize !== undefined || timeoutMs !== undefined
+        ? {
+            ...(endpoint !== undefined ? { endpoint } : {}),
+            ...(apiKey !== undefined ? { apiKey } : {}),
+            ...(modelId !== undefined ? { modelId } : {}),
+            ...(upstreamModel !== undefined ? { upstreamModel } : {}),
+            ...(batchSize !== undefined ? { batchSize } : {}),
+            ...(timeoutMs !== undefined ? { timeoutMs } : {}),
+        }
+        : undefined;
+    // CLI flag for auto-fallback wrapping (only meaningful when kind === openai)
+    const autoFallback = values["embed-auto-fallback"] === true ? true : undefined;
+    return {
+        ...(providerCliKind ? { kind: providerCliKind } : {}),
+        ...(openai ? { openai } : {}),
+        ...(autoFallback !== undefined ? { autoFallback } : {}),
+    };
+}
+function optionalString(v) {
+    if (v === undefined || v === null)
+        return undefined;
+    const s = String(v);
+    return s === "" ? undefined : s;
+}
 async function vectorIndex(model = DEFAULT_EMBED_MODEL_URI, force = false, batchOptions) {
     const storeInstance = getStore();
     const db = storeInstance.db;
@@ -1433,6 +1488,14 @@ async function vectorIndex(model = DEFAULT_EMBED_MODEL_URI, force = false, batch
         return;
     }
     console.log(`${c.dim}Model: ${model}${c.reset}\n`);
+    if (batchOptions?.embedProvider) {
+        const kind = batchOptions.embedProvider.kind;
+        const providerModel = batchOptions.embedProvider.getModelId();
+        console.log(`${c.dim}Provider: ${kind} (model id "${providerModel}")${c.reset}\n`);
+    }
+    else if (batchOptions?.providerKind) {
+        console.log(`${c.dim}Provider: ${batchOptions.providerKind}${c.reset}\n`);
+    }
     if (batchOptions?.maxDocsPerBatch !== undefined || batchOptions?.maxBatchBytes !== undefined) {
         const maxDocsPerBatch = batchOptions.maxDocsPerBatch ?? DEFAULT_EMBED_MAX_DOCS_PER_BATCH;
         const maxBatchBytes = batchOptions.maxBatchBytes ?? DEFAULT_EMBED_MAX_BATCH_BYTES;
@@ -1447,6 +1510,7 @@ async function vectorIndex(model = DEFAULT_EMBED_MODEL_URI, force = false, batch
         maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
         maxBatchBytes: batchOptions?.maxBatchBytes,
         chunkStrategy: batchOptions?.chunkStrategy,
+        embedProvider: batchOptions?.embedProvider,
         onProgress: (info) => {
             if (info.totalBytes === 0)
                 return;
@@ -2109,6 +2173,14 @@ function parseCLI() {
             force: { type: "boolean", short: "f" },
             "max-docs-per-batch": { type: "string" },
             "max-batch-mb": { type: "string" },
+            provider: { type: "string" }, // "local" | "openai"
+            "embed-endpoint": { type: "string" }, // OpenAI-compatible endpoint URL
+            "embed-api-key": { type: "string" }, // Bearer token
+            "embed-model-id": { type: "string" }, // Stable model id (default: embeddinggemma)
+            "embed-upstream-model": { type: "string" }, // Upstream model name in HTTP body
+            "embed-batch-size": { type: "string" }, // Batch size for HTTP provider
+            "embed-timeout-ms": { type: "string" }, // Per-request timeout
+            "embed-auto-fallback": { type: "boolean" }, // Wrap openai in AutoFallback (local fallback)
             // Update options
             pull: { type: "boolean" }, // git pull before update
             refresh: { type: "boolean" },
@@ -2314,6 +2386,14 @@ function showHelp() {
     console.log("  qmd embed [-f]                - Generate/refresh vector embeddings");
     console.log("    --max-docs-per-batch <n>    - Cap docs loaded into memory per embedding batch");
     console.log("    --max-batch-mb <n>          - Cap UTF-8 MB loaded into memory per embedding batch");
+    console.log("    --provider {local,openai}   - Embedding backend (default: local llama.cpp)");
+    console.log("    --embed-endpoint <url>      - OpenAI-compatible endpoint (or QMD_EMBED_ENDPOINT)");
+    console.log("    --embed-api-key <key>       - Bearer token (or QMD_EMBED_API_KEY)");
+    console.log("    --embed-model-id <id>       - Stable model id stored in DB (default: embeddinggemma)");
+    console.log("    --embed-upstream-model <m>  - Model name sent in HTTP body (default: same as model-id)");
+    console.log("    --embed-batch-size <n>      - Batch size for HTTP provider (default: 64)");
+    console.log("    --embed-timeout-ms <n>      - Per-request timeout in ms (default: 30000)");
+    console.log("    --embed-auto-fallback       - Wrap openai provider in local fallback (or QMD_EMBED_AUTO_FALLBACK)");
     console.log("  qmd cleanup                   - Clear caches, vacuum DB");
     console.log("");
     console.log("Query syntax (qmd query):");
@@ -2669,14 +2749,29 @@ if (isMain) {
                 const maxDocsPerBatch = parseEmbedBatchOption("maxDocsPerBatch", cli.values["max-docs-per-batch"]);
                 const maxBatchMb = parseEmbedBatchOption("maxBatchBytes", cli.values["max-batch-mb"]);
                 const embedChunkStrategy = parseChunkStrategy(cli.values["chunk-strategy"]);
+                // Build embedding provider from CLI flags + env + config file.
+                // Backward compat: with no flags / env vars, the factory returns
+                // a LocalLlamaCppProvider that delegates to the default LlamaCpp
+                // singleton — identical to pre-patch behavior.
+                const providerCliKind = parseProviderKind(cli.values["provider"]);
+                const providerOpts = buildProviderOpts(cli.values, providerCliKind);
+                const embedProvider = createEmbeddingProvider(providerOpts);
                 await vectorIndex(DEFAULT_EMBED_MODEL_URI, !!cli.values.force, {
                     maxDocsPerBatch,
                     maxBatchBytes: maxBatchMb === undefined ? undefined : maxBatchMb * 1024 * 1024,
                     chunkStrategy: embedChunkStrategy,
+                    embedProvider,
+                    providerKind: embedProvider.kind,
                 });
             }
             catch (error) {
-                console.error(error instanceof Error ? error.message : String(error));
+                if (error instanceof ModelMismatchError) {
+                    // Friendlier output for the migration-safety guard
+                    console.error(`${c.red}Model mismatch:${c.reset} ${error.message}`);
+                }
+                else {
+                    console.error(error instanceof Error ? error.message : String(error));
+                }
                 process.exit(1);
             }
             break;

+ 84 - 0
dist/embedding/autofallback.d.ts

@@ -0,0 +1,84 @@
+/**
+ * autofallback.ts - AutoFallbackEmbeddingProvider.
+ *
+ * Composes a primary `EmbeddingProvider` (typically `OpenAIEmbeddingsProvider`)
+ * and a fallback (typically `LocalLlamaCppProvider`). When the primary trips
+ * its circuit breaker — or when persistent failures cross a threshold — calls
+ * are routed to the fallback. After a recovery cooldown, the primary is
+ * probed again; success closes the breaker and routing returns.
+ *
+ * Acceptance criterion 4 from i-qkarfffa: "Endpoint down → fallback local + WARN".
+ *
+ * Behavior summary:
+ *   - Primary call succeeds → return; record success.
+ *   - Primary throws CircuitOpenError → fall back, log WARN once per transition.
+ *   - Primary throws any other error → fall back for THIS call only;
+ *     count toward the failure-streak threshold.
+ *   - When failure streak crosses threshold (default 3) → set our own
+ *     "open until" timestamp; until expiry, route directly to fallback
+ *     (skip primary entirely).
+ *   - On expiry, retry primary opportunistically.
+ *   - getModelId always reports the primary's id; getDimensions uses the
+ *     primary's value, else the fallback's; dispose disposes both providers.
+ */
+import type { EmbeddingProvider, ProviderEmbedOptions, ProviderEmbedding, ProviderHealth, ProviderKind } from "./provider.js";
+export type AutoFallbackProviderConfig = {
+    primary: EmbeddingProvider; // Required. Tried first unless a cooldown is active.
+    fallback: EmbeddingProvider; // Required. Must not be the same instance as `primary` (the constructor throws).
+    /**
+     * Number of consecutive non-CircuitOpenError failures before we suppress
+     * primary calls and route directly to fallback. Default: 3.
+     */
+    failureStreakThreshold?: number;
+    /**
+     * Time in ms to keep routing through fallback after the breaker opens.
+     * Default: 5 minutes (matches `OpenAIEmbeddingsProvider`'s circuit duration).
+     */
+    cooldownMs?: number;
+    /**
+     * Optional WARN sink. Defaults to writing to `process.stderr` once per
+     * routing transition (closed→open and open→closed).
+     */
+    warn?: (msg: string) => void;
+    /** Custom clock for tests. Defaults to `Date.now`. */
+    now?: () => number;
+};
+export type FallbackState = "primary" | "fallback"; // routing target reported by getRoutingState()
+export declare class AutoFallbackEmbeddingProvider implements EmbeddingProvider {
+    readonly kind: ProviderKind; // copied from the primary provider's `kind`
+    readonly primary: EmbeddingProvider;
+    readonly fallback: EmbeddingProvider;
+    private readonly failureStreakThreshold;
+    private readonly cooldownMs;
+    private readonly warn;
+    private readonly now;
+    private failureStreak; // consecutive primary failures that were not CircuitOpenError; reset on primary success
+    private fallbackUntil; // clock value until which the primary is skipped, or null when no cooldown was opened
+    private lastTransitionState; // last routing target recorded by transition(); gates the "recovered" WARN
+    constructor(config: AutoFallbackProviderConfig);
+    /**
+     * Stable model id reported by the primary. The model-id guard runs against
+     * the primary's id because that's what callers actually want when the
+     * remote endpoint is online; on fallback-only operation, the local
+     * provider should report a compatible id (in the default config, both
+     * report "embeddinggemma" so this is moot).
+     */
+    getModelId(): string;
+    getDimensions(): number | undefined; // primary's dimensions, else the fallback's
+    /**
+     * Current routing state (mostly for tests + observability).
+     * Reports "fallback" only while a cooldown is active; a one-off fallback
+     * for a single failed call is not reflected here.
+     */
+    getRoutingState(): FallbackState;
+    /** Reset failure-streak + cooldown (mostly for tests / admin) */
+    reset(): void;
+    healthcheck(signal?: AbortSignal): Promise<ProviderHealth>; // primary's health when ok; otherwise `ok` reflects the fallback and `detail` combines both
+    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>; // rejects with the fallback's error when both providers fail
+    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>; // [] for empty input; one null per text when both providers fail
+    dispose(): Promise<void>; // disposes both providers and waits for both to settle
+    /**
+     * Generic dispatcher: try primary if not in cooldown, fall back on
+     * `CircuitOpenError`, count other errors against the failure streak.
+     * `op` is invoked with whichever provider is selected.
+     */
+    private run;
+    private openCooldown;
+    private transition;
+}

+ 180 - 0
dist/embedding/autofallback.js

@@ -0,0 +1,180 @@
+/**
+ * autofallback.ts - AutoFallbackEmbeddingProvider.
+ *
+ * Composes a primary `EmbeddingProvider` (typically `OpenAIEmbeddingsProvider`)
+ * and a fallback (typically `LocalLlamaCppProvider`). When the primary trips
+ * its circuit breaker — or when persistent failures cross a threshold — calls
+ * are routed to the fallback. After a recovery cooldown, the primary is
+ * probed again; success closes the breaker and routing returns.
+ *
+ * Acceptance criterion 4 from i-qkarfffa: "Endpoint down → fallback local + WARN".
+ *
+ * Behavior summary:
+ *   - Primary call succeeds → return; record success.
+ *   - Primary throws CircuitOpenError → fall back, log WARN once per transition.
+ *   - Primary throws any other error → fall back for THIS call only;
+ *     count toward the failure-streak threshold.
+ *   - When failure streak crosses threshold (default 3) → set our own
+ *     "open until" timestamp; until expiry, route directly to fallback
+ *     (skip primary entirely).
+ *   - On expiry, retry primary opportunistically.
+ *   - getModelId always reports the primary's id; getDimensions uses the
+ *     primary's value, else the fallback's; dispose disposes both providers.
+ */
+import { CircuitOpenError } from "./openai.js";
+const DEFAULT_FAILURE_STREAK = 3; // default for `failureStreakThreshold`
+const DEFAULT_COOLDOWN_MS = 5 * 60_000; // default for `cooldownMs`: 5 minutes in ms
+function defaultWarn(msg) {
+    process.stderr.write(`${msg}\n`);
+}
+export class AutoFallbackEmbeddingProvider { // Routes embedding calls to `primary`, using `fallback` when the primary throws or while a cooldown is active.
+    kind; // copied from the primary's `kind` in the constructor
+    primary;
+    fallback;
+    failureStreakThreshold; // streak length at which the cooldown opens
+    cooldownMs; // cooldown length in ms
+    warn; // WARN sink: (msg) => void
+    now; // clock: () => number
+    failureStreak = 0; // consecutive primary failures that were not CircuitOpenError
+    fallbackUntil = null; // clock value until which the primary is skipped; null when no cooldown was opened
+    lastTransitionState = "primary"; // last routing target recorded by transition()
+    constructor(config) {
+        if (!config.primary)
+            throw new Error("AutoFallbackEmbeddingProvider: primary is required");
+        if (!config.fallback)
+            throw new Error("AutoFallbackEmbeddingProvider: fallback is required");
+        if (config.primary === config.fallback) {
+            throw new Error("AutoFallbackEmbeddingProvider: primary and fallback must differ");
+        }
+        this.primary = config.primary;
+        this.fallback = config.fallback;
+        // Inherit the primary's kind for callers introspecting `provider.kind`.
+        this.kind = config.primary.kind;
+        this.failureStreakThreshold = config.failureStreakThreshold ?? DEFAULT_FAILURE_STREAK;
+        this.cooldownMs = config.cooldownMs ?? DEFAULT_COOLDOWN_MS;
+        this.warn = config.warn ?? defaultWarn;
+        this.now = config.now ?? Date.now;
+    }
+    /**
+     * Stable model id reported by the primary. The model-id guard runs against
+     * the primary's id because that's what callers actually want when the
+     * remote endpoint is online; on fallback-only operation, the local
+     * provider should report a compatible id (in the default config, both
+     * report "embeddinggemma" so this is moot).
+     */
+    getModelId() {
+        return this.primary.getModelId();
+    }
+    getDimensions() {
+        return this.primary.getDimensions() ?? this.fallback.getDimensions(); // fallback's value is used only when the primary reports null/undefined
+    }
+    /** Current routing state (mostly for tests + observability): "fallback" only while a cooldown is active, otherwise "primary" */
+    getRoutingState() {
+        if (this.fallbackUntil !== null && this.now() < this.fallbackUntil) {
+            return "fallback";
+        }
+        return "primary";
+    }
+    /** Reset failure-streak + cooldown (mostly for tests / admin); emits the "recovered" WARN if the last recorded state was "fallback" */
+    reset() {
+        this.failureStreak = 0;
+        this.fallbackUntil = null;
+        this.transition("primary");
+    }
+    async healthcheck(signal) {
+        // Primary first; if degraded, check fallback so callers can still tell
+        // whether they have *any* working backend.
+        const primaryHealth = await this.primary.healthcheck(signal);
+        if (primaryHealth.ok)
+            return primaryHealth;
+        const fallbackHealth = await this.fallback.healthcheck(signal);
+        return {
+            ok: fallbackHealth.ok, // overall health follows the fallback once the primary is not ok
+            model: this.primary.getModelId(),
+            dimensions: primaryHealth.dimensions ?? fallbackHealth.dimensions,
+            detail: `primary: ${primaryHealth.detail ?? "fail"} | fallback: ${fallbackHealth.detail ?? (fallbackHealth.ok ? "ok" : "fail")}`,
+        };
+    }
+    async embed(text, options = {}) {
+        return this.run((p, opts) => p.embed(text, opts), options); // no onTotalFail: rejects when both providers fail
+    }
+    async embedBatch(texts, options = {}) {
+        if (texts.length === 0)
+            return [];
+        return this.run((p, opts) => p.embedBatch(texts, opts), options, () => texts.map(() => null)); // total failure yields one null per input text
+    }
+    async dispose() {
+        await Promise.allSettled([this.primary.dispose(), this.fallback.dispose()]); // allSettled: a rejected dispose promise does not reject this call
+    }
+    // ────────────────────── Internals ──────────────────────
+    /**
+     * Generic dispatcher: try primary if not in cooldown, fall back on
+     * `CircuitOpenError`, count other errors against the failure streak.
+     * `op` is invoked with whichever provider is selected.
+     *
+     * `op(provider, options)` performs the actual call and returns a promise.
+     * `onTotalFail` is optional: when given and the fallback call also
+     * throws, its return value is returned instead of rethrowing.
+     * A successful primary call resets the failure streak and clears the
+     * cooldown timestamp.
+     */
+    async run(op, options, onTotalFail) {
+        const inCooldown = this.fallbackUntil !== null && this.now() < this.fallbackUntil;
+        if (inCooldown) {
+            // Skip primary entirely
+            this.transition("fallback");
+            try {
+                return await op(this.fallback, options);
+            }
+            catch (err) {
+                if (onTotalFail)
+                    return onTotalFail();
+                throw err;
+            }
+        }
+        // Try primary first
+        try {
+            const result = await op(this.primary, options);
+            // Success — clear streak and ensure routing reads "primary"
+            this.failureStreak = 0;
+            this.fallbackUntil = null;
+            this.transition("primary");
+            return result;
+        }
+        catch (err) {
+            if (err instanceof CircuitOpenError) {
+                // Primary circuit is open — open our own cooldown matching its
+                // expected duration so subsequent calls skip the primary.
+                this.openCooldown(`primary CircuitOpenError`);
+            }
+            else {
+                this.failureStreak++;
+                if (this.failureStreak >= this.failureStreakThreshold) {
+                    this.openCooldown(`primary failure streak ${this.failureStreak} ≥ ${this.failureStreakThreshold}`);
+                }
+            }
+            // Try fallback for THIS call regardless
+            try {
+                this.transition("fallback");
+                return await op(this.fallback, options);
+            }
+            catch (fbErr) {
+                if (onTotalFail)
+                    return onTotalFail();
+                // Both providers failed — surface the fallback error (the primary
+                // failure already informed the breaker).
+                throw fbErr;
+            }
+        }
+    }
+    openCooldown(reason) {
+        if (this.fallbackUntil === null || this.now() >= this.fallbackUntil) { // arm (and WARN) only when no cooldown is currently running
+            this.fallbackUntil = this.now() + this.cooldownMs;
+            this.warn(`[AutoFallbackEmbeddingProvider] WARN — falling back to "${this.fallback.kind}" provider for ${Math.round(this.cooldownMs / 1000)}s (reason: ${reason})`);
+        }
+    }
+    transition(to) {
+        if (this.lastTransitionState === to)
+            return;
+        this.lastTransitionState = to;
+        if (to === "primary") {
+            this.warn(`[AutoFallbackEmbeddingProvider] WARN — primary "${this.primary.kind}" recovered, routing restored`);
+        }
+        // The cooldown WARN is emitted by openCooldown (with a richer message), so no second WARN here.
+        // NOTE(review): run() also records "fallback" after a sub-threshold failure, where openCooldown is not called; the next primary success then logs "recovered" with no preceding fallback WARN — confirm this is intended.
+    }
+}

+ 82 - 0
dist/embedding/factory.d.ts

@@ -0,0 +1,82 @@
+/**
+ * factory.ts - EmbeddingProvider factory with config precedence.
+ *
+ * Resolution order (first match wins):
+ *   1. Explicit `kind` argument or `--provider` CLI flag → forces a kind
+ *   2. `QMD_EMBED_PROVIDER` env var set to "local" or "openai" → that kind;
+ *      otherwise `QMD_EMBED_ENDPOINT` env var present and non-empty → "openai"
+ *   3. Config file (`~/.config/qmd/config.json`) `embedProvider.kind` → that kind;
+ *      otherwise a non-empty `embedProvider.endpoint` there → "openai"
+ *   4. Otherwise → "local" (legacy / backward-compat)
+ *
+ * Backward compat invariant: when neither `QMD_EMBED_ENDPOINT` nor
+ * `~/.config/qmd/config.json` mentions a provider, callers get the same
+ * `LocalLlamaCppProvider` they had before this change.
+ */
+import { type LocalLlamaCppProviderConfig } from "./local.js";
+import { type OpenAIProviderConfig } from "./openai.js";
+import { type AutoFallbackProviderConfig } from "./autofallback.js";
+import type { EmbeddingProvider, ProviderKind } from "./provider.js";
+export type EmbedProviderConfigFile = {
+    embedProvider?: {
+        kind?: ProviderKind; // "local" or "openai"; consulted only when no explicit kind or env override decides
+        endpoint?: string; // a non-empty value also selects "openai" when `kind` is absent
+        apiKey?: string;
+        modelId?: string; // the factory defaults this to "embeddinggemma" when unset everywhere
+        upstreamModel?: string;
+        batchSize?: number;
+        timeoutMs?: number;
+        /** When true, wrap the openai provider in AutoFallback (local fallback). */
+        autoFallback?: boolean;
+    };
+};
+export declare function defaultConfigPath(): string; // `$XDG_CONFIG_HOME/qmd/config.json` when XDG_CONFIG_HOME is set, else `~/.config/qmd/config.json`
+/**
+ * Load `~/.config/qmd/config.json` if present. Returns an empty object on
+ * any read/parse error so we silently fall back to env/local.
+ *
+ * `path` defaults to `defaultConfigPath()`. A missing file, or JSON that
+ * does not parse to an object, also yields an empty object.
+ */
+export declare function loadConfigFile(path?: string): EmbedProviderConfigFile;
+export type CreateEmbeddingProviderOptions = {
+    /** Force a specific provider kind. Overrides env + config. */
+    kind?: ProviderKind;
+    /** Override config file path (mostly for tests) */
+    configPath?: string;
+    /** Local-provider overrides */
+    local?: LocalLlamaCppProviderConfig;
+    /** OpenAI-provider overrides — merged on top of env/config */
+    openai?: Partial<OpenAIProviderConfig>;
+    /**
+     * Wrap the chosen provider in `AutoFallbackEmbeddingProvider` so that a
+     * remote outage transparently falls back to local llama.cpp. Default:
+     * `false` — opt-in, since the wrapper requires both backends to be
+     * available and the local one will warm node-llama-cpp on first call.
+     *
+     * Resolution: explicit `autoFallback` wins → env `QMD_EMBED_AUTO_FALLBACK`
+     * (`1`/`true`/`yes` enable, `0`/`false`/`no` disable, case-insensitive;
+     * any other value is ignored) → config-file `embedProvider.autoFallback`
+     * → false.
+     *
+     * Only applies when the resolved kind is `openai` (no fallback wrap when
+     * the primary IS local already).
+     */
+    autoFallback?: boolean;
+    /**
+     * Override config for `AutoFallbackEmbeddingProvider` (failureStreak,
+     * cooldownMs, etc.). Only used when `autoFallback` resolves true.
+     * Primary + fallback are constructed automatically.
+     */
+    autoFallbackOverrides?: Omit<AutoFallbackProviderConfig, "primary" | "fallback">;
+    /**
+     * Custom env source (mostly for tests). Defaults to `process.env`.
+     * Read keys: QMD_EMBED_PROVIDER, QMD_EMBED_ENDPOINT, QMD_EMBED_API_KEY,
+     * QMD_EMBED_MODEL_ID, QMD_EMBED_UPSTREAM_MODEL, QMD_EMBED_BATCH_SIZE,
+     * QMD_EMBED_TIMEOUT_MS, QMD_EMBED_AUTO_FALLBACK.
+     */
+    env?: Record<string, string | undefined>;
+};
+/**
+ * Resolve the provider kind without instantiating anything. Useful for
+ * logging and tests.
+ *
+ * Order: `opts.kind` → `QMD_EMBED_PROVIDER` env ("local"/"openai") →
+ * non-empty `QMD_EMBED_ENDPOINT` env ("openai") → config-file
+ * `embedProvider.kind` → non-empty config-file `embedProvider.endpoint`
+ * ("openai") → "local".
+ */
+export declare function resolveProviderKind(opts?: CreateEmbeddingProviderOptions): ProviderKind;
+/**
+ * Factory entry point — returns the appropriate `EmbeddingProvider`.
+ * Throws if `openai` kind is requested but no endpoint is configured.
+ *
+ * For kind "openai" the result is wrapped in `AutoFallbackEmbeddingProvider`
+ * (with a local fallback) when auto-fallback resolves to true.
+ */
+export declare function createEmbeddingProvider(opts?: CreateEmbeddingProviderOptions): EmbeddingProvider;

+ 150 - 0
dist/embedding/factory.js

@@ -0,0 +1,150 @@
+/**
+ * factory.ts - EmbeddingProvider factory with config precedence.
+ *
+ * Resolution order (first match wins):
+ *   1. Explicit `kind` argument or `--provider` CLI flag → forces a kind
+ *   2. `QMD_EMBED_PROVIDER` env var set to "local" or "openai" → that kind;
+ *      otherwise `QMD_EMBED_ENDPOINT` env var present and non-empty → "openai"
+ *   3. Config file (`~/.config/qmd/config.json`) `embedProvider.kind` → that kind;
+ *      otherwise a non-empty `embedProvider.endpoint` there → "openai"
+ *   4. Otherwise → "local" (legacy / backward-compat)
+ *
+ * Backward compat invariant: when neither `QMD_EMBED_ENDPOINT` nor
+ * `~/.config/qmd/config.json` mentions a provider, callers get the same
+ * `LocalLlamaCppProvider` they had before this change.
+ */
+import { existsSync, readFileSync } from "node:fs";
+import { homedir } from "node:os";
+import { join } from "node:path";
+import { LocalLlamaCppProvider } from "./local.js";
+import { OpenAIEmbeddingsProvider, } from "./openai.js";
+import { AutoFallbackEmbeddingProvider, } from "./autofallback.js";
+export function defaultConfigPath() {
+    const xdg = process.env.XDG_CONFIG_HOME;
+    const base = xdg ? xdg : join(homedir(), ".config");
+    return join(base, "qmd", "config.json");
+}
+/**
+ * Load `~/.config/qmd/config.json` if present. Returns an empty object on
+ * any read/parse error so we silently fall back to env/local.
+ */
+export function loadConfigFile(path = defaultConfigPath()) {
+    if (!existsSync(path))
+        return {};
+    try {
+        const raw = readFileSync(path, "utf-8");
+        const parsed = JSON.parse(raw);
+        if (parsed && typeof parsed === "object")
+            return parsed;
+    }
+    catch {
+        // Ignore — invalid JSON, missing read perm, etc.
+    }
+    return {};
+}
+/**
+ * Resolve the provider kind without instantiating anything. Useful for
+ * logging and tests.
+ */
+export function resolveProviderKind(opts = {}) {
+    const env = opts.env ?? process.env;
+    const cfg = loadConfigFile(opts.configPath);
+    // 1. Explicit kind argument
+    if (opts.kind)
+        return opts.kind;
+    // 2a. Explicit env override
+    const envKind = env.QMD_EMBED_PROVIDER?.trim().toLowerCase();
+    if (envKind === "local" || envKind === "openai")
+        return envKind;
+    // 2b. Endpoint env present → openai
+    if (env.QMD_EMBED_ENDPOINT && env.QMD_EMBED_ENDPOINT.trim() !== "") {
+        return "openai";
+    }
+    // 3. Config file
+    if (cfg.embedProvider?.kind === "local" || cfg.embedProvider?.kind === "openai") {
+        return cfg.embedProvider.kind;
+    }
+    if (cfg.embedProvider?.endpoint && cfg.embedProvider.endpoint.trim() !== "") {
+        return "openai";
+    }
+    // 4. Default
+    return "local";
+}
+/**
+ * Factory entry point — returns the appropriate `EmbeddingProvider`.
+ * Throws if `openai` kind is requested but no endpoint is configured.
+ */
+export function createEmbeddingProvider(opts = {}) {
+    const env = opts.env ?? process.env;
+    const cfg = loadConfigFile(opts.configPath);
+    const kind = resolveProviderKind(opts);
+    if (kind === "local") {
+        return new LocalLlamaCppProvider(opts.local ?? {});
+    }
+    // OpenAI
+    const endpoint = opts.openai?.endpoint ??
+        env.QMD_EMBED_ENDPOINT ??
+        cfg.embedProvider?.endpoint;
+    if (!endpoint || endpoint.trim() === "") {
+        throw new Error('createEmbeddingProvider: kind="openai" requires an endpoint. ' +
+            "Set QMD_EMBED_ENDPOINT env var, or `embedProvider.endpoint` in " +
+            "~/.config/qmd/config.json, or pass `openai.endpoint`.");
+    }
+    const apiKey = opts.openai?.apiKey ??
+        env.QMD_EMBED_API_KEY ??
+        cfg.embedProvider?.apiKey;
+    const modelId = opts.openai?.modelId ??
+        env.QMD_EMBED_MODEL_ID ??
+        cfg.embedProvider?.modelId ??
+        "embeddinggemma";
+    const upstreamModel = opts.openai?.upstreamModel ??
+        env.QMD_EMBED_UPSTREAM_MODEL ??
+        cfg.embedProvider?.upstreamModel;
+    const batchSizeRaw = opts.openai?.batchSize ??
+        parsePositiveInt(env.QMD_EMBED_BATCH_SIZE) ??
+        cfg.embedProvider?.batchSize;
+    const timeoutMsRaw = opts.openai?.timeoutMs ??
+        parsePositiveInt(env.QMD_EMBED_TIMEOUT_MS) ??
+        cfg.embedProvider?.timeoutMs;
+    const openaiProvider = new OpenAIEmbeddingsProvider({
+        endpoint,
+        apiKey,
+        modelId,
+        upstreamModel,
+        batchSize: batchSizeRaw,
+        timeoutMs: timeoutMsRaw,
+        fetchImpl: opts.openai?.fetchImpl,
+        retryBackoffsMs: opts.openai?.retryBackoffsMs,
+        sleep: opts.openai?.sleep,
+        now: opts.openai?.now,
+    });
+    // Should we wrap with AutoFallback? Resolution: arg → env → config → false.
+    const autoFallback = resolveAutoFallback(opts, env, cfg);
+    if (!autoFallback)
+        return openaiProvider;
+    return new AutoFallbackEmbeddingProvider({
+        primary: openaiProvider,
+        fallback: new LocalLlamaCppProvider(opts.local ?? { modelId }),
+        ...(opts.autoFallbackOverrides ?? {}),
+    });
+}
+function resolveAutoFallback(opts, env, cfg) {
+    if (typeof opts.autoFallback === "boolean")
+        return opts.autoFallback;
+    const envVal = env.QMD_EMBED_AUTO_FALLBACK?.trim().toLowerCase();
+    if (envVal === "1" || envVal === "true" || envVal === "yes")
+        return true;
+    if (envVal === "0" || envVal === "false" || envVal === "no")
+        return false;
+    if (typeof cfg.embedProvider?.autoFallback === "boolean") {
+        return cfg.embedProvider.autoFallback;
+    }
+    return false;
+}
+// ─────────────────────────── Helpers ────────────────────────────────────────
+function parsePositiveInt(v) {
+    if (!v)
+        return undefined;
+    const parsed = Number.parseInt(v, 10);
+    if (!Number.isFinite(parsed) || parsed <= 0)
+        return undefined;
+    return parsed;
+}

+ 8 - 0
dist/embedding/index.d.ts

@@ -0,0 +1,8 @@
+/**
+ * embedding/index.ts - re-exports for the embedding provider abstraction.
+ */
+export { type EmbeddingProvider, type ProviderKind, type ProviderEmbedding, type ProviderEmbedOptions, type ProviderHealth, ModelMismatchError, assertModelCompatible, } from "./provider.js";
+export { LocalLlamaCppProvider, type LocalLlamaCppProviderConfig, } from "./local.js";
+export { OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, isRetryableStatus, chunkArray, type OpenAIProviderConfig, type CircuitState, DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_MS, RETRY_BACKOFFS_MS, } from "./openai.js";
+export { createEmbeddingProvider, resolveProviderKind, loadConfigFile, defaultConfigPath, type CreateEmbeddingProviderOptions, type EmbedProviderConfigFile, } from "./factory.js";
+export { AutoFallbackEmbeddingProvider, type AutoFallbackProviderConfig, type FallbackState, } from "./autofallback.js";

+ 8 - 0
dist/embedding/index.js

@@ -0,0 +1,8 @@
+/**
+ * embedding/index.ts - re-exports for the embedding provider abstraction.
+ */
+export { ModelMismatchError, assertModelCompatible, } from "./provider.js";
+export { LocalLlamaCppProvider, } from "./local.js";
+export { OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, isRetryableStatus, chunkArray, DEFAULT_BATCH_SIZE, DEFAULT_TIMEOUT_MS, RETRY_BACKOFFS_MS, } from "./openai.js";
+export { createEmbeddingProvider, resolveProviderKind, loadConfigFile, defaultConfigPath, } from "./factory.js";
+export { AutoFallbackEmbeddingProvider, } from "./autofallback.js";

+ 31 - 0
dist/embedding/local.d.ts

@@ -0,0 +1,31 @@
+/**
+ * local.ts - Local llama.cpp adapter implementing EmbeddingProvider.
+ *
+ * Wraps an existing `LlamaCpp` instance so the legacy GGUF path looks like
+ * any other EmbeddingProvider to upstream callers. Used as the default and
+ * as the fallback target when `OpenAIEmbeddingsProvider` trips its breaker.
+ */
+import { type LlamaCpp } from "../llm.js";
+import type { EmbeddingProvider, ProviderEmbedOptions, ProviderEmbedding, ProviderHealth, ProviderKind } from "./provider.js";
+/**
+ * Constructor options for `LocalLlamaCppProvider`. Both fields are optional,
+ * and the constructor also accepts an omitted config object.
+ */
+export type LocalLlamaCppProviderConfig = {
+    /** Pre-built LlamaCpp instance (optional — falls back to global singleton). */
+    llm?: LlamaCpp;
+    /**
+     * Stable model id reported via `getModelId()`. Defaults to "embeddinggemma"
+     * to match the value in `content_vectors.model` for existing qmd installs.
+     */
+    modelId?: string;
+};
+/**
+ * `EmbeddingProvider` implementation that delegates to a `LlamaCpp` instance.
+ */
+export declare class LocalLlamaCppProvider implements EmbeddingProvider {
+    /** Provider kind tag; the implementation initialises it to "local". */
+    readonly kind: ProviderKind;
+    /** Underlying LlamaCpp instance (from config, else the default singleton). */
+    private readonly llm;
+    /** Model id returned by `getModelId()` and stamped on every result. */
+    private readonly modelId;
+    /** Embedding length learned from an earlier result; undefined until then. */
+    private dimensions;
+    constructor(config?: LocalLlamaCppProviderConfig);
+    /** Returns the configured model id ("embeddinggemma" by default). */
+    getModelId(): string;
+    /** Returns the cached embedding length, or undefined when none has been recorded yet. */
+    getDimensions(): number | undefined;
+    /**
+     * Probes the model with a single embed call and reports the outcome.
+     * Errors thrown by the probe are caught and returned as `ok: false`.
+     * The signal argument is not used by the implementation.
+     */
+    healthcheck(_signal?: AbortSignal): Promise<ProviderHealth>;
+    /** Embeds one text; resolves to null when the signal is already aborted or the model returns nothing. */
+    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
+    /** Embeds many texts in one underlying call; entries the model could not embed come back as null. */
+    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
+    /** No-op: the underlying LlamaCpp instance is deliberately not disposed here. */
+    dispose(): Promise<void>;
+}

+ 91 - 0
dist/embedding/local.js

@@ -0,0 +1,91 @@
+/**
+ * local.ts - Local llama.cpp adapter implementing EmbeddingProvider.
+ *
+ * Wraps an existing `LlamaCpp` instance so the legacy GGUF path looks like
+ * any other EmbeddingProvider to upstream callers. Used as the default and
+ * as the fallback target when `OpenAIEmbeddingsProvider` trips its breaker.
+ */
+import { getDefaultLlamaCpp, } from "../llm.js";
+export class LocalLlamaCppProvider {
+    kind = "local";
+    llm;
+    modelId;
+    dimensions = undefined;
+    constructor(config = {}) {
+        this.llm = config.llm ?? getDefaultLlamaCpp();
+        this.modelId = config.modelId ?? "embeddinggemma";
+    }
+    getModelId() {
+        return this.modelId;
+    }
+    getDimensions() {
+        return this.dimensions;
+    }
+    async healthcheck(_signal) {
+        // For the local provider, "healthy" means the embed model loads.
+        // We probe with a single embed call.
+        try {
+            const result = await this.llm.embed("healthcheck", { model: this.modelId });
+            if (!result) {
+                return {
+                    ok: false,
+                    model: this.modelId,
+                    detail: "embed probe returned null",
+                };
+            }
+            this.dimensions = result.embedding.length;
+            return {
+                ok: true,
+                model: this.modelId,
+                dimensions: this.dimensions,
+                detail: `local llama.cpp ready, ${this.dimensions}-d`,
+            };
+        }
+        catch (err) {
+            return {
+                ok: false,
+                model: this.modelId,
+                detail: err instanceof Error ? err.message : String(err),
+            };
+        }
+    }
+    async embed(text, options = {}) {
+        if (options.signal?.aborted)
+            return null;
+        const result = await this.llm.embed(text, { model: options.model ?? this.modelId });
+        if (!result)
+            return null;
+        if (this.dimensions === undefined) {
+            this.dimensions = result.embedding.length;
+        }
+        return {
+            embedding: result.embedding,
+            model: this.modelId,
+        };
+    }
+    async embedBatch(texts, options = {}) {
+        if (texts.length === 0)
+            return [];
+        if (options.signal?.aborted)
+            return texts.map(() => null);
+        const raw = await this.llm.embedBatch(texts, {
+            model: options.model ?? this.modelId,
+        });
+        return raw.map((r) => {
+            if (!r)
+                return null;
+            if (this.dimensions === undefined && r.embedding.length > 0) {
+                this.dimensions = r.embedding.length;
+            }
+            return {
+                embedding: r.embedding,
+                model: this.modelId,
+            };
+        });
+    }
+    async dispose() {
+        // We do NOT dispose the underlying LlamaCpp here because the singleton
+        // is shared with rerank/generate/expansion paths. Disposal is handled
+        // by the existing `disposeDefaultLlamaCpp()` global hook.
+    }
+}

+ 184 - 0
dist/embedding/openai.d.ts

@@ -0,0 +1,184 @@
+/**
+ * openai.ts - OpenAI-compatible HTTP embedding provider
+ *
+ * Talks to any endpoint that implements `POST /v1/embeddings` with the OpenAI
+ * shape: request `{model, input: string|string[]}`, response
+ * `{data: [{embedding: number[], index: number}, ...]}`.
+ *
+ * Used by qmd to delegate embeddings to a GPU worker (e.g. ai.mm.mk →
+ * qmd-embed-worker on `models` LXC, RTX 4090) instead of running
+ * node-llama-cpp locally.
+ *
+ * Features:
+ *   - Batches input in groups of ≤64 (configurable via QMD_EMBED_BATCH_SIZE)
+ *   - Retries 429 / 503 with exponential backoff (1s, 4s, 16s)
+ *   - 4xx (non-429) → no retry, count as failure
+ *   - Circuit breaker: >50% failures in a 60s window → OPEN for 5 min,
+ *     callers can use this to fall back to a local provider
+ *   - Per-call timeout via AbortSignal (default QMD_EMBED_TIMEOUT_MS=30000)
+ *   - Healthcheck via `GET /health` if available, else a probe embed call
+ */
+import type { EmbeddingProvider, ProviderEmbedOptions, ProviderEmbedding, ProviderHealth, ProviderKind } from "./provider.js";
+/**
+ * Default batch size — most OpenAI-compatible embedding endpoints accept up to
+ * 2048 inputs per call but for memory and latency we cap at 64.
+ */
+export declare const DEFAULT_BATCH_SIZE = 64;
+/**
+ * Default per-request timeout (30 s). embeddinggemma-300M on RTX 4090 takes
+ * <500ms per batch of 64 in practice; 30s is a safe upper bound.
+ */
+export declare const DEFAULT_TIMEOUT_MS = 30000;
+/**
+ * Retry backoff schedule (ms) for 429/503 responses: the delay slept before
+ * each retry. The provider makes at most `length + 1` attempts, so the
+ * default "1s/4s/16s" schedule means the initial request plus up to 3 retries.
+ */
+export declare const RETRY_BACKOFFS_MS: readonly number[];
+/**
+ * Circuit breaker — flips OPEN when error rate exceeds threshold within
+ * window. While OPEN, every call fails fast so the caller can fall back.
+ */
+/** Length of the sliding sample window, in ms. */
+export declare const CIRCUIT_WINDOW_MS = 60000;
+/** How long the breaker stays OPEN before moving to HALF-OPEN, in ms (5 minutes). */
+export declare const CIRCUIT_OPEN_DURATION_MS: number;
+/** Failure rate that must be strictly exceeded for the breaker to open. */
+export declare const CIRCUIT_FAILURE_RATE_THRESHOLD = 0.5;
+/** Minimum number of samples in the window before the failure rate is evaluated. */
+export declare const CIRCUIT_MIN_SAMPLES = 4;
+/**
+ * Constructor options for `OpenAIEmbeddingsProvider`. Only `endpoint` is
+ * required; every other field has a default applied by the constructor.
+ */
+export type OpenAIProviderConfig = {
+    /**
+     * Endpoint base URL — e.g. "https://ai.mm.mk". Trailing slashes are
+     * stripped by the constructor; an empty value makes it throw.
+     */
+    endpoint: string;
+    /** Optional bearer token sent as `Authorization: Bearer ...` */
+    apiKey?: string;
+    /**
+     * Stable model identifier to report up via `getModelId()`.
+     * Defaults to "embeddinggemma" to match qmd's existing DB rows.
+     */
+    modelId?: string;
+    /**
+     * Upstream model name sent in the HTTP request body. Often differs from
+     * `modelId` (e.g. modelId="embeddinggemma" but upstream model="embeddinggemma:300m").
+     * Defaults to `modelId` when omitted.
+     */
+    upstreamModel?: string;
+    /** Batch size cap (default DEFAULT_BATCH_SIZE = 64). The constructor throws when it is below 1. */
+    batchSize?: number;
+    /** Per-request timeout in ms (default DEFAULT_TIMEOUT_MS = 30_000) */
+    timeoutMs?: number;
+    /** Custom fetch (for testing). Defaults to global `fetch`. */
+    fetchImpl?: typeof fetch;
+    /** Custom retry schedule (for testing). Defaults to RETRY_BACKOFFS_MS. */
+    retryBackoffsMs?: readonly number[];
+    /** Custom sleep impl (for testing). Defaults to setTimeout. */
+    sleep?: (ms: number) => Promise<void>;
+    /** Custom clock (for testing). Defaults to Date.now. Also drives the circuit breaker. */
+    now?: () => number;
+};
+/**
+ * Shape of the JSON body expected back from `POST /v1/embeddings`.
+ * The provider's parser reads only `data[].index` and `data[].embedding`;
+ * the remaining fields are optional and not consulted by it.
+ */
+export type OpenAIEmbeddingsResponse = {
+    object?: string;
+    model?: string;
+    /** One item per input text; may arrive in any order (items are placed by `index`). */
+    data: Array<{
+        object?: string;
+        /** Zero-based position of the corresponding input text in the request. */
+        index: number;
+        /** The embedding vector for that input. */
+        embedding: number[];
+    }>;
+    usage?: {
+        prompt_tokens?: number;
+        total_tokens?: number;
+    };
+};
+/**
+ * Circuit breaker state — exported for tests.
+ *   - "closed": calls go through and outcomes are sampled.
+ *   - "open": calls should fail fast.
+ *   - "half-open": the open period has elapsed; the next recorded outcome
+ *     closes the breaker (success) or re-opens it (failure).
+ */
+export type CircuitState = "closed" | "open" | "half-open";
+/**
+ * Determine whether an HTTP status is retryable. Only 429 (Too Many Requests)
+ * and 503 (Service Unavailable) are; every other status — other 4xx as well
+ * as 5xx such as 500 or 502 — is treated as non-retryable.
+ */
+export declare function isRetryableStatus(status: number): boolean;
+/**
+ * Chunk an array into consecutive pieces of ≤ size each, preserving order.
+ * `size` MUST be ≥ 1 (an Error is thrown otherwise). An empty input yields [].
+ */
+export declare function chunkArray<T>(items: T[], size: number): T[][];
+/**
+ * Sliding-window circuit breaker. Keeps success/failure samples from the
+ * last `windowMs` (default 60 s). While CLOSED, a recorded failure triggers an
+ * evaluation: with at least `minSamples` (default 4) samples and a failure
+ * rate above `threshold` (default 50%) the breaker flips OPEN. After
+ * `openDurationMs` (default 5 minutes) it becomes HALF-OPEN — at which point
+ * the next recorded outcome decides whether to close (success) or re-open
+ * (failure).
+ */
+export declare class CircuitBreaker {
+    /** Timestamped outcomes currently inside the window. */
+    private samples;
+    /** Current state; the OPEN → HALF-OPEN transition is applied lazily on access. */
+    private state;
+    /** Clock reading taken when the breaker last opened, or null. */
+    private openedAt;
+    private readonly windowMs;
+    private readonly openDurationMs;
+    private readonly threshold;
+    private readonly minSamples;
+    /** Clock used for all timing decisions (defaults to Date.now). */
+    private readonly now;
+    /** All options are optional and default to the exported CIRCUIT_* constants / Date.now. */
+    constructor(opts?: {
+        windowMs?: number;
+        openDurationMs?: number;
+        threshold?: number;
+        minSamples?: number;
+        now?: () => number;
+    });
+    /** Returns the current state after applying any pending OPEN → HALF-OPEN transition. */
+    getState(): CircuitState;
+    /**
+     * Returns true when calls should be short-circuited (skip HTTP, fall back).
+     * Side-effects: may transition OPEN → HALF-OPEN if the open window expired.
+     */
+    shouldFailFast(): boolean;
+    /** Record a successful call. Closes the breaker when it is HALF-OPEN. */
+    recordSuccess(): void;
+    /** Record a failed call. May trigger OPEN. */
+    recordFailure(): void;
+    /** Force-reset the breaker (used by tests / admin) */
+    reset(): void;
+    /** Appends a sample and drops samples older than the window. */
+    private pushSample;
+    /** Opens the breaker when enough samples exist and the failure rate exceeds the threshold. */
+    private evaluate;
+    /** Moves OPEN → HALF-OPEN once the open duration has elapsed. */
+    private tickAutoReset;
+}
+/**
+ * Raised when the circuit breaker is OPEN and a call is short-circuited.
+ * Callers (e.g. fallback wrapper) can catch this to switch to local provider.
+ * The error's `name` is "CircuitOpenError".
+ */
+export declare class CircuitOpenError extends Error {
+    /** @param message Defaults to "OpenAIEmbeddingsProvider circuit is OPEN". */
+    constructor(message?: string);
+}
+/**
+ * HTTP error from upstream, carrying the response status code. The provider
+ * throws it for any non-OK response; whether the request is then retried
+ * depends on `isRetryableStatus(status)`.
+ */
+export declare class HttpError extends Error {
+    /** HTTP status code of the failed response. */
+    readonly status: number;
+    /** Response body text, truncated to at most 1024 characters. */
+    readonly bodyPreview: string;
+    /** The error message embeds the status and the first 200 characters of `bodyPreview`. */
+    constructor(status: number, bodyPreview: string);
+}
+/**
+ * `EmbeddingProvider` implementation that sends texts to an OpenAI-compatible
+ * `POST /v1/embeddings` endpoint in batches, guarded by a circuit breaker.
+ */
+export declare class OpenAIEmbeddingsProvider implements EmbeddingProvider {
+    /** Provider kind tag; the implementation initialises it to "openai". */
+    readonly kind: ProviderKind;
+    /** Base URL with trailing slashes removed. */
+    private readonly endpoint;
+    /** Bearer token; when set, sent in the `Authorization` header. */
+    private readonly apiKey?;
+    /** Model id reported by `getModelId()` and stamped on every result. */
+    private readonly modelId;
+    /** Model name sent in the request body unless a per-call override is given. */
+    private readonly upstreamModel;
+    private readonly batchSize;
+    private readonly timeoutMs;
+    private readonly fetchImpl;
+    private readonly retryBackoffsMs;
+    private readonly sleep;
+    private readonly now;
+    /** Embedding length learned from the first successful result; undefined until then. */
+    private dimensions;
+    /** Circuit breaker fed by every chunk outcome; public so callers can inspect it. */
+    readonly breaker: CircuitBreaker;
+    /** Throws when `endpoint` is empty, no fetch implementation is available, or `batchSize` is below 1. */
+    constructor(config: OpenAIProviderConfig);
+    getModelId(): string;
+    getDimensions(): number | undefined;
+    /**
+     * Tries `GET {endpoint}/health` first and can fall back to a single embed
+     * probe. Failures are reported through `ok: false` with a `detail` message.
+     */
+    healthcheck(signal?: AbortSignal): Promise<ProviderHealth>;
+    /** Embeds one text via `embedBatch`; resolves to null when that text could not be embedded. */
+    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
+    /**
+     * Embeds texts in chunks of at most `batchSize`, returning one entry per
+     * input in input order. A chunk that fails leaves its entries null.
+     * Throws `CircuitOpenError` when the breaker is open.
+     */
+    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
+    /** Resets the circuit breaker; there is nothing else to release. */
+    dispose(): Promise<void>;
+    /** Builds the JSON request headers, adding `Authorization` when an API key is set. */
+    private buildHeaders;
+    /**
+     * Single HTTP request with retry on 429/503. Returns embeddings indexed
+     * the same as `texts`. Throws on non-retryable failure or all attempts
+     * exhausted.
+     */
+    private requestWithRetry;
+    /**
+     * Issue one HTTP attempt to `POST /v1/embeddings`. Does NOT retry.
+     */
+    private requestOnce;
+}

+ 477 - 0
dist/embedding/openai.js

@@ -0,0 +1,477 @@
+/**
+ * openai.ts - OpenAI-compatible HTTP embedding provider
+ *
+ * Talks to any endpoint that implements `POST /v1/embeddings` with the OpenAI
+ * shape: request `{model, input: string|string[]}`, response
+ * `{data: [{embedding: number[], index: number}, ...]}`.
+ *
+ * Used by qmd to delegate embeddings to a GPU worker (e.g. ai.mm.mk →
+ * qmd-embed-worker on `models` LXC, RTX 4090) instead of running
+ * node-llama-cpp locally.
+ *
+ * Features:
+ *   - Batches input in groups of ≤64 (configurable via QMD_EMBED_BATCH_SIZE)
+ *   - Retries 429 / 503 with exponential backoff (1s, 4s, 16s)
+ *   - 4xx (non-429) → no retry, count as failure
+ *   - Circuit breaker: >50% failures in a 60s window → OPEN for 5 min,
+ *     callers can use this to fall back to a local provider
+ *   - Per-call timeout via AbortSignal (default QMD_EMBED_TIMEOUT_MS=30000)
+ *   - Healthcheck via `GET /health` if available, else a probe embed call
+ */
+// ─────────────────────────── Configuration ───────────────────────────────────
+/**
+ * Default batch size — most OpenAI-compatible embedding endpoints accept up to
+ * 2048 inputs per call but for memory and latency we cap at 64.
+ */
+export const DEFAULT_BATCH_SIZE = 64;
+/**
+ * Default per-request timeout (30 s). embeddinggemma-300M on RTX 4090 takes
+ * <500ms per batch of 64 in practice; 30s is a safe upper bound.
+ */
+export const DEFAULT_TIMEOUT_MS = 30_000;
+/**
+ * Retry backoff schedule (ms) for 429/503 responses: the delay slept before
+ * each retry. The provider makes at most `length + 1` attempts, so this
+ * "1s/4s/16s" schedule means the initial request plus up to 3 retries.
+ */
+export const RETRY_BACKOFFS_MS = [1_000, 4_000, 16_000];
+/**
+ * Circuit breaker — flips OPEN when error rate exceeds threshold within
+ * window. While OPEN, every call fails fast so the caller can fall back.
+ */
+/** Length of the sliding sample window, in ms. */
+export const CIRCUIT_WINDOW_MS = 60_000;
+/** How long the breaker stays OPEN before moving to HALF-OPEN, in ms (5 minutes). */
+export const CIRCUIT_OPEN_DURATION_MS = 5 * 60_000;
+/** Failure rate that must be strictly exceeded for the breaker to open. */
+export const CIRCUIT_FAILURE_RATE_THRESHOLD = 0.5;
+/** Minimum number of samples in the window before the failure rate is evaluated. */
+export const CIRCUIT_MIN_SAMPLES = 4;
+// ─────────────────────────── Helpers ─────────────────────────────────────────
+function defaultSleep(ms) {
+    return new Promise((resolve) => setTimeout(resolve, ms));
+}
+/**
+ * Build the merged AbortSignal for a single HTTP attempt: combines an
+ * external `userSignal` (from caller / withLLMSession) with a per-attempt
+ * timeout signal. Returns the merged signal AND the timeout id so the
+ * caller can `clearTimeout` after the attempt completes (avoids leaks).
+ */
+function buildAttemptSignal(userSignal, timeoutMs) {
+    const ctrl = new AbortController();
+    const timeoutId = setTimeout(() => {
+        ctrl.abort(new Error(`Request timed out after ${timeoutMs}ms`));
+    }, timeoutMs);
+    // Don't keep process alive just for this timer
+    if (typeof timeoutId === "object" && timeoutId !== null && "unref" in timeoutId) {
+        timeoutId.unref();
+    }
+    const onUserAbort = () => ctrl.abort(userSignal?.reason);
+    if (userSignal) {
+        if (userSignal.aborted) {
+            ctrl.abort(userSignal.reason);
+        }
+        else {
+            userSignal.addEventListener("abort", onUserAbort, { once: true });
+        }
+    }
+    const cleanup = () => {
+        clearTimeout(timeoutId);
+        if (userSignal)
+            userSignal.removeEventListener("abort", onUserAbort);
+    };
+    return { signal: ctrl.signal, cleanup };
+}
+/**
+ * Determine whether an HTTP status is retryable. 429 (Too Many Requests)
+ * and 503 (Service Unavailable) are retried; 4xx (other than 429) are not.
+ */
+export function isRetryableStatus(status) {
+    return status === 429 || status === 503;
+}
+/**
+ * Chunk an array into pieces of ≤ size each. `size` MUST be ≥ 1.
+ */
+export function chunkArray(items, size) {
+    if (size < 1)
+        throw new Error(`chunkArray: size must be ≥ 1, got ${size}`);
+    if (items.length <= size)
+        return items.length === 0 ? [] : [items];
+    const out = [];
+    for (let i = 0; i < items.length; i += size) {
+        out.push(items.slice(i, i + size));
+    }
+    return out;
+}
+// ─────────────────────────── Circuit Breaker ─────────────────────────────────
+/**
+ * Sliding-window circuit breaker. Tracks the last N samples (min 4) over a
+ * 60-second window; flips OPEN when failure rate exceeds 50%, then auto-
+ * resets to HALF-OPEN after 5 minutes — at which point the next probe
+ * decides whether to close (success) or re-open (failure).
+ */
+export class CircuitBreaker {
+    samples = [];
+    state = "closed";
+    openedAt = null;
+    windowMs;
+    openDurationMs;
+    threshold;
+    minSamples;
+    now;
+    constructor(opts = {}) {
+        this.windowMs = opts.windowMs ?? CIRCUIT_WINDOW_MS;
+        this.openDurationMs = opts.openDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
+        this.threshold = opts.threshold ?? CIRCUIT_FAILURE_RATE_THRESHOLD;
+        this.minSamples = opts.minSamples ?? CIRCUIT_MIN_SAMPLES;
+        this.now = opts.now ?? Date.now;
+    }
+    getState() {
+        this.tickAutoReset();
+        return this.state;
+    }
+    /**
+     * Returns true when calls should be short-circuited (skip HTTP, fall back).
+     * Side-effects: may transition OPEN → HALF-OPEN if the open window expired.
+     */
+    shouldFailFast() {
+        return this.getState() === "open";
+    }
+    /** Record a successful call. */
+    recordSuccess() {
+        // Honor the time-based OPEN→HALF-OPEN transition before deciding what
+        // to do with this sample. Without this, a success that lands AFTER the
+        // open window expired would still see state==="open" and never close
+        // the breaker (a probe call could only flip it via getState()).
+        this.tickAutoReset();
+        this.pushSample(true);
+        if (this.state === "half-open") {
+            this.state = "closed";
+            this.openedAt = null;
+        }
+    }
+    /** Record a failed call. May trigger OPEN. */
+    recordFailure() {
+        // Same reasoning as recordSuccess — apply lazy auto-reset before
+        // classifying the sample.
+        this.tickAutoReset();
+        this.pushSample(false);
+        if (this.state === "half-open") {
+            // Probe failed — re-open
+            this.state = "open";
+            this.openedAt = this.now();
+            return;
+        }
+        if (this.state === "closed")
+            this.evaluate();
+    }
+    /** Force-reset the breaker (used by tests / admin) */
+    reset() {
+        this.samples = [];
+        this.state = "closed";
+        this.openedAt = null;
+    }
+    pushSample(ok) {
+        const ts = this.now();
+        this.samples.push({ ts, ok });
+        // Drop samples outside the window
+        const cutoff = ts - this.windowMs;
+        while (this.samples.length > 0 && this.samples[0].ts < cutoff) {
+            this.samples.shift();
+        }
+    }
+    evaluate() {
+        if (this.samples.length < this.minSamples)
+            return;
+        const failures = this.samples.filter((s) => !s.ok).length;
+        const rate = failures / this.samples.length;
+        if (rate > this.threshold) {
+            this.state = "open";
+            this.openedAt = this.now();
+        }
+    }
+    tickAutoReset() {
+        if (this.state === "open" && this.openedAt !== null) {
+            if (this.now() - this.openedAt >= this.openDurationMs) {
+                this.state = "half-open";
+            }
+        }
+    }
+}
+// ─────────────────────────── Errors ──────────────────────────────────────────
+/**
+ * Thrown when a call is short-circuited because the circuit breaker is OPEN.
+ * A fallback wrapper can catch it to switch to the local provider.
+ */
+export class CircuitOpenError extends Error {
+    name = "CircuitOpenError";
+    constructor(message = "OpenAIEmbeddingsProvider circuit is OPEN") {
+        super(message);
+    }
+}
+/**
+ * Error for a non-OK HTTP response from upstream. Carries the status code
+ * and a preview of the body (at most 1024 characters); the message shows the
+ * status and the first 200 characters of the body.
+ */
+export class HttpError extends Error {
+    status;
+    bodyPreview;
+    name = "HttpError";
+    constructor(status, bodyPreview) {
+        const headline = bodyPreview.slice(0, 200);
+        super(`HTTP ${status}: ${headline}`);
+        this.status = status;
+        this.bodyPreview = bodyPreview.slice(0, 1024);
+    }
+}
+// ─────────────────────────── Provider ────────────────────────────────────────
+export class OpenAIEmbeddingsProvider {
+    kind = "openai";
+    endpoint;
+    apiKey;
+    modelId;
+    upstreamModel;
+    batchSize;
+    timeoutMs;
+    fetchImpl;
+    retryBackoffsMs;
+    sleep;
+    now;
+    dimensions = undefined;
+    breaker;
+    constructor(config) {
+        if (!config.endpoint) {
+            throw new Error("OpenAIEmbeddingsProvider: endpoint is required");
+        }
+        this.endpoint = config.endpoint.replace(/\/+$/, "");
+        this.apiKey = config.apiKey;
+        this.modelId = config.modelId ?? "embeddinggemma";
+        this.upstreamModel = config.upstreamModel ?? this.modelId;
+        this.batchSize = config.batchSize ?? DEFAULT_BATCH_SIZE;
+        this.timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+        this.fetchImpl = config.fetchImpl ?? globalThis.fetch;
+        this.retryBackoffsMs = config.retryBackoffsMs ?? RETRY_BACKOFFS_MS;
+        this.sleep = config.sleep ?? defaultSleep;
+        this.now = config.now ?? Date.now;
+        this.breaker = new CircuitBreaker({ now: this.now });
+        if (!this.fetchImpl) {
+            throw new Error("OpenAIEmbeddingsProvider: global fetch is unavailable. " +
+                "Provide a `fetchImpl` config option (Node ≥18 ships fetch by default).");
+        }
+        if (this.batchSize < 1) {
+            throw new Error(`OpenAIEmbeddingsProvider: batchSize must be ≥ 1, got ${this.batchSize}`);
+        }
+    }
+    getModelId() {
+        return this.modelId;
+    }
+    getDimensions() {
+        return this.dimensions;
+    }
+    async healthcheck(signal) {
+        // Try GET /health first (worker exposes it). Fall back to probe embed.
+        try {
+            const { signal: attemptSig, cleanup } = buildAttemptSignal(signal, this.timeoutMs);
+            try {
+                const resp = await this.fetchImpl(`${this.endpoint}/health`, {
+                    method: "GET",
+                    headers: this.buildHeaders(),
+                    signal: attemptSig,
+                });
+                if (resp.ok) {
+                    return {
+                        ok: true,
+                        model: this.modelId,
+                        dimensions: this.dimensions,
+                        detail: `GET /health → ${resp.status}`,
+                    };
+                }
+                return {
+                    ok: false,
+                    model: this.modelId,
+                    detail: `GET /health → HTTP ${resp.status}`,
+                };
+            }
+            finally {
+                cleanup();
+            }
+        }
+        catch (err) {
+            // Endpoint may not implement /health — try a single embed probe instead.
+            try {
+                const probe = await this.embed("healthcheck", { signal });
+                if (probe) {
+                    return {
+                        ok: true,
+                        model: this.modelId,
+                        dimensions: probe.embedding.length,
+                        detail: "embed probe ok",
+                    };
+                }
+                return {
+                    ok: false,
+                    model: this.modelId,
+                    detail: "embed probe returned null",
+                };
+            }
+            catch (probeErr) {
+                return {
+                    ok: false,
+                    model: this.modelId,
+                    detail: (err instanceof Error ? err.message : String(err)) +
+                        " | probe: " +
+                        (probeErr instanceof Error ? probeErr.message : String(probeErr)),
+                };
+            }
+        }
+    }
+    async embed(text, options = {}) {
+        const batch = await this.embedBatch([text], options);
+        return batch[0] ?? null;
+    }
+    async embedBatch(texts, options = {}) {
+        if (texts.length === 0)
+            return [];
+        if (this.breaker.shouldFailFast()) {
+            throw new CircuitOpenError();
+        }
+        const chunks = chunkArray(texts, this.batchSize);
+        const results = new Array(texts.length).fill(null);
+        let cursor = 0;
+        for (const chunk of chunks) {
+            const start = cursor;
+            cursor += chunk.length;
+            // Abort early if signal already fired
+            if (options.signal?.aborted) {
+                // Leave remaining slots as null (caller treats as errors)
+                return results;
+            }
+            // Fail-fast if breaker tripped mid-loop
+            if (this.breaker.shouldFailFast()) {
+                throw new CircuitOpenError();
+            }
+            try {
+                const embeddings = await this.requestWithRetry(chunk, options);
+                for (let i = 0; i < chunk.length; i++) {
+                    const embedding = embeddings[i];
+                    if (embedding) {
+                        results[start + i] = {
+                            embedding,
+                            model: this.modelId,
+                        };
+                        // Record dimensions on first success
+                        if (this.dimensions === undefined) {
+                            this.dimensions = embedding.length;
+                        }
+                    }
+                }
+                this.breaker.recordSuccess();
+            }
+            catch (err) {
+                this.breaker.recordFailure();
+                // CircuitOpenError must propagate so the caller can fall back
+                if (err instanceof CircuitOpenError)
+                    throw err;
+                // Other errors mark the chunk as null and continue with next chunk.
+                // (The store layer already handles per-text nulls as errors.)
+                if (process.env.QMD_EMBED_DEBUG) {
+                    process.stderr.write(`OpenAIEmbeddingsProvider: chunk failed (${err instanceof Error ? err.message : String(err)})\n`);
+                }
+            }
+        }
+        return results;
+    }
+    async dispose() {
+        // Nothing to release — fetch handles its own connection pooling.
+        // Reset the breaker so a re-instantiation starts fresh.
+        this.breaker.reset();
+    }
+    // ────────────────────── Internals ──────────────────────
+    buildHeaders() {
+        const headers = {
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        };
+        if (this.apiKey) {
+            headers["Authorization"] = `Bearer ${this.apiKey}`;
+        }
+        return headers;
+    }
+    /**
+     * Single HTTP request with retry on 429/503. Returns embeddings indexed
+     * the same as `texts`. Throws on non-retryable failure or all attempts
+     * exhausted.
+     */
+    async requestWithRetry(texts, options) {
+        let lastErr = null;
+        const maxAttempts = this.retryBackoffsMs.length + 1;
+        for (let attempt = 0; attempt < maxAttempts; attempt++) {
+            // Honor user abort BEFORE issuing the call (avoids wasted network)
+            if (options.signal?.aborted) {
+                throw new Error("aborted by caller");
+            }
+            try {
+                return await this.requestOnce(texts, options);
+            }
+            catch (err) {
+                lastErr = err;
+                const retryable = err instanceof HttpError ? isRetryableStatus(err.status) : false;
+                if (!retryable)
+                    throw err;
+                if (attempt < this.retryBackoffsMs.length) {
+                    await this.sleep(this.retryBackoffsMs[attempt]);
+                }
+            }
+        }
+        // Exhausted retries → throw the last error so caller marks the chunk null
+        throw lastErr ?? new Error("requestWithRetry exhausted");
+    }
+    /**
+     * Issue one HTTP attempt to `POST /v1/embeddings`. Does NOT retry.
+     */
+    async requestOnce(texts, options) {
+        const { signal: attemptSig, cleanup } = buildAttemptSignal(options.signal, this.timeoutMs);
+        try {
+            const body = JSON.stringify({
+                model: options.model ?? this.upstreamModel,
+                input: texts,
+            });
+            const resp = await this.fetchImpl(`${this.endpoint}/v1/embeddings`, {
+                method: "POST",
+                headers: this.buildHeaders(),
+                body,
+                signal: attemptSig,
+            });
+            if (!resp.ok) {
+                const text = await resp.text().catch(() => "");
+                throw new HttpError(resp.status, text);
+            }
+            let parsed;
+            try {
+                parsed = (await resp.json());
+            }
+            catch (err) {
+                throw new Error(`OpenAIEmbeddingsProvider: malformed JSON from ${this.endpoint}/v1/embeddings: ${err instanceof Error ? err.message : String(err)}`);
+            }
+            if (!parsed || !Array.isArray(parsed.data)) {
+                throw new Error(`OpenAIEmbeddingsProvider: response missing "data" array (got ${typeof parsed})`);
+            }
+            // Sort by index to match input order (in case server returns out-of-order).
+            const out = new Array(texts.length);
+            for (const item of parsed.data) {
+                if (typeof item.index !== "number" ||
+                    item.index < 0 ||
+                    item.index >= texts.length) {
+                    throw new Error(`OpenAIEmbeddingsProvider: data item index out of range (${item.index}, expected 0..${texts.length - 1})`);
+                }
+                if (!Array.isArray(item.embedding)) {
+                    throw new Error(`OpenAIEmbeddingsProvider: data[${item.index}].embedding is not an array`);
+                }
+                out[item.index] = item.embedding;
+            }
+            // Sanity check — every slot must be filled
+            for (let i = 0; i < texts.length; i++) {
+                if (!out[i]) {
+                    throw new Error(`OpenAIEmbeddingsProvider: response missing embedding for index ${i}`);
+                }
+            }
+            return out;
+        }
+        finally {
+            cleanup();
+        }
+    }
+}

+ 109 - 0
dist/embedding/provider.d.ts

@@ -0,0 +1,109 @@
+/**
+ * provider.ts - Embedding provider abstraction
+ *
+ * Defines the EmbeddingProvider interface that allows qmd to use either:
+ *   - LocalLlamaCppProvider (legacy, GGUF via node-llama-cpp)
+ *   - OpenAIEmbeddingsProvider (HTTP, OpenAI-compatible endpoint like ai.mm.mk)
+ *
+ * The factory in `./factory.ts` selects an implementation based on env vars,
+ * a CLI flag, or `~/.config/qmd/config.json`.
+ */
+/**
+ * One embedding vector paired with the id of the model that produced it.
+ */
+export type ProviderEmbedding = {
+    embedding: number[];
+    /** Model identifier used to produce this embedding (matches `content_vectors.model` in the DB) */
+    model: string;
+};
+/**
+ * Tag identifying which provider implementation is in use (exposed as `EmbeddingProvider.kind`).
+ */
+export type ProviderKind = "local" | "openai";
+/**
+ * Result of `EmbeddingProvider.healthcheck()`, used for provider startup verification.
+ */
+export type ProviderHealth = {
+    ok: boolean;
+    /** Model identifier reported by the provider */
+    model: string;
+    /** Embedding dimensions (e.g. 768 for embeddinggemma-300M) */
+    dimensions?: number;
+    /** Detail message (error reason on failure, status on success) */
+    detail?: string;
+};
+/**
+ * Per-call options accepted by `EmbeddingProvider.embed` and `EmbeddingProvider.embedBatch`.
+ */
+export type ProviderEmbedOptions = {
+    /** Optional model id override (rare; usually provider has a fixed model) */
+    model?: string;
+    /** Abort signal for cancellation / timeout */
+    signal?: AbortSignal;
+};
+/**
+ * Provider interface — both LocalLlamaCppProvider and OpenAIEmbeddingsProvider implement this.
+ *
+ * Implementations MUST:
+ *   - Return `null` (not throw) for individual texts that fail to embed;
+ *     the caller will count it as an error and continue.
+ *   - Honor `options.signal` for cancellation.
+ *   - Tolerate concurrent `embedBatch` calls.
+ */
+export interface EmbeddingProvider {
+    /** Provider kind tag — useful for logging and factory introspection */
+    readonly kind: ProviderKind;
+    /**
+     * Stable model identifier reported to the caller.
+     *
+     * MUST match a `content_vectors.model` value of the existing index —
+     * otherwise `assertModelCompatible` throws `ModelMismatchError`.
+     */
+    getModelId(): string;
+    /**
+     * Embedding vector dimensions. May return `undefined` before the first call
+     * (some providers probe lazily). Once known, MUST stay stable.
+     */
+    getDimensions(): number | undefined;
+    /**
+     * Healthcheck — verifies the provider is reachable and the model is loaded.
+     * Should NOT throw — return `{ ok: false, detail: ... }` on failure.
+     *
+     * For HTTP providers: ping `/health` endpoint.
+     * For local provider: ensure model loads.
+     */
+    healthcheck(signal?: AbortSignal): Promise<ProviderHealth>;
+    /**
+     * Embed a single text. Resolves to `null` on per-call failure.
+     */
+    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
+    /**
+     * Embed multiple texts in a batch (more efficient than calling `embed` N times).
+     *
+     * Output array length MUST equal input array length; entry `i` corresponds to
+     * `texts[i]` and is `null` if that text failed. Implementations are responsible
+     * for chunking large batches per their upstream limits (e.g. OpenAI provider chunks to 64).
+     */
+    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
+    /** Release any held resources (HTTP keep-alive sockets, model handles, …) */
+    dispose(): Promise<void>;
+}
+/**
+ * Thrown by `assertModelCompatible` when the provider's model id is not among
+ * the ids in existing `content_vectors` rows; `providerModel`/`existingModels`
+ * carry both sides. Remedy: re-embed (`qmd embed -f`) or pin the matching id.
+ */
+export declare class ModelMismatchError extends Error {
+    readonly providerModel: string;
+    readonly existingModels: string[];
+    constructor(providerModel: string, existingModels: string[]);
+}
+/**
+ * Verify that the provider's model id is compatible with the existing
+ * `content_vectors` entries. No-op if `existingModels` is empty (fresh DB)
+ * or already contains `providerModel`.
+ *
+ * @param existingModels typically `SELECT DISTINCT model FROM content_vectors`
+ * @throws {ModelMismatchError} if the list is non-empty and lacks the id
+ */
+export declare function assertModelCompatible(providerModel: string, existingModels: string[]): void;

+ 46 - 0
dist/embedding/provider.js

@@ -0,0 +1,46 @@
+/**
+ * provider.ts - Embedding provider abstraction
+ *
+ * Defines the EmbeddingProvider interface that allows qmd to use either:
+ *   - LocalLlamaCppProvider (legacy, GGUF via node-llama-cpp)
+ *   - OpenAIEmbeddingsProvider (HTTP, OpenAI-compatible endpoint like ai.mm.mk)
+ *
+ * The factory in `./factory.ts` selects an implementation based on env vars,
+ * a CLI flag, or `~/.config/qmd/config.json`.
+ */
+/**
+ * Thrown when the provider's model id differs from the model id(s) stored in
+ * existing `content_vectors` rows. The message tells the user to re-embed
+ * (`qmd embed -f`) or set QMD_EMBED_MODEL_ID to the first existing model id.
+ */
+export class ModelMismatchError extends Error {
+    providerModel;
+    existingModels;
+    constructor(providerModel, existingModels) {
+        const list = existingModels.join(", ");
+        super(`Embedding model mismatch: existing vectors use model(s) [${list}] ` +
+            `but the configured provider reports "${providerModel}". ` +
+            `Run \`qmd embed -f\` (or \`--rebuild\`) to re-embed everything with ` +
+            `the new model, or set QMD_EMBED_MODEL_ID="${existingModels[0] ?? ""}" ` +
+            `to keep the existing vectors.`);
+        this.name = "ModelMismatchError";
+        this.providerModel = providerModel;
+        this.existingModels = existingModels;
+    }
+}
+/**
+ * Verify that the provider's model id is compatible with the existing
+ * `content_vectors` entries. No-op if `existingModels` is empty (fresh DB)
+ * or already contains `providerModel` (exact match via `Array.includes`).
+ *
+ * @param existingModels typically `SELECT DISTINCT model FROM content_vectors`
+ * @throws {ModelMismatchError} if the list is non-empty and lacks the id
+ */
+export function assertModelCompatible(providerModel, existingModels) {
+    // Empty DB — nothing to compare against, anything goes.
+    if (existingModels.length === 0)
+        return;
+    if (existingModels.includes(providerModel))
+        return;
+    throw new ModelMismatchError(providerModel, existingModels);
+}

+ 2 - 0
dist/index.d.ts

@@ -24,6 +24,8 @@ export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES };
 export type { ChunkStrategy } from "./store.js";
 export { getDefaultDbPath } from "./store.js";
 export { Maintenance } from "./maintenance.js";
+export { createEmbeddingProvider, resolveProviderKind, LocalLlamaCppProvider, OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, ModelMismatchError, assertModelCompatible, type EmbeddingProvider, type ProviderKind, type ProviderEmbedding, type ProviderEmbedOptions, type ProviderHealth, type CreateEmbeddingProviderOptions, type OpenAIProviderConfig, type LocalLlamaCppProviderConfig, type EmbedProviderConfigFile, DEFAULT_BATCH_SIZE as DEFAULT_PROVIDER_BATCH_SIZE, DEFAULT_TIMEOUT_MS as DEFAULT_PROVIDER_TIMEOUT_MS, RETRY_BACKOFFS_MS as PROVIDER_RETRY_BACKOFFS_MS, } from "./embedding/index.js";
+export { getDistinctEmbeddingModels } from "./store.js";
 /**
  * Progress info emitted during update() for each file processed.
  */

+ 6 - 0
dist/index.js

@@ -25,6 +25,12 @@ export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES };
 export { getDefaultDbPath } from "./store.js";
 // Re-export Maintenance class for CLI housekeeping operations
 export { Maintenance } from "./maintenance.js";
+// Re-export embedding provider abstraction for SDK consumers (i-qkarfffa).
+// `createEmbeddingProvider` honors QMD_EMBED_ENDPOINT / config-file / kind
+// arg precedence; default fallback is the legacy LocalLlamaCppProvider so
+// SDK code that doesn't pass `embedProvider` keeps the prior behavior.
+export { createEmbeddingProvider, resolveProviderKind, LocalLlamaCppProvider, OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, ModelMismatchError, assertModelCompatible, DEFAULT_BATCH_SIZE as DEFAULT_PROVIDER_BATCH_SIZE, DEFAULT_TIMEOUT_MS as DEFAULT_PROVIDER_TIMEOUT_MS, RETRY_BACKOFFS_MS as PROVIDER_RETRY_BACKOFFS_MS, } from "./embedding/index.js";
+export { getDistinctEmbeddingModels } from "./store.js";
 /**
  * Create a QMD store for programmatic access to search and indexing.
  *

+ 22 - 0
dist/store.d.ts

@@ -13,6 +13,7 @@
 import type { Database } from "./db.js";
 import { LlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, type ILLMSession } from "./llm.js";
 import type { NamedCollection, Collection, CollectionConfig } from "./collections.js";
+import { type EmbeddingProvider } from "./embedding/provider.js";
 export declare const DEFAULT_EMBED_MODEL = "embeddinggemma";
 export declare const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
 export declare const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
@@ -336,6 +337,16 @@ export type EmbedOptions = {
     maxBatchBytes?: number;
     chunkStrategy?: ChunkStrategy;
     onProgress?: (info: EmbedProgress) => void;
+    /**
+     * Optional embedding provider. When supplied, embeddings are routed through
+     * this provider (HTTP, GPU worker, etc.) instead of the local llama.cpp
+     * session path. The provider's `getModelId()` is verified against existing
+     * `content_vectors.model` rows; mismatch throws unless `force` is set.
+     *
+     * When omitted, behavior is identical to pre-patch: embeddings come from
+     * the store's `LlamaCpp` (or the global singleton).
+     */
+    embedProvider?: EmbeddingProvider;
 };
 /**
  * Generate vector embeddings for documents that need them.
@@ -713,6 +724,17 @@ export declare function getHashesForEmbedding(db: Database): {
  * Deletes all rows from content_vectors and drops the vectors_vec table.
  */
 export declare function clearAllEmbeddings(db: Database): void;
+/**
+ * Get the distinct set of model identifiers present in `content_vectors`.
+ * NULL and empty-string `model` values are excluded from the result.
+ *
+ * Used by the embedding migration-safety guard: if a configured provider's
+ * `getModelId()` does not appear in this list (and the list is non-empty),
+ * we refuse to embed and ask the user to run `qmd embed -f` to rebuild.
+ *
+ * Returns `[]` when no model ids are stored (e.g. fresh DB) — any provider is then allowed.
+ */
+export declare function getDistinctEmbeddingModels(db: Database): string[];
 /**
  * Insert a single embedding into both content_vectors and vectors_vec tables.
  * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.

+ 107 - 10
dist/store.js

@@ -17,6 +17,7 @@ import { readFileSync, realpathSync, statSync, mkdirSync } from "node:fs";
 // Note: node:path resolve is not imported — we export our own cross-platform resolve()
 import fastGlob from "fast-glob";
 import { LlamaCpp, getDefaultLlamaCpp, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, } from "./llm.js";
+import { assertModelCompatible, } from "./embedding/provider.js";
 // =============================================================================
 // Configuration
 // =============================================================================
@@ -1007,6 +1008,49 @@ function getEmbeddingDocsForBatch(db, batch) {
         body: bodyByHash.get(doc.hash) ?? "",
     }));
 }
+/**
+ * Run `body` with a session-shaped argument exposing `signal` and `isValid`.
+ *
+ * With `provider` set, the session is a lightweight AbortController-backed
+ * stub: `getLlm(store)` and `withLLMSessionForLlm` are never called, so
+ * node-llama-cpp is not warmed up on remote-only deployments (i-08ovbvtb,
+ * follow-up to i-qkarfffa). The stub's embed/embedBatch/expandQuery/rerank
+ * reject if called (embedding must go through the provider), its signal is
+ * aborted once `body` settles, and `options` is ignored on this path.
+ *
+ * Without `provider`, behavior is unchanged: delegates to
+ * `withLLMSessionForLlm(getLlm(store), body, options)` so that `body` can
+ * use `session.embed`/`session.embedBatch` for the local path.
+ * @returns whatever `body` resolves to (provider path) or the delegate's result
+ */
+async function withEmbedSession(store, provider, body, options) {
+    if (provider) {
+        const ac = new AbortController();
+        const fakeSession = {
+            get signal() { return ac.signal; },
+            get isValid() { return !ac.signal.aborted; },
+            embed: async () => {
+                throw new Error("withEmbedSession: provider supplied — session.embed must not be called");
+            },
+            embedBatch: async () => {
+                throw new Error("withEmbedSession: provider supplied — session.embedBatch must not be called");
+            },
+            expandQuery: async () => {
+                throw new Error("withEmbedSession: provider supplied — session.expandQuery must not be called");
+            },
+            rerank: async () => {
+                throw new Error("withEmbedSession: provider supplied — session.rerank must not be called");
+            },
+        };
+        try {
+            return await body(fakeSession);
+        }
+        finally {
+            ac.abort();
+        }
+    }
+    return withLLMSessionForLlm(getLlm(store), body, options);
+}
 /**
  * Generate vector embeddings for documents that need them.
  * Pure function — no console output, no db lifecycle management.
@@ -1018,6 +1062,14 @@ export async function generateEmbeddings(store, options) {
     const now = new Date().toISOString();
     const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
     const encoder = new TextEncoder();
+    // Migration safety: if an embedProvider is supplied, verify its model id
+    // matches the existing content_vectors rows. The guard is skipped when
+    // `force` is set — those rows are about to be cleared anyway — so users
+    // who pass `--force` aren't blocked by a mismatch.
+    if (options?.embedProvider && !options.force) {
+        const existing = getDistinctEmbeddingModels(db);
+        assertModelCompatible(options.embedProvider.getModelId(), existing);
+    }
     if (options?.force) {
         clearAllEmbeddings(db);
     }
@@ -1046,11 +1098,23 @@ export async function generateEmbeddings(store, options) {
         // global strategy — no collection overrides. Keeps SDK/inline
         // callers that never touch ~/.config/qmd working.
     }
-    // Use store's LlamaCpp or global singleton, wrapped in a session
-    const llm = getLlm(store);
-    const embedModelUri = llm.embedModelName;
-    // Create a session manager for this llm instance
-    const result = await withLLMSessionForLlm(llm, async (session) => {
+    // Provider routing — when an EmbeddingProvider is supplied, embed calls go
+    // through it (HTTP, GPU worker, etc.). Otherwise, use the LLM session path.
+    // The outer session is still created for its abort signal (chunking uses
+    // `session.signal` for cooperative cancellation).
+    const provider = options?.embedProvider;
+    const providerModel = provider?.getModelId() ?? model;
+    // Resolve `embedModelUri` (used for formatting prefixes etc.) lazily —
+    // when `provider` is set, take it from the provider; otherwise fall back
+    // to the local LlamaCpp's embed model name. Accessing `getLlm(store)` is
+    // deferred to the non-provider branch so remote-only deployments do not
+    // construct a `LlamaCpp` instance just to read its embedModelName.
+    const embedModelUri = provider
+        ? provider.getModelId()
+        : getLlm(store).embedModelName;
+    // Run the embedding loop inside a session-scoped wrapper. When `provider`
+    // is set, this short-circuits the local LLM warm-up entirely (i-08ovbvtb).
+    const result = await withEmbedSession(store, provider, async (session) => {
         let chunksEmbedded = 0;
         let errors = 0;
         let bytesProcessed = 0;
@@ -1058,6 +1122,25 @@ export async function generateEmbeddings(store, options) {
         let vectorTableInitialized = false;
         const BATCH_SIZE = 32;
         const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
+        // Embedding helpers — single point of provider/session selection.
+        // Both return the same shape as ILLMSession.embed/embedBatch so the
+        // rest of the loop is unchanged.
+        const embedOne = async (text, modelArg) => {
+            if (provider) {
+                const sig = provider.kind === 'local' ? session.signal : undefined;
+                const r = await provider.embed(text, { model: modelArg, signal: sig });
+                return r ? { embedding: r.embedding, model: r.model } : null;
+            }
+            return session.embed(text, { model: modelArg });
+        };
+        const embedMany = async (texts, modelArg) => {
+            if (provider) {
+                const sig = provider.kind === 'local' ? session.signal : undefined;
+                const r = await provider.embedBatch(texts, { model: modelArg, signal: sig });
+                return r.map((x) => (x ? { embedding: x.embedding, model: x.model } : null));
+            }
+            return session.embedBatch(texts, { model: modelArg });
+        };
         for (const batchMeta of batches) {
             // Abort early if session has been invalidated
             if (!session.isValid) {
@@ -1095,7 +1178,7 @@ export async function generateEmbeddings(store, options) {
             if (!vectorTableInitialized) {
                 const firstChunk = batchChunks[0];
                 const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
-                const firstResult = await session.embed(firstText, { model });
+                const firstResult = await embedOne(firstText, providerModel);
                 if (!firstResult) {
                     throw new Error("Failed to get embedding dimensions from first chunk");
                 }
@@ -1124,12 +1207,12 @@ export async function generateEmbeddings(store, options) {
                 const chunkBatch = batchChunks.slice(batchStart, batchEnd);
                 const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
                 try {
-                    const embeddings = await session.embedBatch(texts, { model });
+                    const embeddings = await embedMany(texts, providerModel);
                     for (let i = 0; i < chunkBatch.length; i++) {
                         const chunk = chunkBatch[i];
                         const embedding = embeddings[i];
                         if (embedding) {
-                            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
+                            insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), providerModel, now);
                             chunksEmbedded++;
                         }
                         else {
@@ -1149,9 +1232,9 @@ export async function generateEmbeddings(store, options) {
                         for (const chunk of chunkBatch) {
                             try {
                                 const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
-                                const result = await session.embed(text, { model });
+                                const result = await embedOne(text, providerModel);
                                 if (result) {
-                                    insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
+                                    insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), providerModel, now);
                                     chunksEmbedded++;
                                 }
                                 else {
@@ -2518,6 +2601,20 @@ export function clearAllEmbeddings(db) {
     db.exec(`DELETE FROM content_vectors`);
     db.exec(`DROP TABLE IF EXISTS vectors_vec`);
 }
+/**
+ * Get the distinct set of model identifiers present in `content_vectors`.
+ * NULL and empty-string `model` values are excluded from the result.
+ *
+ * Used by the embedding migration-safety guard: if a configured provider's
+ * `getModelId()` does not appear in this list (and the list is non-empty),
+ * we refuse to embed and ask the user to run `qmd embed -f` to rebuild.
+ *
+ * Returns `[]` when no model ids are stored (e.g. fresh DB) — any provider is then allowed.
+ */
+export function getDistinctEmbeddingModels(db) {
+    const rows = db.prepare(`SELECT DISTINCT model FROM content_vectors WHERE model IS NOT NULL`).all();
+    return rows.map((r) => r.model).filter((m) => typeof m === "string" && m.length > 0);
+}
 /**
  * Insert a single embedding into both content_vectors and vectors_vec tables.
  * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.

+ 1 - 0
src/cli/qmd.ts

@@ -199,6 +199,7 @@ const c = {
   green: useColor ? "\x1b[32m" : "",
   magenta: useColor ? "\x1b[35m" : "",
   blue: useColor ? "\x1b[34m" : "",
+  red: useColor ? "\x1b[31m" : "",
 };
 
 // Terminal cursor control