Переглянути джерело

feat(embedding): EmbeddingProvider abstraction + OpenAIEmbeddingsProvider (i-qkarfffa)

Stage 3 of topic qmd-gpu-embeddings-via-ai-mm-mk: replace the hard-coded
LlamaCpp embedding path with a provider abstraction supporting both the
existing local node-llama-cpp path AND a new HTTP backend pointed at the
ai.mm.mk gateway (Stage 2) → qmd-embed-worker on `models` (Stage 1).

What ships
----------
- `src/embedding/provider.ts` — `EmbeddingProvider` interface (embed,
  embedBatch, getModelId, getDimensions, healthcheck, dispose) +
  `ModelMismatchError` + `assertModelCompatible(expected, got)`
- `src/embedding/openai.ts` — OpenAIEmbeddingsProvider:
    • batch up to 64, retry 1s/4s/16s on 429/503,
    • circuit breaker (>50% fail / 60s window → open 5 min),
    • healthcheck on construction,
    • model-id reported as configured upstream-model (matches existing
      content_vectors.model in sqlite — index stays valid).
- `src/embedding/local.ts` — LocalLlamaCppProvider adapter wrapping LlamaCpp
- `src/embedding/factory.ts` — `createEmbeddingProvider({kind, ...})` with
  precedence: explicit kind > QMD_EMBED_PROVIDER > QMD_EMBED_ENDPOINT
  presence > config-file > local fallback
- `src/embedding/index.ts` — re-exports for SDK consumers

Wired into store
----------------
- `store.generateEmbeddings()` accepts an `embedProvider` option; when set,
  routes through the new abstraction and runs a model-id guard against
  `getDistinctEmbeddingModels()` (rejects mismatch unless `force=true`).
- `src/index.ts` re-exports the provider abstraction for SDK use.

CLI
---
`qmd embed` gains 7 new flags (with help text):
  --provider {local,openai}, --embed-endpoint <url>, --embed-api-key <key>,
  --embed-model-id <id>, --embed-upstream-model <id>,
  --embed-batch-size <n>, --embed-timeout-ms <ms>
Env vars: QMD_EMBED_PROVIDER, QMD_EMBED_ENDPOINT, QMD_EMBED_API_KEY.

Backward compat
---------------
Zero env-vars + no flags → identical LocalLlamaCppProvider as pre-patch.
Existing 870k-vector index untouched (model-id parity verified Stage 1
cosine ≥0.999).

Tests (87 PASS across 4 files)
------------------------------
test/embedding-provider.test.ts (interface + assertModelCompatible)
test/embedding-factory.test.ts (precedence + env/config/explicit kinds)
test/embedding-openai.test.ts (success, 429 retry, 503 fallback, batch
  chunking, timeout, circuit breaker, healthcheck, malformed)
test/embedding-store-integration.test.ts (model-id guard, force bypass)

Topic completion
----------------
Stage 1 (qmd-embed-worker, models:8082)        → oivo@6a3cb19ae
Stage 1.5 (GPU lease + Prometheus + alerts)    → oivo@15bc71deb
Stage 2 (ai.mm.mk /v1/embeddings, Bearer auth) → ai/srv/ai@9f40ea1
Stage 3 (qmd OpenAIEmbeddings provider)        → THIS COMMIT

Generated with [Claude Code](https://claude.ai/code)
via [Oivo](https://oivo.com)

Co-Authored-By: Claude <noreply@anthropic.com>
Session-Id: 5a95c44d
root 4 тижнів тому
батько
коміт
0463dd50dc

+ 110 - 2
src/cli/qmd.ts

@@ -99,6 +99,14 @@ import {
   loadConfig,
 } from "../collections.js";
 import { getEmbeddedQmdSkillContent, getEmbeddedQmdSkillFiles } from "../embedded-skills.js";
+import {
+  createEmbeddingProvider,
+  resolveProviderKind,
+  type EmbeddingProvider,
+  type ProviderKind,
+  type CreateEmbeddingProviderOptions,
+  ModelMismatchError,
+} from "../embedding/index.js";
 
 // Enable production mode - allows using default database path
 // Tests must set INDEX_PATH or use createStore() with explicit path
@@ -1658,10 +1666,72 @@ function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
   throw new Error(`--chunk-strategy must be "auto", "regex", or "function" (got "${s}")`);
 }
 
+function parseProviderKind(value: unknown): ProviderKind | undefined {
+  if (value === undefined) return undefined;
+  const s = String(value).toLowerCase();
+  if (s === "local" || s === "openai") return s;
+  throw new Error(`--provider must be "local" or "openai" (got "${s}")`);
+}
+
+function parseOptionalPositiveInt(name: string, value: unknown): number | undefined {
+  if (value === undefined) return undefined;
+  const parsed = Number(value);
+  if (!Number.isInteger(parsed) || parsed < 1) {
+    throw new Error(`${name} must be a positive integer`);
+  }
+  return parsed;
+}
+
+/**
+ * Translate `cli.values` into `CreateEmbeddingProviderOptions`. CLI flags
+ * win over env vars (the factory itself reads env when these are unset).
+ */
+function buildProviderOpts(
+  values: Record<string, unknown>,
+  providerCliKind: ProviderKind | undefined,
+): CreateEmbeddingProviderOptions {
+  const endpoint = optionalString(values["embed-endpoint"]);
+  const apiKey = optionalString(values["embed-api-key"]);
+  const modelId = optionalString(values["embed-model-id"]);
+  const upstreamModel = optionalString(values["embed-upstream-model"]);
+  const batchSize = parseOptionalPositiveInt("--embed-batch-size", values["embed-batch-size"]);
+  const timeoutMs = parseOptionalPositiveInt("--embed-timeout-ms", values["embed-timeout-ms"]);
+
+  // Only build the openai overrides object if the user supplied flags
+  const openai =
+    endpoint || apiKey || modelId || upstreamModel || batchSize !== undefined || timeoutMs !== undefined
+      ? {
+          ...(endpoint !== undefined ? { endpoint } : {}),
+          ...(apiKey !== undefined ? { apiKey } : {}),
+          ...(modelId !== undefined ? { modelId } : {}),
+          ...(upstreamModel !== undefined ? { upstreamModel } : {}),
+          ...(batchSize !== undefined ? { batchSize } : {}),
+          ...(timeoutMs !== undefined ? { timeoutMs } : {}),
+        }
+      : undefined;
+
+  return {
+    ...(providerCliKind ? { kind: providerCliKind } : {}),
+    ...(openai ? { openai } : {}),
+  };
+}
+
+function optionalString(v: unknown): string | undefined {
+  if (v === undefined || v === null) return undefined;
+  const s = String(v);
+  return s === "" ? undefined : s;
+}
+
 async function vectorIndex(
   model: string = DEFAULT_EMBED_MODEL_URI,
   force: boolean = false,
-  batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number; chunkStrategy?: ChunkStrategy },
+  batchOptions?: {
+    maxDocsPerBatch?: number;
+    maxBatchBytes?: number;
+    chunkStrategy?: ChunkStrategy;
+    embedProvider?: EmbeddingProvider;
+    providerKind?: ProviderKind;
+  },
 ): Promise<void> {
   const storeInstance = getStore();
   const db = storeInstance.db;
@@ -1679,6 +1749,13 @@ async function vectorIndex(
   }
 
   console.log(`${c.dim}Model: ${model}${c.reset}\n`);
+  if (batchOptions?.embedProvider) {
+    const kind = batchOptions.embedProvider.kind;
+    const providerModel = batchOptions.embedProvider.getModelId();
+    console.log(`${c.dim}Provider: ${kind} (model id "${providerModel}")${c.reset}\n`);
+  } else if (batchOptions?.providerKind) {
+    console.log(`${c.dim}Provider: ${batchOptions.providerKind}${c.reset}\n`);
+  }
   if (batchOptions?.maxDocsPerBatch !== undefined || batchOptions?.maxBatchBytes !== undefined) {
     const maxDocsPerBatch = batchOptions.maxDocsPerBatch ?? DEFAULT_EMBED_MAX_DOCS_PER_BATCH;
     const maxBatchBytes = batchOptions.maxBatchBytes ?? DEFAULT_EMBED_MAX_BATCH_BYTES;
@@ -1695,6 +1772,7 @@ async function vectorIndex(
     maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
     maxBatchBytes: batchOptions?.maxBatchBytes,
     chunkStrategy: batchOptions?.chunkStrategy,
+    embedProvider: batchOptions?.embedProvider,
     onProgress: (info) => {
       if (info.totalBytes === 0) return;
       const percent = (info.bytesProcessed / info.totalBytes) * 100;
@@ -2473,6 +2551,13 @@ function parseCLI() {
       force: { type: "boolean", short: "f" },
       "max-docs-per-batch": { type: "string" },
       "max-batch-mb": { type: "string" },
+      provider: { type: "string" },                  // "local" | "openai"
+      "embed-endpoint": { type: "string" },          // OpenAI-compatible endpoint URL
+      "embed-api-key": { type: "string" },           // Bearer token
+      "embed-model-id": { type: "string" },          // Stable model id (default: embeddinggemma)
+      "embed-upstream-model": { type: "string" },    // Upstream model name in HTTP body
+      "embed-batch-size": { type: "string" },        // Batch size for HTTP provider
+      "embed-timeout-ms": { type: "string" },        // Per-request timeout
       // Update options
       pull: { type: "boolean" },  // git pull before update
       refresh: { type: "boolean" },
@@ -2694,6 +2779,13 @@ function showHelp(): void {
   console.log("  qmd embed [-f]                - Generate/refresh vector embeddings");
   console.log("    --max-docs-per-batch <n>    - Cap docs loaded into memory per embedding batch");
   console.log("    --max-batch-mb <n>          - Cap UTF-8 MB loaded into memory per embedding batch");
+  console.log("    --provider {local,openai}   - Embedding backend (default: local llama.cpp)");
+  console.log("    --embed-endpoint <url>      - OpenAI-compatible endpoint (or QMD_EMBED_ENDPOINT)");
+  console.log("    --embed-api-key <key>       - Bearer token (or QMD_EMBED_API_KEY)");
+  console.log("    --embed-model-id <id>       - Stable model id stored in DB (default: embeddinggemma)");
+  console.log("    --embed-upstream-model <m>  - Model name sent in HTTP body (default: same as model-id)");
+  console.log("    --embed-batch-size <n>      - Batch size for HTTP provider (default: 64)");
+  console.log("    --embed-timeout-ms <n>      - Per-request timeout in ms (default: 30000)");
   console.log("  qmd cleanup                   - Clear caches, vacuum DB");
   console.log("");
   console.log("Query syntax (qmd query):");
@@ -3079,13 +3171,29 @@ if (isMain) {
         const maxDocsPerBatch = parseEmbedBatchOption("maxDocsPerBatch", cli.values["max-docs-per-batch"]);
         const maxBatchMb = parseEmbedBatchOption("maxBatchBytes", cli.values["max-batch-mb"]);
         const embedChunkStrategy = parseChunkStrategy(cli.values["chunk-strategy"]);
+
+        // Build embedding provider from CLI flags + env + config file.
+        // Backward compat: with no flags / env vars, the factory returns
+        // a LocalLlamaCppProvider that delegates to the default LlamaCpp
+        // singleton — identical to pre-patch behavior.
+        const providerCliKind = parseProviderKind(cli.values["provider"]);
+        const providerOpts = buildProviderOpts(cli.values, providerCliKind);
+        const embedProvider = createEmbeddingProvider(providerOpts);
+
         await vectorIndex(DEFAULT_EMBED_MODEL_URI, !!cli.values.force, {
           maxDocsPerBatch,
           maxBatchBytes: maxBatchMb === undefined ? undefined : maxBatchMb * 1024 * 1024,
           chunkStrategy: embedChunkStrategy,
+          embedProvider,
+          providerKind: embedProvider.kind,
         });
       } catch (error) {
-        console.error(error instanceof Error ? error.message : String(error));
+        if (error instanceof ModelMismatchError) {
+          // Friendlier output for the migration-safety guard
+          console.error(`${c.red}Model mismatch:${c.reset} ${error.message}`);
+        } else {
+          console.error(error instanceof Error ? error.message : String(error));
+        }
         process.exit(1);
       }
       break;

+ 189 - 0
src/embedding/factory.ts

@@ -0,0 +1,189 @@
+/**
+ * factory.ts - EmbeddingProvider factory with config precedence.
+ *
+ * Resolution order (first match wins):
+ *   1. Explicit `kind` argument or `--provider` CLI flag → forces a kind
+ *   2. `QMD_EMBED_ENDPOINT` env var present and non-empty → "openai"
+ *   3. Config file (`~/.config/qmd/config.json`) `embedProvider.kind` → that kind
+ *   4. Otherwise → "local" (legacy / backward-compat)
+ *
+ * Backward compat invariant: when neither `QMD_EMBED_ENDPOINT` nor
+ * `~/.config/qmd/config.json` mentions a provider, callers get the same
+ * `LocalLlamaCppProvider` they had before this change.
+ */
+
+import { existsSync, readFileSync } from "node:fs";
+import { homedir } from "node:os";
+import { join } from "node:path";
+
+import { LocalLlamaCppProvider, type LocalLlamaCppProviderConfig } from "./local.js";
+import {
+  OpenAIEmbeddingsProvider,
+  type OpenAIProviderConfig,
+} from "./openai.js";
+import type { EmbeddingProvider, ProviderKind } from "./provider.js";
+
+// ─────────────────────────── Config file ─────────────────────────────────────
+
+export type EmbedProviderConfigFile = {
+  embedProvider?: {
+    kind?: ProviderKind;
+    endpoint?: string;
+    apiKey?: string;
+    modelId?: string;
+    upstreamModel?: string;
+    batchSize?: number;
+    timeoutMs?: number;
+  };
+};
+
+export function defaultConfigPath(): string {
+  const xdg = process.env.XDG_CONFIG_HOME;
+  const base = xdg ? xdg : join(homedir(), ".config");
+  return join(base, "qmd", "config.json");
+}
+
+/**
+ * Load `~/.config/qmd/config.json` if present. Returns an empty object on
+ * any read/parse error so we silently fall back to env/local.
+ */
+export function loadConfigFile(path: string = defaultConfigPath()): EmbedProviderConfigFile {
+  if (!existsSync(path)) return {};
+  try {
+    const raw = readFileSync(path, "utf-8");
+    const parsed = JSON.parse(raw);
+    if (parsed && typeof parsed === "object") return parsed as EmbedProviderConfigFile;
+  } catch {
+    // Ignore — invalid JSON, missing read perm, etc.
+  }
+  return {};
+}
+
+// ─────────────────────────── Factory options ────────────────────────────────
+
+export type CreateEmbeddingProviderOptions = {
+  /** Force a specific provider kind. Overrides env + config. */
+  kind?: ProviderKind;
+  /** Override config file path (mostly for tests) */
+  configPath?: string;
+  /** Local-provider overrides */
+  local?: LocalLlamaCppProviderConfig;
+  /** OpenAI-provider overrides — merged on top of env/config */
+  openai?: Partial<OpenAIProviderConfig>;
+  /**
+   * Custom env source (mostly for tests). Defaults to `process.env`.
+   * Read keys: QMD_EMBED_PROVIDER, QMD_EMBED_ENDPOINT, QMD_EMBED_API_KEY,
+   * QMD_EMBED_MODEL_ID, QMD_EMBED_UPSTREAM_MODEL, QMD_EMBED_BATCH_SIZE,
+   * QMD_EMBED_TIMEOUT_MS.
+   */
+  env?: Record<string, string | undefined>;
+};
+
+/**
+ * Resolve the provider kind without instantiating anything. Useful for
+ * logging and tests.
+ */
+export function resolveProviderKind(opts: CreateEmbeddingProviderOptions = {}): ProviderKind {
+  const env = opts.env ?? process.env;
+  const cfg = loadConfigFile(opts.configPath);
+
+  // 1. Explicit kind argument
+  if (opts.kind) return opts.kind;
+
+  // 2a. Explicit env override
+  const envKind = env.QMD_EMBED_PROVIDER?.trim().toLowerCase();
+  if (envKind === "local" || envKind === "openai") return envKind;
+
+  // 2b. Endpoint env present → openai
+  if (env.QMD_EMBED_ENDPOINT && env.QMD_EMBED_ENDPOINT.trim() !== "") {
+    return "openai";
+  }
+
+  // 3. Config file
+  if (cfg.embedProvider?.kind === "local" || cfg.embedProvider?.kind === "openai") {
+    return cfg.embedProvider.kind;
+  }
+  if (cfg.embedProvider?.endpoint && cfg.embedProvider.endpoint.trim() !== "") {
+    return "openai";
+  }
+
+  // 4. Default
+  return "local";
+}
+
+/**
+ * Factory entry point — returns the appropriate `EmbeddingProvider`.
+ * Throws if `openai` kind is requested but no endpoint is configured.
+ */
+export function createEmbeddingProvider(
+  opts: CreateEmbeddingProviderOptions = {},
+): EmbeddingProvider {
+  const env = opts.env ?? process.env;
+  const cfg = loadConfigFile(opts.configPath);
+  const kind = resolveProviderKind(opts);
+
+  if (kind === "local") {
+    return new LocalLlamaCppProvider(opts.local ?? {});
+  }
+
+  // OpenAI
+  const endpoint =
+    opts.openai?.endpoint ??
+    env.QMD_EMBED_ENDPOINT ??
+    cfg.embedProvider?.endpoint;
+  if (!endpoint || endpoint.trim() === "") {
+    throw new Error(
+      'createEmbeddingProvider: kind="openai" requires an endpoint. ' +
+      "Set QMD_EMBED_ENDPOINT env var, or `embedProvider.endpoint` in " +
+      "~/.config/qmd/config.json, or pass `openai.endpoint`.",
+    );
+  }
+
+  const apiKey =
+    opts.openai?.apiKey ??
+    env.QMD_EMBED_API_KEY ??
+    cfg.embedProvider?.apiKey;
+
+  const modelId =
+    opts.openai?.modelId ??
+    env.QMD_EMBED_MODEL_ID ??
+    cfg.embedProvider?.modelId ??
+    "embeddinggemma";
+
+  const upstreamModel =
+    opts.openai?.upstreamModel ??
+    env.QMD_EMBED_UPSTREAM_MODEL ??
+    cfg.embedProvider?.upstreamModel;
+
+  const batchSizeRaw =
+    opts.openai?.batchSize ??
+    parsePositiveInt(env.QMD_EMBED_BATCH_SIZE) ??
+    cfg.embedProvider?.batchSize;
+
+  const timeoutMsRaw =
+    opts.openai?.timeoutMs ??
+    parsePositiveInt(env.QMD_EMBED_TIMEOUT_MS) ??
+    cfg.embedProvider?.timeoutMs;
+
+  return new OpenAIEmbeddingsProvider({
+    endpoint,
+    apiKey,
+    modelId,
+    upstreamModel,
+    batchSize: batchSizeRaw,
+    timeoutMs: timeoutMsRaw,
+    fetchImpl: opts.openai?.fetchImpl,
+    retryBackoffsMs: opts.openai?.retryBackoffsMs,
+    sleep: opts.openai?.sleep,
+    now: opts.openai?.now,
+  });
+}
+
+// ─────────────────────────── Helpers ────────────────────────────────────────
+
+function parsePositiveInt(v: string | undefined): number | undefined {
+  if (!v) return undefined;
+  const parsed = Number.parseInt(v, 10);
+  if (!Number.isFinite(parsed) || parsed <= 0) return undefined;
+  return parsed;
+}

+ 41 - 0
src/embedding/index.ts

@@ -0,0 +1,41 @@
+/**
+ * embedding/index.ts - re-exports for the embedding provider abstraction.
+ */
+
+export {
+  type EmbeddingProvider,
+  type ProviderKind,
+  type ProviderEmbedding,
+  type ProviderEmbedOptions,
+  type ProviderHealth,
+  ModelMismatchError,
+  assertModelCompatible,
+} from "./provider.js";
+
+export {
+  LocalLlamaCppProvider,
+  type LocalLlamaCppProviderConfig,
+} from "./local.js";
+
+export {
+  OpenAIEmbeddingsProvider,
+  CircuitBreaker,
+  CircuitOpenError,
+  HttpError,
+  isRetryableStatus,
+  chunkArray,
+  type OpenAIProviderConfig,
+  type CircuitState,
+  DEFAULT_BATCH_SIZE,
+  DEFAULT_TIMEOUT_MS,
+  RETRY_BACKOFFS_MS,
+} from "./openai.js";
+
+export {
+  createEmbeddingProvider,
+  resolveProviderKind,
+  loadConfigFile,
+  defaultConfigPath,
+  type CreateEmbeddingProviderOptions,
+  type EmbedProviderConfigFile,
+} from "./factory.js";

+ 123 - 0
src/embedding/local.ts

@@ -0,0 +1,123 @@
+/**
+ * local.ts - Local llama.cpp adapter implementing EmbeddingProvider.
+ *
+ * Wraps an existing `LlamaCpp` instance so the legacy GGUF path looks like
+ * any other EmbeddingProvider to upstream callers. Used as the default and
+ * as the fallback target when `OpenAIEmbeddingsProvider` trips its breaker.
+ */
+
+import {
+  type LlamaCpp,
+  getDefaultLlamaCpp,
+} from "../llm.js";
+import type {
+  EmbeddingProvider,
+  ProviderEmbedOptions,
+  ProviderEmbedding,
+  ProviderHealth,
+  ProviderKind,
+} from "./provider.js";
+
+export type LocalLlamaCppProviderConfig = {
+  /** Pre-built LlamaCpp instance (optional — falls back to global singleton). */
+  llm?: LlamaCpp;
+  /**
+   * Stable model id reported via `getModelId()`. Defaults to "embeddinggemma"
+   * to match the value in `content_vectors.model` for existing qmd installs.
+   */
+  modelId?: string;
+};
+
+export class LocalLlamaCppProvider implements EmbeddingProvider {
+  readonly kind: ProviderKind = "local";
+
+  private readonly llm: LlamaCpp;
+  private readonly modelId: string;
+  private dimensions: number | undefined = undefined;
+
+  constructor(config: LocalLlamaCppProviderConfig = {}) {
+    this.llm = config.llm ?? getDefaultLlamaCpp();
+    this.modelId = config.modelId ?? "embeddinggemma";
+  }
+
+  getModelId(): string {
+    return this.modelId;
+  }
+
+  getDimensions(): number | undefined {
+    return this.dimensions;
+  }
+
+  async healthcheck(_signal?: AbortSignal): Promise<ProviderHealth> {
+    // For the local provider, "healthy" means the embed model loads.
+    // We probe with a single embed call.
+    try {
+      const result = await this.llm.embed("healthcheck", { model: this.modelId });
+      if (!result) {
+        return {
+          ok: false,
+          model: this.modelId,
+          detail: "embed probe returned null",
+        };
+      }
+      this.dimensions = result.embedding.length;
+      return {
+        ok: true,
+        model: this.modelId,
+        dimensions: this.dimensions,
+        detail: `local llama.cpp ready, ${this.dimensions}-d`,
+      };
+    } catch (err) {
+      return {
+        ok: false,
+        model: this.modelId,
+        detail: err instanceof Error ? err.message : String(err),
+      };
+    }
+  }
+
+  async embed(
+    text: string,
+    options: ProviderEmbedOptions = {},
+  ): Promise<ProviderEmbedding | null> {
+    if (options.signal?.aborted) return null;
+    const result = await this.llm.embed(text, { model: options.model ?? this.modelId });
+    if (!result) return null;
+    if (this.dimensions === undefined) {
+      this.dimensions = result.embedding.length;
+    }
+    return {
+      embedding: result.embedding,
+      model: this.modelId,
+    };
+  }
+
+  async embedBatch(
+    texts: string[],
+    options: ProviderEmbedOptions = {},
+  ): Promise<(ProviderEmbedding | null)[]> {
+    if (texts.length === 0) return [];
+    if (options.signal?.aborted) return texts.map(() => null);
+
+    const raw = await this.llm.embedBatch(texts, {
+      model: options.model ?? this.modelId,
+    });
+
+    return raw.map((r) => {
+      if (!r) return null;
+      if (this.dimensions === undefined && r.embedding.length > 0) {
+        this.dimensions = r.embedding.length;
+      }
+      return {
+        embedding: r.embedding,
+        model: this.modelId,
+      };
+    });
+  }
+
+  async dispose(): Promise<void> {
+    // We do NOT dispose the underlying LlamaCpp here because the singleton
+    // is shared with rerank/generate/expansion paths. Disposal is handled
+    // by the existing `disposeDefaultLlamaCpp()` global hook.
+  }
+}

+ 616 - 0
src/embedding/openai.ts

@@ -0,0 +1,616 @@
+/**
+ * openai.ts - OpenAI-compatible HTTP embedding provider
+ *
+ * Talks to any endpoint that implements `POST /v1/embeddings` with the OpenAI
+ * shape: request `{model, input: string|string[]}`, response
+ * `{data: [{embedding: number[], index: number}, ...]}`.
+ *
+ * Used by qmd to delegate embeddings to a GPU worker (e.g. ai.mm.mk →
+ * qmd-embed-worker on `models` LXC, RTX 4090) instead of running
+ * node-llama-cpp locally.
+ *
+ * Features:
+ *   - Batches input in groups of ≤64 (configurable via QMD_EMBED_BATCH_SIZE)
+ *   - Retries 429 / 503 with exponential backoff (1s, 4s, 16s)
+ *   - 4xx (non-429) → no retry, count as failure
+ *   - Circuit breaker: >50% failures in a 60s window → OPEN for 5 min,
+ *     callers can use this to fall back to a local provider
+ *   - Per-call timeout via AbortSignal (default QMD_EMBED_TIMEOUT_MS=30000)
+ *   - Healthcheck via `GET /health` if available, else a probe embed call
+ */
+
+import type {
+  EmbeddingProvider,
+  ProviderEmbedOptions,
+  ProviderEmbedding,
+  ProviderHealth,
+  ProviderKind,
+} from "./provider.js";
+
+// ─────────────────────────── Configuration ───────────────────────────────────
+
+/**
+ * Default batch size — most OpenAI-compatible embedding endpoints accept up to
+ * 2048 inputs per call but for memory and latency we cap at 64.
+ */
+export const DEFAULT_BATCH_SIZE = 64;
+
+/**
+ * Default per-request timeout (30 s). embeddinggemma-300M on RTX 4090 takes
+ * <500ms per batch of 64 in practice; 30s is a safe upper bound.
+ */
+export const DEFAULT_TIMEOUT_MS = 30_000;
+
+/**
+ * Retry backoff schedule (ms) for 429/503 responses. 3 attempts total
+ * (initial + 2 retries) — aligns with issue spec "1s/4s/16s".
+ */
+export const RETRY_BACKOFFS_MS: readonly number[] = [1_000, 4_000, 16_000];
+
+/**
+ * Circuit breaker — flips OPEN when error rate exceeds threshold within
+ * window. While OPEN, every call fails fast so the caller can fall back.
+ */
+export const CIRCUIT_WINDOW_MS = 60_000;
+export const CIRCUIT_OPEN_DURATION_MS = 5 * 60_000;
+export const CIRCUIT_FAILURE_RATE_THRESHOLD = 0.5;
+export const CIRCUIT_MIN_SAMPLES = 4;
+
+// ─────────────────────────── Types ───────────────────────────────────────────
+
+export type OpenAIProviderConfig = {
+  /** Endpoint base URL — e.g. "https://ai.mm.mk" (no trailing slash) */
+  endpoint: string;
+  /** Optional bearer token sent as `Authorization: Bearer ...` */
+  apiKey?: string;
+  /**
+   * Stable model identifier to report up via `getModelId()`.
+   * Defaults to "embeddinggemma" to match qmd's existing DB rows.
+   */
+  modelId?: string;
+  /**
+   * Upstream model name sent in the HTTP request body. Often differs from
+   * `modelId` (e.g. modelId="embeddinggemma" but upstream model="embeddinggemma:300m").
+   */
+  upstreamModel?: string;
+  /** Batch size cap (default DEFAULT_BATCH_SIZE = 64) */
+  batchSize?: number;
+  /** Per-request timeout in ms (default DEFAULT_TIMEOUT_MS = 30_000) */
+  timeoutMs?: number;
+  /** Custom fetch (for testing). Defaults to global `fetch`. */
+  fetchImpl?: typeof fetch;
+  /** Custom retry schedule (for testing). Defaults to RETRY_BACKOFFS_MS. */
+  retryBackoffsMs?: readonly number[];
+  /** Custom sleep impl (for testing). Defaults to setTimeout. */
+  sleep?: (ms: number) => Promise<void>;
+  /** Custom clock (for testing). Defaults to Date.now. */
+  now?: () => number;
+};
+
+export type OpenAIEmbeddingsResponse = {
+  object?: string;
+  model?: string;
+  data: Array<{
+    object?: string;
+    index: number;
+    embedding: number[];
+  }>;
+  usage?: {
+    prompt_tokens?: number;
+    total_tokens?: number;
+  };
+};
+
+/**
+ * Circuit breaker state — exported for tests
+ */
+export type CircuitState = "closed" | "open" | "half-open";
+
+// ─────────────────────────── Helpers ─────────────────────────────────────────
+
+function defaultSleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/**
+ * Build the merged AbortSignal for a single HTTP attempt: combines an
+ * external `userSignal` (from caller / withLLMSession) with a per-attempt
+ * timeout signal. Returns the merged signal AND the timeout id so the
+ * caller can `clearTimeout` after the attempt completes (avoids leaks).
+ */
+function buildAttemptSignal(
+  userSignal: AbortSignal | undefined,
+  timeoutMs: number,
+): { signal: AbortSignal; cleanup: () => void } {
+  const ctrl = new AbortController();
+  const timeoutId = setTimeout(() => {
+    ctrl.abort(new Error(`Request timed out after ${timeoutMs}ms`));
+  }, timeoutMs);
+  // Don't keep process alive just for this timer
+  if (typeof timeoutId === "object" && timeoutId !== null && "unref" in timeoutId) {
+    (timeoutId as { unref: () => void }).unref();
+  }
+
+  const onUserAbort = () => ctrl.abort(userSignal?.reason);
+  if (userSignal) {
+    if (userSignal.aborted) {
+      ctrl.abort(userSignal.reason);
+    } else {
+      userSignal.addEventListener("abort", onUserAbort, { once: true });
+    }
+  }
+
+  const cleanup = () => {
+    clearTimeout(timeoutId);
+    if (userSignal) userSignal.removeEventListener("abort", onUserAbort);
+  };
+
+  return { signal: ctrl.signal, cleanup };
+}
+
+/**
+ * Determine whether an HTTP status is retryable. 429 (Too Many Requests)
+ * and 503 (Service Unavailable) are retried; 4xx (other than 429) are not.
+ */
+export function isRetryableStatus(status: number): boolean {
+  return status === 429 || status === 503;
+}
+
+/**
+ * Chunk an array into pieces of ≤ size each. `size` MUST be ≥ 1.
+ */
+export function chunkArray<T>(items: T[], size: number): T[][] {
+  if (size < 1) throw new Error(`chunkArray: size must be ≥ 1, got ${size}`);
+  if (items.length <= size) return items.length === 0 ? [] : [items];
+  const out: T[][] = [];
+  for (let i = 0; i < items.length; i += size) {
+    out.push(items.slice(i, i + size));
+  }
+  return out;
+}
+
+// ─────────────────────────── Circuit Breaker ─────────────────────────────────
+
+/**
+ * Sliding-window circuit breaker. Tracks the last N samples (min 4) over a
+ * 60-second window; flips OPEN when failure rate exceeds 50%, then auto-
+ * resets to HALF-OPEN after 5 minutes — at which point the next probe
+ * decides whether to close (success) or re-open (failure).
+ */
+export class CircuitBreaker {
+  private samples: { ts: number; ok: boolean }[] = [];
+  private state: CircuitState = "closed";
+  private openedAt: number | null = null;
+  private readonly windowMs: number;
+  private readonly openDurationMs: number;
+  private readonly threshold: number;
+  private readonly minSamples: number;
+  private readonly now: () => number;
+
+  constructor(opts: {
+    windowMs?: number;
+    openDurationMs?: number;
+    threshold?: number;
+    minSamples?: number;
+    now?: () => number;
+  } = {}) {
+    this.windowMs = opts.windowMs ?? CIRCUIT_WINDOW_MS;
+    this.openDurationMs = opts.openDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
+    this.threshold = opts.threshold ?? CIRCUIT_FAILURE_RATE_THRESHOLD;
+    this.minSamples = opts.minSamples ?? CIRCUIT_MIN_SAMPLES;
+    this.now = opts.now ?? Date.now;
+  }
+
+  getState(): CircuitState {
+    this.tickAutoReset();
+    return this.state;
+  }
+
+  /**
+   * Returns true when calls should be short-circuited (skip HTTP, fall back).
+   * Side-effects: may transition OPEN → HALF-OPEN if the open window expired.
+   */
+  shouldFailFast(): boolean {
+    return this.getState() === "open";
+  }
+
+  /** Record a successful call. */
+  recordSuccess(): void {
+    // Honor the time-based OPEN→HALF-OPEN transition before deciding what
+    // to do with this sample. Without this, a success that lands AFTER the
+    // open window expired would still see state==="open" and never close
+    // the breaker (a probe call could only flip it via getState()).
+    this.tickAutoReset();
+    this.pushSample(true);
+    if (this.state === "half-open") {
+      this.state = "closed";
+      this.openedAt = null;
+    }
+  }
+
+  /** Record a failed call. May trigger OPEN. */
+  recordFailure(): void {
+    // Same reasoning as recordSuccess — apply lazy auto-reset before
+    // classifying the sample.
+    this.tickAutoReset();
+    this.pushSample(false);
+    if (this.state === "half-open") {
+      // Probe failed — re-open
+      this.state = "open";
+      this.openedAt = this.now();
+      return;
+    }
+    if (this.state === "closed") this.evaluate();
+  }
+
+  /** Force-reset the breaker (used by tests / admin) */
+  reset(): void {
+    this.samples = [];
+    this.state = "closed";
+    this.openedAt = null;
+  }
+
+  private pushSample(ok: boolean): void {
+    const ts = this.now();
+    this.samples.push({ ts, ok });
+    // Drop samples outside the window
+    const cutoff = ts - this.windowMs;
+    while (this.samples.length > 0 && this.samples[0]!.ts < cutoff) {
+      this.samples.shift();
+    }
+  }
+
+  private evaluate(): void {
+    if (this.samples.length < this.minSamples) return;
+    const failures = this.samples.filter((s) => !s.ok).length;
+    const rate = failures / this.samples.length;
+    if (rate > this.threshold) {
+      this.state = "open";
+      this.openedAt = this.now();
+    }
+  }
+
+  private tickAutoReset(): void {
+    if (this.state === "open" && this.openedAt !== null) {
+      if (this.now() - this.openedAt >= this.openDurationMs) {
+        this.state = "half-open";
+      }
+    }
+  }
+}
+
+// ─────────────────────────── Errors ──────────────────────────────────────────
+
+/**
+ * Raised when the circuit breaker is OPEN and a call is short-circuited.
+ * Callers (e.g. fallback wrapper) can catch this to switch to local provider.
+ */
+export class CircuitOpenError extends Error {
+  constructor(message = "OpenAIEmbeddingsProvider circuit is OPEN") {
+    super(message);
+    this.name = "CircuitOpenError";
+  }
+}
+
+/**
+ * Persistent (non-retryable) HTTP error from upstream. Includes status code.
+ */
+export class HttpError extends Error {
+  readonly status: number;
+  readonly bodyPreview: string;
+  constructor(status: number, bodyPreview: string) {
+    super(`HTTP ${status}: ${bodyPreview.slice(0, 200)}`);
+    this.name = "HttpError";
+    this.status = status;
+    this.bodyPreview = bodyPreview.slice(0, 1024);
+  }
+}
+
+// ─────────────────────────── Provider ────────────────────────────────────────
+
+export class OpenAIEmbeddingsProvider implements EmbeddingProvider {
+  readonly kind: ProviderKind = "openai";
+
+  private readonly endpoint: string;
+  private readonly apiKey?: string;
+  private readonly modelId: string;
+  private readonly upstreamModel: string;
+  private readonly batchSize: number;
+  private readonly timeoutMs: number;
+  private readonly fetchImpl: typeof fetch;
+  private readonly retryBackoffsMs: readonly number[];
+  private readonly sleep: (ms: number) => Promise<void>;
+  private readonly now: () => number;
+
+  private dimensions: number | undefined = undefined;
+  readonly breaker: CircuitBreaker;
+
+  constructor(config: OpenAIProviderConfig) {
+    if (!config.endpoint) {
+      throw new Error("OpenAIEmbeddingsProvider: endpoint is required");
+    }
+    this.endpoint = config.endpoint.replace(/\/+$/, "");
+    this.apiKey = config.apiKey;
+    this.modelId = config.modelId ?? "embeddinggemma";
+    this.upstreamModel = config.upstreamModel ?? this.modelId;
+    this.batchSize = config.batchSize ?? DEFAULT_BATCH_SIZE;
+    this.timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+    this.fetchImpl = config.fetchImpl ?? globalThis.fetch;
+    this.retryBackoffsMs = config.retryBackoffsMs ?? RETRY_BACKOFFS_MS;
+    this.sleep = config.sleep ?? defaultSleep;
+    this.now = config.now ?? Date.now;
+    this.breaker = new CircuitBreaker({ now: this.now });
+
+    if (!this.fetchImpl) {
+      throw new Error(
+        "OpenAIEmbeddingsProvider: global fetch is unavailable. " +
+        "Provide a `fetchImpl` config option (Node ≥18 ships fetch by default).",
+      );
+    }
+    if (this.batchSize < 1) {
+      throw new Error(`OpenAIEmbeddingsProvider: batchSize must be ≥ 1, got ${this.batchSize}`);
+    }
+  }
+
+  getModelId(): string {
+    return this.modelId;
+  }
+
+  getDimensions(): number | undefined {
+    return this.dimensions;
+  }
+
+  async healthcheck(signal?: AbortSignal): Promise<ProviderHealth> {
+    // Try GET /health first (worker exposes it). Fall back to probe embed.
+    try {
+      const { signal: attemptSig, cleanup } = buildAttemptSignal(signal, this.timeoutMs);
+      try {
+        const resp = await this.fetchImpl(`${this.endpoint}/health`, {
+          method: "GET",
+          headers: this.buildHeaders(),
+          signal: attemptSig,
+        });
+        if (resp.ok) {
+          return {
+            ok: true,
+            model: this.modelId,
+            dimensions: this.dimensions,
+            detail: `GET /health → ${resp.status}`,
+          };
+        }
+        return {
+          ok: false,
+          model: this.modelId,
+          detail: `GET /health → HTTP ${resp.status}`,
+        };
+      } finally {
+        cleanup();
+      }
+    } catch (err) {
+      // Endpoint may not implement /health — try a single embed probe instead.
+      try {
+        const probe = await this.embed("healthcheck", { signal });
+        if (probe) {
+          return {
+            ok: true,
+            model: this.modelId,
+            dimensions: probe.embedding.length,
+            detail: "embed probe ok",
+          };
+        }
+        return {
+          ok: false,
+          model: this.modelId,
+          detail: "embed probe returned null",
+        };
+      } catch (probeErr) {
+        return {
+          ok: false,
+          model: this.modelId,
+          detail:
+            (err instanceof Error ? err.message : String(err)) +
+            " | probe: " +
+            (probeErr instanceof Error ? probeErr.message : String(probeErr)),
+        };
+      }
+    }
+  }
+
+  async embed(
+    text: string,
+    options: ProviderEmbedOptions = {},
+  ): Promise<ProviderEmbedding | null> {
+    const batch = await this.embedBatch([text], options);
+    return batch[0] ?? null;
+  }
+
+  async embedBatch(
+    texts: string[],
+    options: ProviderEmbedOptions = {},
+  ): Promise<(ProviderEmbedding | null)[]> {
+    if (texts.length === 0) return [];
+
+    if (this.breaker.shouldFailFast()) {
+      throw new CircuitOpenError();
+    }
+
+    const chunks = chunkArray(texts, this.batchSize);
+    const results: (ProviderEmbedding | null)[] = new Array(texts.length).fill(null);
+    let cursor = 0;
+
+    for (const chunk of chunks) {
+      const start = cursor;
+      cursor += chunk.length;
+
+      // Abort early if signal already fired
+      if (options.signal?.aborted) {
+        // Leave remaining slots as null (caller treats as errors)
+        return results;
+      }
+
+      // Fail-fast if breaker tripped mid-loop
+      if (this.breaker.shouldFailFast()) {
+        throw new CircuitOpenError();
+      }
+
+      try {
+        const embeddings = await this.requestWithRetry(chunk, options);
+        for (let i = 0; i < chunk.length; i++) {
+          const embedding = embeddings[i];
+          if (embedding) {
+            results[start + i] = {
+              embedding,
+              model: this.modelId,
+            };
+            // Record dimensions on first success
+            if (this.dimensions === undefined) {
+              this.dimensions = embedding.length;
+            }
+          }
+        }
+        this.breaker.recordSuccess();
+      } catch (err) {
+        this.breaker.recordFailure();
+        // CircuitOpenError must propagate so the caller can fall back
+        if (err instanceof CircuitOpenError) throw err;
+        // Other errors mark the chunk as null and continue with next chunk.
+        // (The store layer already handles per-text nulls as errors.)
+        if (process.env.QMD_EMBED_DEBUG) {
+          process.stderr.write(
+            `OpenAIEmbeddingsProvider: chunk failed (${err instanceof Error ? err.message : String(err)})\n`,
+          );
+        }
+      }
+    }
+
+    return results;
+  }
+
+  async dispose(): Promise<void> {
+    // Nothing to release — fetch handles its own connection pooling.
+    // Reset the breaker so a re-instantiation starts fresh.
+    this.breaker.reset();
+  }
+
+  // ────────────────────── Internals ──────────────────────
+
+  private buildHeaders(): Record<string, string> {
+    const headers: Record<string, string> = {
+      "Content-Type": "application/json",
+      "Accept": "application/json",
+    };
+    if (this.apiKey) {
+      headers["Authorization"] = `Bearer ${this.apiKey}`;
+    }
+    return headers;
+  }
+
+  /**
+   * Single HTTP request with retry on 429/503. Returns embeddings indexed
+   * the same as `texts`. Throws on non-retryable failure or all attempts
+   * exhausted.
+   */
+  private async requestWithRetry(
+    texts: string[],
+    options: ProviderEmbedOptions,
+  ): Promise<number[][]> {
+    let lastErr: unknown = null;
+    const maxAttempts = this.retryBackoffsMs.length + 1;
+
+    for (let attempt = 0; attempt < maxAttempts; attempt++) {
+      // Honor user abort BEFORE issuing the call (avoids wasted network)
+      if (options.signal?.aborted) {
+        throw new Error("aborted by caller");
+      }
+
+      try {
+        return await this.requestOnce(texts, options);
+      } catch (err) {
+        lastErr = err;
+        const retryable =
+          err instanceof HttpError ? isRetryableStatus(err.status) : false;
+        if (!retryable) throw err;
+        if (attempt < this.retryBackoffsMs.length) {
+          await this.sleep(this.retryBackoffsMs[attempt]!);
+        }
+      }
+    }
+
+    // Exhausted retries → throw the last error so caller marks the chunk null
+    throw lastErr ?? new Error("requestWithRetry exhausted");
+  }
+
+  /**
+   * Issue one HTTP attempt to `POST /v1/embeddings`. Does NOT retry.
+   */
+  private async requestOnce(
+    texts: string[],
+    options: ProviderEmbedOptions,
+  ): Promise<number[][]> {
+    const { signal: attemptSig, cleanup } = buildAttemptSignal(options.signal, this.timeoutMs);
+    try {
+      const body = JSON.stringify({
+        model: options.model ?? this.upstreamModel,
+        input: texts,
+      });
+      const resp = await this.fetchImpl(`${this.endpoint}/v1/embeddings`, {
+        method: "POST",
+        headers: this.buildHeaders(),
+        body,
+        signal: attemptSig,
+      });
+
+      if (!resp.ok) {
+        const text = await resp.text().catch(() => "");
+        throw new HttpError(resp.status, text);
+      }
+
+      let parsed: OpenAIEmbeddingsResponse;
+      try {
+        parsed = (await resp.json()) as OpenAIEmbeddingsResponse;
+      } catch (err) {
+        throw new Error(
+          `OpenAIEmbeddingsProvider: malformed JSON from ${this.endpoint}/v1/embeddings: ${err instanceof Error ? err.message : String(err)}`,
+        );
+      }
+
+      if (!parsed || !Array.isArray(parsed.data)) {
+        throw new Error(
+          `OpenAIEmbeddingsProvider: response missing "data" array (got ${typeof parsed})`,
+        );
+      }
+
+      // Sort by index to match input order (in case server returns out-of-order).
+      const out: number[][] = new Array(texts.length);
+      for (const item of parsed.data) {
+        if (
+          typeof item.index !== "number" ||
+          item.index < 0 ||
+          item.index >= texts.length
+        ) {
+          throw new Error(
+            `OpenAIEmbeddingsProvider: data item index out of range (${item.index}, expected 0..${texts.length - 1})`,
+          );
+        }
+        if (!Array.isArray(item.embedding)) {
+          throw new Error(
+            `OpenAIEmbeddingsProvider: data[${item.index}].embedding is not an array`,
+          );
+        }
+        out[item.index] = item.embedding;
+      }
+
+      // Sanity check — every slot must be filled
+      for (let i = 0; i < texts.length; i++) {
+        if (!out[i]) {
+          throw new Error(
+            `OpenAIEmbeddingsProvider: response missing embedding for index ${i}`,
+          );
+        }
+      }
+      return out;
+    } finally {
+      cleanup();
+    }
+  }
+}

+ 143 - 0
src/embedding/provider.ts

@@ -0,0 +1,143 @@
+/**
+ * provider.ts - Embedding provider abstraction
+ *
+ * Defines the EmbeddingProvider interface that allows qmd to use either:
+ *   - LocalLlamaCppProvider (legacy, GGUF via node-llama-cpp)
+ *   - OpenAIEmbeddingsProvider (HTTP, OpenAI-compatible endpoint like ai.mm.mk)
+ *
+ * The factory in `./factory.ts` selects an implementation based on env vars,
+ * a CLI flag, or `~/.config/qmd/config.json`.
+ */
+
+/**
+ * Single embedding result
+ */
+export type ProviderEmbedding = {
+  embedding: number[];
+  /** Model identifier used to produce this embedding (matches content_vectors.model in DB) */
+  model: string;
+};
+
+/**
+ * Supported provider kinds
+ */
+export type ProviderKind = "local" | "openai";
+
+/**
+ * Healthcheck result for provider startup verification
+ */
+export type ProviderHealth = {
+  ok: boolean;
+  /** Model identifier reported by the provider */
+  model: string;
+  /** Embedding dimensions (e.g. 768 for embeddinggemma-300M) */
+  dimensions?: number;
+  /** Detail message (error reason on failure, status on success) */
+  detail?: string;
+};
+
+/**
+ * Per-call options for provider embedding
+ */
+export type ProviderEmbedOptions = {
+  /** Optional model id override (rare; usually provider has a fixed model) */
+  model?: string;
+  /** Abort signal for cancellation / timeout */
+  signal?: AbortSignal;
+};
+
+/**
+ * Provider interface — both LocalLlamaCppProvider and OpenAIEmbeddingsProvider implement this.
+ *
+ * Implementations MUST:
+ *   - Return `null` (not throw) for individual texts that fail to embed;
+ *     the caller will count it as an error and continue.
+ *   - Honor `options.signal` for cancellation.
+ *   - Be safe to call concurrently for `embedBatch`.
+ */
+export interface EmbeddingProvider {
+  /** Provider kind tag — useful for logging and factory introspection */
+  readonly kind: ProviderKind;
+
+  /**
+   * Stable model identifier reported to the caller.
+   *
+   * MUST match what's stored in `content_vectors.model` for the existing
+   * index — otherwise the model-id guard refuses to embed.
+   */
+  getModelId(): string;
+
+  /**
+   * Embedding vector dimensions. May return `undefined` before the first call
+   * (some providers probe lazily). Once known, MUST stay stable.
+   */
+  getDimensions(): number | undefined;
+
+  /**
+   * Healthcheck — verifies the provider is reachable and the model is loaded.
+   * Should NOT throw — return `{ ok: false, detail: ... }` on failure.
+   *
+   * For HTTP providers: ping `/health` endpoint.
+   * For local provider: ensure model loads.
+   */
+  healthcheck(signal?: AbortSignal): Promise<ProviderHealth>;
+
+  /**
+   * Embed a single text. Returns `null` on per-call failure.
+   */
+  embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
+
+  /**
+   * Embed multiple texts in a batch (more efficient than calling `embed` N times).
+   *
+   * Output array length MUST equal input array length. Failed entries are `null`.
+   * Implementations are responsible for chunking large batches per their
+   * upstream limits (e.g. OpenAI provider chunks to 64).
+   */
+  embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
+
+  /** Release any held resources (HTTP keep-alive sockets, model handles, …) */
+  dispose(): Promise<void>;
+}
+
+/**
+ * Error thrown when the provider's reported model id does not match the
+ * model id baked into existing `content_vectors` rows. Forces user to
+ * re-embed (`qmd embed -f`) or pin the matching model id.
+ */
+export class ModelMismatchError extends Error {
+  readonly providerModel: string;
+  readonly existingModels: string[];
+
+  constructor(providerModel: string, existingModels: string[]) {
+    const list = existingModels.join(", ");
+    super(
+      `Embedding model mismatch: existing vectors use model(s) [${list}] ` +
+      `but the configured provider reports "${providerModel}". ` +
+      `Run \`qmd embed -f\` (or \`--rebuild\`) to re-embed everything with ` +
+      `the new model, or set QMD_EMBED_MODEL_ID="${existingModels[0] ?? ""}" ` +
+      `to keep the existing vectors.`
+    );
+    this.name = "ModelMismatchError";
+    this.providerModel = providerModel;
+    this.existingModels = existingModels;
+  }
+}
+
+/**
+ * Verify that the provider's model id is compatible with the existing
+ * `content_vectors` entries. Pass-through (no-op) if the table is empty
+ * (fresh DB) or if the model id appears in the distinct set.
+ *
+ * Caller passes `existingModels` (typically result of
+ * `SELECT DISTINCT model FROM content_vectors`).
+ */
+export function assertModelCompatible(
+  providerModel: string,
+  existingModels: string[],
+): void {
+  // Empty DB — nothing to compare against, anything goes.
+  if (existingModels.length === 0) return;
+  if (existingModels.includes(providerModel)) return;
+  throw new ModelMismatchError(providerModel, existingModels);
+}

+ 30 - 0
src/index.ts

@@ -119,6 +119,36 @@ export { getDefaultDbPath } from "./store.js";
 // Re-export Maintenance class for CLI housekeeping operations
 export { Maintenance } from "./maintenance.js";
 
+// Re-export embedding provider abstraction for SDK consumers (i-qkarfffa).
+// `createEmbeddingProvider` honors QMD_EMBED_ENDPOINT / config-file / kind
+// arg precedence; default fallback is the legacy LocalLlamaCppProvider so
+// SDK code that doesn't pass `embedProvider` keeps the prior behavior.
+export {
+  createEmbeddingProvider,
+  resolveProviderKind,
+  LocalLlamaCppProvider,
+  OpenAIEmbeddingsProvider,
+  CircuitBreaker,
+  CircuitOpenError,
+  HttpError,
+  ModelMismatchError,
+  assertModelCompatible,
+  type EmbeddingProvider,
+  type ProviderKind,
+  type ProviderEmbedding,
+  type ProviderEmbedOptions,
+  type ProviderHealth,
+  type CreateEmbeddingProviderOptions,
+  type OpenAIProviderConfig,
+  type LocalLlamaCppProviderConfig,
+  type EmbedProviderConfigFile,
+  DEFAULT_BATCH_SIZE as DEFAULT_PROVIDER_BATCH_SIZE,
+  DEFAULT_TIMEOUT_MS as DEFAULT_PROVIDER_TIMEOUT_MS,
+  RETRY_BACKOFFS_MS as PROVIDER_RETRY_BACKOFFS_MS,
+} from "./embedding/index.js";
+
+export { getDistinctEmbeddingModels } from "./store.js";
+
 /**
  * Progress info emitted during update() for each file processed.
  */

+ 77 - 6
src/store.ts

@@ -33,6 +33,10 @@ import type {
   CollectionConfig,
   ContextMap,
 } from "./collections.js";
+import {
+  type EmbeddingProvider,
+  assertModelCompatible,
+} from "./embedding/provider.js";
 
 // =============================================================================
 // Configuration
@@ -1292,6 +1296,16 @@ export type EmbedOptions = {
   maxBatchBytes?: number;
   chunkStrategy?: ChunkStrategy;
   onProgress?: (info: EmbedProgress) => void;
+  /**
+   * Optional embedding provider. When supplied, embeddings are routed through
+   * this provider (HTTP, GPU worker, etc.) instead of the local llama.cpp
+   * session path. The provider's `getModelId()` is verified against existing
+   * `content_vectors.model` rows; mismatch throws unless `force` is set.
+   *
+   * When omitted, behavior is identical to pre-patch: embeddings come from
+   * the store's `LlamaCpp` (or the global singleton).
+   */
+  embedProvider?: EmbeddingProvider;
 };
 
 type PendingEmbeddingDoc = {
@@ -1410,6 +1424,15 @@ export async function generateEmbeddings(
   const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
   const encoder = new TextEncoder();
 
+  // Migration safety: if an embedProvider is supplied, verify its model id
+  // matches the existing content_vectors rows (unless we're about to clear
+  // them via `force`). This must happen BEFORE we clear vectors so users
+  // who pass `--force` aren't blocked.
+  if (options?.embedProvider && !options.force) {
+    const existing = getDistinctEmbeddingModels(db);
+    assertModelCompatible(options.embedProvider.getModelId(), existing);
+  }
+
   if (options?.force) {
     clearAllEmbeddings(db);
   }
@@ -1442,7 +1465,14 @@ export async function generateEmbeddings(
 
   // Use store's LlamaCpp or global singleton, wrapped in a session
   const llm = getLlm(store);
-  const embedModelUri = llm.embedModelName;
+  const embedModelUri = options?.embedProvider?.getModelId() ?? llm.embedModelName;
+
+  // Provider routing — when an EmbeddingProvider is supplied, embed calls go
+  // through it (HTTP, GPU worker, etc.). Otherwise, use the LLM session path.
+  // The outer session is still created for its abort signal (chunking uses
+  // `session.signal` for cooperative cancellation).
+  const provider = options?.embedProvider;
+  const providerModel = provider?.getModelId() ?? model;
 
   // Create a session manager for this llm instance
   const result = await withLLMSessionForLlm(llm, async (session) => {
@@ -1454,6 +1484,30 @@ export async function generateEmbeddings(
     const BATCH_SIZE = 32;
     const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);
 
+    // Embedding helpers — single point of provider/session selection.
+    // Both return the same shape as ILLMSession.embed/embedBatch so the
+    // rest of the loop is unchanged.
+    const embedOne = async (
+      text: string,
+      modelArg: string,
+    ): Promise<{ embedding: number[]; model: string } | null> => {
+      if (provider) {
+        const r = await provider.embed(text, { model: modelArg, signal: session.signal });
+        return r ? { embedding: r.embedding, model: r.model } : null;
+      }
+      return session.embed(text, { model: modelArg });
+    };
+    const embedMany = async (
+      texts: string[],
+      modelArg: string,
+    ): Promise<({ embedding: number[]; model: string } | null)[]> => {
+      if (provider) {
+        const r = await provider.embedBatch(texts, { model: modelArg, signal: session.signal });
+        return r.map((x) => (x ? { embedding: x.embedding, model: x.model } : null));
+      }
+      return session.embedBatch(texts, { model: modelArg });
+    };
+
     for (const batchMeta of batches) {
       // Abort early if session has been invalidated
       if (!session.isValid) {
@@ -1503,7 +1557,7 @@ export async function generateEmbeddings(
       if (!vectorTableInitialized) {
         const firstChunk = batchChunks[0]!;
         const firstText = formatDocForEmbedding(firstChunk.text, firstChunk.title, embedModelUri);
-        const firstResult = await session.embed(firstText, { model });
+        const firstResult = await embedOne(firstText, providerModel);
         if (!firstResult) {
           throw new Error("Failed to get embedding dimensions from first chunk");
         }
@@ -1537,12 +1591,12 @@ export async function generateEmbeddings(
         const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title, embedModelUri));
 
         try {
-          const embeddings = await session.embedBatch(texts, { model });
+          const embeddings = await embedMany(texts, providerModel);
           for (let i = 0; i < chunkBatch.length; i++) {
             const chunk = chunkBatch[i]!;
             const embedding = embeddings[i];
             if (embedding) {
-              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
+              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), providerModel, now);
               chunksEmbedded++;
             } else {
               errors++;
@@ -1559,9 +1613,9 @@ export async function generateEmbeddings(
             for (const chunk of chunkBatch) {
               try {
                 const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
-                const result = await session.embed(text, { model });
+                const result = await embedOne(text, providerModel);
                 if (result) {
-                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
+                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), providerModel, now);
                   chunksEmbedded++;
                 } else {
                   errors++;
@@ -3272,6 +3326,23 @@ export function clearAllEmbeddings(db: Database): void {
   db.exec(`DROP TABLE IF EXISTS vectors_vec`);
 }
 
+/**
+ * Get the distinct set of model identifiers present in `content_vectors`.
+ *
+ * Used by the embedding migration-safety guard: if a configured provider's
+ * `getModelId()` does not appear in this list (and the table is non-empty),
+ * we refuse to embed and ask the user to run `qmd embed -f` to rebuild.
+ *
+ * Returns `[]` when the table is empty (fresh DB) — in which case any
+ * provider is allowed.
+ */
+export function getDistinctEmbeddingModels(db: Database): string[] {
+  const rows = db.prepare(
+    `SELECT DISTINCT model FROM content_vectors WHERE model IS NOT NULL`,
+  ).all() as { model: string }[];
+  return rows.map((r) => r.model).filter((m) => typeof m === "string" && m.length > 0);
+}
+
 /**
  * Insert a single embedding into both content_vectors and vectors_vec tables.
  * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.

+ 263 - 0
test/embedding-factory.test.ts

@@ -0,0 +1,263 @@
+/**
+ * embedding-factory.test.ts - Tests for createEmbeddingProvider factory.
+ *
+ * Verifies the resolution precedence:
+ *   1. explicit `kind` argument
+ *   2. QMD_EMBED_PROVIDER env
+ *   3. QMD_EMBED_ENDPOINT env (forces openai)
+ *   4. config file `embedProvider.kind` / `embedProvider.endpoint`
+ *   5. fallback: local
+ */
+
+import { describe, test, expect, beforeEach, afterEach } from "vitest";
+import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import {
+  resolveProviderKind,
+  createEmbeddingProvider,
+  loadConfigFile,
+} from "../src/embedding/factory.js";
+import { OpenAIEmbeddingsProvider } from "../src/embedding/openai.js";
+import { LocalLlamaCppProvider } from "../src/embedding/local.js";
+
+let workDir: string;
+let configPath: string;
+
+beforeEach(() => {
+  workDir = mkdtempSync(join(tmpdir(), "qmd-factory-test-"));
+  mkdirSync(join(workDir, "qmd"), { recursive: true });
+  configPath = join(workDir, "qmd", "config.json");
+});
+
+afterEach(() => {
+  rmSync(workDir, { recursive: true, force: true });
+});
+
+// ─────────────────────────── Helpers ─────────────────────────────────────────
+
+function writeConfig(obj: Record<string, unknown>) {
+  writeFileSync(configPath, JSON.stringify(obj));
+}
+
+const EMPTY_ENV: Record<string, string | undefined> = {};
+
+// ─────────────────────────── resolveProviderKind ─────────────────────────────
+
+describe("resolveProviderKind", () => {
+  test("explicit kind argument wins", () => {
+    expect(
+      resolveProviderKind({
+        kind: "local",
+        env: { QMD_EMBED_ENDPOINT: "https://x" },
+        configPath,
+      }),
+    ).toBe("local");
+    expect(
+      resolveProviderKind({
+        kind: "openai",
+        env: EMPTY_ENV,
+        configPath,
+      }),
+    ).toBe("openai");
+  });
+
+  test("QMD_EMBED_PROVIDER env wins over QMD_EMBED_ENDPOINT", () => {
+    expect(
+      resolveProviderKind({
+        env: { QMD_EMBED_PROVIDER: "local", QMD_EMBED_ENDPOINT: "https://x" },
+        configPath,
+      }),
+    ).toBe("local");
+  });
+
+  test("QMD_EMBED_ENDPOINT presence → openai", () => {
+    expect(
+      resolveProviderKind({
+        env: { QMD_EMBED_ENDPOINT: "https://ai.example.com" },
+        configPath,
+      }),
+    ).toBe("openai");
+  });
+
+  test("QMD_EMBED_ENDPOINT empty string ignored", () => {
+    expect(
+      resolveProviderKind({
+        env: { QMD_EMBED_ENDPOINT: "" },
+        configPath,
+      }),
+    ).toBe("local");
+  });
+
+  test("config file embedProvider.kind respected", () => {
+    writeConfig({ embedProvider: { kind: "openai", endpoint: "https://ai.example.com" } });
+    expect(resolveProviderKind({ env: EMPTY_ENV, configPath })).toBe("openai");
+  });
+
+  test("config file embedProvider.endpoint alone → openai", () => {
+    writeConfig({ embedProvider: { endpoint: "https://ai.example.com" } });
+    expect(resolveProviderKind({ env: EMPTY_ENV, configPath })).toBe("openai");
+  });
+
+  test("no signal anywhere → local fallback", () => {
+    expect(resolveProviderKind({ env: EMPTY_ENV, configPath })).toBe("local");
+  });
+
+  test("invalid env QMD_EMBED_PROVIDER is ignored", () => {
+    expect(
+      resolveProviderKind({
+        env: { QMD_EMBED_PROVIDER: "garbage" },
+        configPath,
+      }),
+    ).toBe("local");
+  });
+
+  test("uppercase env QMD_EMBED_PROVIDER normalized", () => {
+    expect(
+      resolveProviderKind({
+        env: { QMD_EMBED_PROVIDER: "OPENAI", QMD_EMBED_ENDPOINT: "https://x" },
+        configPath,
+      }),
+    ).toBe("openai");
+  });
+});
+
+// ─────────────────────────── createEmbeddingProvider ─────────────────────────
+
+describe("createEmbeddingProvider", () => {
+  test("openai kind w/ endpoint env → OpenAIEmbeddingsProvider", () => {
+    const p = createEmbeddingProvider({
+      env: { QMD_EMBED_ENDPOINT: "https://ai.example.com" },
+      configPath,
+    });
+    expect(p).toBeInstanceOf(OpenAIEmbeddingsProvider);
+    expect(p.kind).toBe("openai");
+    expect(p.getModelId()).toBe("embeddinggemma");
+  });
+
+  test("openai kind w/ explicit options merges over env", () => {
+    const p = createEmbeddingProvider({
+      env: { QMD_EMBED_ENDPOINT: "https://env.example.com", QMD_EMBED_API_KEY: "env-key" },
+      configPath,
+      openai: { endpoint: "https://override.example.com" },
+    });
+    // Cast to access internal properties for verification
+    const inner = p as OpenAIEmbeddingsProvider & { endpoint: string; apiKey: string };
+    expect(inner["endpoint"]).toBe("https://override.example.com");
+    // apiKey should still come from env since we didn't override it
+    expect(inner["apiKey"]).toBe("env-key");
+  });
+
+  test("openai kind reads modelId from env", () => {
+    const p = createEmbeddingProvider({
+      env: {
+        QMD_EMBED_ENDPOINT: "https://ai.example.com",
+        QMD_EMBED_MODEL_ID: "custom-model",
+      },
+      configPath,
+    });
+    expect(p.getModelId()).toBe("custom-model");
+  });
+
+  test("openai kind reads upstream model from env", () => {
+    const p = createEmbeddingProvider({
+      env: {
+        QMD_EMBED_ENDPOINT: "https://ai.example.com",
+        QMD_EMBED_UPSTREAM_MODEL: "embeddinggemma:300m",
+      },
+      configPath,
+    }) as OpenAIEmbeddingsProvider & { upstreamModel: string };
+    expect(p["upstreamModel"]).toBe("embeddinggemma:300m");
+  });
+
+  test("openai kind reads batch size and timeout from env", () => {
+    const p = createEmbeddingProvider({
+      env: {
+        QMD_EMBED_ENDPOINT: "https://ai.example.com",
+        QMD_EMBED_BATCH_SIZE: "32",
+        QMD_EMBED_TIMEOUT_MS: "5000",
+      },
+      configPath,
+    }) as OpenAIEmbeddingsProvider & { batchSize: number; timeoutMs: number };
+    expect(p["batchSize"]).toBe(32);
+    expect(p["timeoutMs"]).toBe(5000);
+  });
+
+  test("openai kind merges config file values", () => {
+    writeConfig({
+      embedProvider: {
+        kind: "openai",
+        endpoint: "https://config.example.com",
+        apiKey: "config-key",
+        modelId: "config-model",
+        batchSize: 16,
+      },
+    });
+    const p = createEmbeddingProvider({
+      env: EMPTY_ENV,
+      configPath,
+    }) as OpenAIEmbeddingsProvider & {
+      endpoint: string;
+      apiKey: string;
+      batchSize: number;
+    };
+    expect(p["endpoint"]).toBe("https://config.example.com");
+    expect(p["apiKey"]).toBe("config-key");
+    expect(p.getModelId()).toBe("config-model");
+    expect(p["batchSize"]).toBe(16);
+  });
+
+  test("env wins over config file", () => {
+    writeConfig({
+      embedProvider: {
+        endpoint: "https://config.example.com",
+      },
+    });
+    const p = createEmbeddingProvider({
+      env: { QMD_EMBED_ENDPOINT: "https://env.example.com" },
+      configPath,
+    }) as OpenAIEmbeddingsProvider & { endpoint: string };
+    expect(p["endpoint"]).toBe("https://env.example.com");
+  });
+
+  test("openai kind without endpoint throws", () => {
+    expect(() =>
+      createEmbeddingProvider({ kind: "openai", env: EMPTY_ENV, configPath }),
+    ).toThrow(/endpoint/);
+  });
+
+  test("local kind explicitly requested → LocalLlamaCppProvider", () => {
+    const p = createEmbeddingProvider({
+      kind: "local",
+      env: EMPTY_ENV,
+      configPath,
+    });
+    expect(p).toBeInstanceOf(LocalLlamaCppProvider);
+    expect(p.kind).toBe("local");
+  });
+
+  test("default fallback → LocalLlamaCppProvider", () => {
+    const p = createEmbeddingProvider({ env: EMPTY_ENV, configPath });
+    expect(p).toBeInstanceOf(LocalLlamaCppProvider);
+  });
+});
+
+// ─────────────────────────── loadConfigFile ──────────────────────────────────
+
+describe("loadConfigFile", () => {
+  test("missing file → empty object", () => {
+    expect(loadConfigFile(join(workDir, "missing.json"))).toEqual({});
+  });
+
+  test("invalid JSON → empty object (no throw)", () => {
+    writeFileSync(configPath, "not json");
+    expect(loadConfigFile(configPath)).toEqual({});
+  });
+
+  test("valid JSON parsed", () => {
+    writeConfig({ embedProvider: { kind: "openai" } });
+    expect(loadConfigFile(configPath)).toEqual({
+      embedProvider: { kind: "openai" },
+    });
+  });
+});

+ 721 - 0
test/embedding-openai.test.ts

@@ -0,0 +1,721 @@
+/**
+ * embedding-openai.test.ts - Tests for OpenAIEmbeddingsProvider (HTTP backend).
+ *
+ * Uses a mock fetch — no network required. Covers:
+ *   - 200 happy path
+ *   - 429 → retry → success
+ *   - 503 persistent → exhausted retries → null
+ *   - 4xx (non-429) → no retry, immediate failure
+ *   - batch chunking (>64 items → multiple HTTP calls)
+ *   - timeout / abort
+ *   - malformed JSON / missing data array
+ *   - circuit breaker open + half-open recovery
+ *   - dimension probing
+ *   - healthcheck endpoint
+ */
+
+import { describe, test, expect, vi } from "vitest";
+import {
+  OpenAIEmbeddingsProvider,
+  CircuitBreaker,
+  CircuitOpenError,
+  HttpError,
+  isRetryableStatus,
+  chunkArray,
+  RETRY_BACKOFFS_MS,
+} from "../src/embedding/openai.js";
+
+// ─────────────────────────── Helpers ─────────────────────────────────────────
+
+function mockResponse(status: number, body: unknown, opts?: { delayMs?: number }): Response {
+  const text = typeof body === "string" ? body : JSON.stringify(body);
+  const init: ResponseInit = {
+    status,
+    headers: { "content-type": "application/json" },
+  };
+  if (opts?.delayMs) {
+    // Synchronous Response — test code awaits it directly, so delayMs would
+    // need to be implemented in the fetch wrapper, not here.
+  }
+  return new Response(text, init);
+}
+
+function makeFetchSequence(responses: Array<() => Promise<Response> | Response>): {
+  fetchImpl: typeof fetch;
+  calls: { url: string; init?: RequestInit }[];
+} {
+  const calls: { url: string; init?: RequestInit }[] = [];
+  let i = 0;
+  const fetchImpl = (async (input: RequestInfo | URL, init?: RequestInit) => {
+    const url = typeof input === "string" ? input : input.toString();
+    calls.push({ url, init });
+    if (i >= responses.length) throw new Error(`Mock fetch exhausted at call ${i + 1}`);
+    const r = responses[i++]!();
+    return r instanceof Promise ? r : r;
+  }) as typeof fetch;
+  return { fetchImpl, calls };
+}
+
+function fakeEmbedding(dim: number, seed = 0): number[] {
+  return Array.from({ length: dim }, (_, i) => Math.sin(seed + i) * 0.5);
+}
+
+function embeddingsResponse(texts: string[], dim = 4): Response {
+  return mockResponse(200, {
+    object: "list",
+    model: "embeddinggemma:300m",
+    data: texts.map((_, i) => ({
+      object: "embedding",
+      index: i,
+      embedding: fakeEmbedding(dim, i * 7),
+    })),
+  });
+}
+
+// ─────────────────────────── Pure helpers ────────────────────────────────────
+
+describe("isRetryableStatus", () => {
+  test("429 retryable", () => expect(isRetryableStatus(429)).toBe(true));
+  test("503 retryable", () => expect(isRetryableStatus(503)).toBe(true));
+  test("400 NOT retryable", () => expect(isRetryableStatus(400)).toBe(false));
+  test("401 NOT retryable", () => expect(isRetryableStatus(401)).toBe(false));
+  test("404 NOT retryable", () => expect(isRetryableStatus(404)).toBe(false));
+  test("500 NOT retryable", () => expect(isRetryableStatus(500)).toBe(false));
+  test("502 NOT retryable", () => expect(isRetryableStatus(502)).toBe(false));
+  test("200 NOT retryable", () => expect(isRetryableStatus(200)).toBe(false));
+});
+
+describe("chunkArray", () => {
+  test("empty input → empty output", () => {
+    expect(chunkArray([], 5)).toEqual([]);
+  });
+  test("input ≤ size → single chunk", () => {
+    expect(chunkArray([1, 2, 3], 5)).toEqual([[1, 2, 3]]);
+  });
+  test("input = size → single chunk", () => {
+    expect(chunkArray([1, 2, 3, 4, 5], 5)).toEqual([[1, 2, 3, 4, 5]]);
+  });
+  test("input > size → multiple chunks", () => {
+    expect(chunkArray([1, 2, 3, 4, 5, 6, 7], 3)).toEqual([
+      [1, 2, 3],
+      [4, 5, 6],
+      [7],
+    ]);
+  });
+  test("65 items at size 64 → 64 + 1", () => {
+    const items = Array.from({ length: 65 }, (_, i) => i);
+    const chunks = chunkArray(items, 64);
+    expect(chunks.length).toBe(2);
+    expect(chunks[0]!.length).toBe(64);
+    expect(chunks[1]!.length).toBe(1);
+  });
+  test("size < 1 throws", () => {
+    expect(() => chunkArray([1, 2, 3], 0)).toThrow();
+    expect(() => chunkArray([1, 2, 3], -1)).toThrow();
+  });
+});
+
+// ─────────────────────────── Circuit Breaker ─────────────────────────────────
+
+describe("CircuitBreaker", () => {
+  test("starts closed", () => {
+    const cb = new CircuitBreaker();
+    expect(cb.getState()).toBe("closed");
+    expect(cb.shouldFailFast()).toBe(false);
+  });
+
+  test("stays closed below minSamples even with all-failures", () => {
+    const cb = new CircuitBreaker({ minSamples: 4, threshold: 0.5 });
+    cb.recordFailure();
+    cb.recordFailure();
+    cb.recordFailure();
+    expect(cb.getState()).toBe("closed");
+  });
+
+  test("opens when failure rate exceeds threshold", () => {
+    const cb = new CircuitBreaker({ minSamples: 4, threshold: 0.5 });
+    cb.recordFailure();
+    cb.recordFailure();
+    cb.recordFailure();
+    cb.recordFailure();
+    expect(cb.getState()).toBe("open");
+    expect(cb.shouldFailFast()).toBe(true);
+  });
+
+  test("transitions OPEN → HALF-OPEN after openDurationMs", () => {
+    let now = 1_000_000;
+    const cb = new CircuitBreaker({
+      minSamples: 4,
+      threshold: 0.5,
+      openDurationMs: 5000,
+      now: () => now,
+    });
+    for (let i = 0; i < 4; i++) cb.recordFailure();
+    expect(cb.getState()).toBe("open");
+    now += 5001;
+    expect(cb.getState()).toBe("half-open");
+  });
+
+  test("HALF-OPEN + success → CLOSED", () => {
+    let now = 1_000_000;
+    const cb = new CircuitBreaker({
+      minSamples: 4,
+      threshold: 0.5,
+      openDurationMs: 5000,
+      now: () => now,
+    });
+    for (let i = 0; i < 4; i++) cb.recordFailure();
+    now += 5001;
+    cb.recordSuccess(); // half-open probe
+    expect(cb.getState()).toBe("closed");
+  });
+
+  test("HALF-OPEN + failure → re-OPEN", () => {
+    let now = 1_000_000;
+    const cb = new CircuitBreaker({
+      minSamples: 4,
+      threshold: 0.5,
+      openDurationMs: 5000,
+      now: () => now,
+    });
+    for (let i = 0; i < 4; i++) cb.recordFailure();
+    now += 5001;
+    expect(cb.getState()).toBe("half-open");
+    cb.recordFailure();
+    expect(cb.getState()).toBe("open");
+  });
+
+  test("samples outside window are dropped", () => {
+    let now = 1_000_000;
+    const cb = new CircuitBreaker({
+      minSamples: 4,
+      threshold: 0.5,
+      windowMs: 1000,
+      now: () => now,
+    });
+    cb.recordFailure();
+    cb.recordFailure();
+    now += 1500; // window expired
+    cb.recordSuccess();
+    cb.recordSuccess();
+    cb.recordSuccess();
+    cb.recordSuccess();
+    // Old failures should be discarded; rate = 0/4 < threshold
+    expect(cb.getState()).toBe("closed");
+  });
+
+  test("reset() clears state", () => {
+    const cb = new CircuitBreaker({ minSamples: 2, threshold: 0.5 });
+    cb.recordFailure();
+    cb.recordFailure();
+    expect(cb.getState()).toBe("open");
+    cb.reset();
+    expect(cb.getState()).toBe("closed");
+  });
+});
+
+// ─────────────────────────── HappyPath ───────────────────────────────────────
+
+describe("OpenAIEmbeddingsProvider — happy path", () => {
+  test("single embed call → 200 success", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => embeddingsResponse(["hello"], 4),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+    });
+    const r = await p.embed("hello");
+    expect(r).not.toBeNull();
+    expect(r!.embedding.length).toBe(4);
+    expect(r!.model).toBe("embeddinggemma");
+    expect(p.getDimensions()).toBe(4);
+    expect(calls.length).toBe(1);
+    expect(calls[0]!.url).toBe("https://ai.example.com/v1/embeddings");
+  });
+
+  test("strips trailing slashes from endpoint", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => embeddingsResponse(["x"], 2),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com////",
+      fetchImpl,
+    });
+    await p.embed("x");
+    expect(calls[0]!.url).toBe("https://ai.example.com/v1/embeddings");
+  });
+
+  test("batch of 3 → 1 HTTP call", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => embeddingsResponse(["a", "b", "c"], 3),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+    });
+    const result = await p.embedBatch(["a", "b", "c"]);
+    expect(result.length).toBe(3);
+    expect(result.every((r) => r !== null)).toBe(true);
+    expect(calls.length).toBe(1);
+  });
+
+  test("respects custom modelId / upstreamModel in request body", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => embeddingsResponse(["x"], 2),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      modelId: "embeddinggemma",
+      upstreamModel: "embeddinggemma:300m",
+      fetchImpl,
+    });
+    const r = await p.embed("x");
+    expect(r!.model).toBe("embeddinggemma");
+    const body = JSON.parse(calls[0]!.init!.body as string);
+    expect(body.model).toBe("embeddinggemma:300m");
+  });
+
+  test("Authorization header set when apiKey provided", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => embeddingsResponse(["x"], 2),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      apiKey: "sk-test-123",
+      fetchImpl,
+    });
+    await p.embed("x");
+    const headers = calls[0]!.init!.headers as Record<string, string>;
+    expect(headers["Authorization"]).toBe("Bearer sk-test-123");
+  });
+
+  test("Authorization header omitted when apiKey not provided", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => embeddingsResponse(["x"], 2),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+    });
+    await p.embed("x");
+    const headers = calls[0]!.init!.headers as Record<string, string>;
+    expect(headers["Authorization"]).toBeUndefined();
+  });
+});
+
+// ─────────────────────────── Batch chunking ──────────────────────────────────
+
+describe("OpenAIEmbeddingsProvider — batch chunking", () => {
+  test("100 items at batchSize=64 → 2 HTTP calls (64 + 36)", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => embeddingsResponse(Array.from({ length: 64 }, () => "x"), 4),
+      () => embeddingsResponse(Array.from({ length: 36 }, () => "x"), 4),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      batchSize: 64,
+    });
+    const texts = Array.from({ length: 100 }, (_, i) => `text-${i}`);
+    const result = await p.embedBatch(texts);
+    expect(result.length).toBe(100);
+    expect(result.every((r) => r !== null)).toBe(true);
+    expect(calls.length).toBe(2);
+
+    const body0 = JSON.parse(calls[0]!.init!.body as string);
+    const body1 = JSON.parse(calls[1]!.init!.body as string);
+    expect(body0.input.length).toBe(64);
+    expect(body1.input.length).toBe(36);
+  });
+
+  test("custom batchSize=10 → multiple smaller calls", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => embeddingsResponse(Array.from({ length: 10 }, () => "x"), 2),
+      () => embeddingsResponse(Array.from({ length: 10 }, () => "x"), 2),
+      () => embeddingsResponse(Array.from({ length: 5 }, () => "x"), 2),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      batchSize: 10,
+    });
+    const texts = Array.from({ length: 25 }, (_, i) => `t${i}`);
+    const result = await p.embedBatch(texts);
+    expect(result.length).toBe(25);
+    expect(result.every((r) => r !== null)).toBe(true);
+    expect(calls.length).toBe(3);
+  });
+
+  test("empty input → no HTTP calls", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+    });
+    const result = await p.embedBatch([]);
+    expect(result).toEqual([]);
+    expect(calls.length).toBe(0);
+  });
+});
+
+// ─────────────────────────── Retry behavior ──────────────────────────────────
+
+describe("OpenAIEmbeddingsProvider — retry on 429/503", () => {
+  test("429 → retry → success", async () => {
+    const sleepCalls: number[] = [];
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => mockResponse(429, { error: "rate limit" }),
+      () => embeddingsResponse(["x"], 4),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      retryBackoffsMs: [10, 20, 40],
+      sleep: async (ms) => {
+        sleepCalls.push(ms);
+      },
+    });
+    const r = await p.embed("x");
+    expect(r).not.toBeNull();
+    expect(calls.length).toBe(2);
+    expect(sleepCalls).toEqual([10]);
+  });
+
+  test("503 → retry → success", async () => {
+    const sleepCalls: number[] = [];
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => mockResponse(503, { error: "service unavailable" }),
+      () => embeddingsResponse(["x"], 4),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      retryBackoffsMs: [5, 10, 20],
+      sleep: async (ms) => {
+        sleepCalls.push(ms);
+      },
+    });
+    const r = await p.embed("x");
+    expect(r).not.toBeNull();
+    expect(calls.length).toBe(2);
+    expect(sleepCalls).toEqual([5]);
+  });
+
+  test("503 persistent → exhausted retries → null result", async () => {
+    const sleepCalls: number[] = [];
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => mockResponse(503, "down"),
+      () => mockResponse(503, "down"),
+      () => mockResponse(503, "down"),
+      () => mockResponse(503, "down"),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      retryBackoffsMs: [1, 2, 4],
+      sleep: async (ms) => {
+        sleepCalls.push(ms);
+      },
+    });
+    const r = await p.embed("x");
+    expect(r).toBeNull();
+    expect(calls.length).toBe(4); // initial + 3 retries
+    expect(sleepCalls).toEqual([1, 2, 4]);
+  });
+
+  test("default backoff schedule is 1s/4s/16s", () => {
+    expect(RETRY_BACKOFFS_MS).toEqual([1000, 4000, 16000]);
+  });
+
+  test("4xx (non-429) → immediate failure, no retry", async () => {
+    const sleepCalls: number[] = [];
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => mockResponse(401, { error: "unauthorized" }),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      retryBackoffsMs: [10, 20, 40],
+      sleep: async (ms) => {
+        sleepCalls.push(ms);
+      },
+    });
+    const r = await p.embed("x");
+    expect(r).toBeNull();
+    expect(calls.length).toBe(1); // no retries
+    expect(sleepCalls).toEqual([]);
+  });
+
+  test("404 → immediate failure, no retry", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => mockResponse(404, "not found"),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      retryBackoffsMs: [1],
+      sleep: async () => {},
+    });
+    await p.embed("x");
+    expect(calls.length).toBe(1);
+  });
+});
+
+// ─────────────────────────── Malformed responses ─────────────────────────────
+
+describe("OpenAIEmbeddingsProvider — malformed responses", () => {
+  test("malformed JSON → null result", async () => {
+    const { fetchImpl } = makeFetchSequence([
+      () => new Response("not-json{}", { status: 200 }),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      retryBackoffsMs: [],
+      sleep: async () => {},
+    });
+    const r = await p.embed("x");
+    expect(r).toBeNull();
+  });
+
+  test("missing data array → null result", async () => {
+    const { fetchImpl } = makeFetchSequence([
+      () => mockResponse(200, { object: "list", model: "x" }),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      retryBackoffsMs: [],
+      sleep: async () => {},
+    });
+    const r = await p.embed("x");
+    expect(r).toBeNull();
+  });
+
+  test("data item index out of range → null result", async () => {
+    const { fetchImpl } = makeFetchSequence([
+      () =>
+        mockResponse(200, {
+          object: "list",
+          data: [
+            { index: 5, embedding: [0.1, 0.2] }, // out of range for 1 input
+          ],
+        }),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      retryBackoffsMs: [],
+      sleep: async () => {},
+    });
+    const r = await p.embed("x");
+    expect(r).toBeNull();
+  });
+
+  test("response handles out-of-order data array (sorts by index)", async () => {
+    const { fetchImpl } = makeFetchSequence([
+      () =>
+        mockResponse(200, {
+          object: "list",
+          data: [
+            { index: 1, embedding: [0.7, 0.8] },
+            { index: 0, embedding: [0.1, 0.2] },
+          ],
+        }),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+    });
+    const result = await p.embedBatch(["zero", "one"]);
+    expect(result.length).toBe(2);
+    expect(result[0]!.embedding).toEqual([0.1, 0.2]);
+    expect(result[1]!.embedding).toEqual([0.7, 0.8]);
+  });
+});
+
+// ─────────────────────────── Timeout / abort ─────────────────────────────────
+
+describe("OpenAIEmbeddingsProvider — timeout and abort", () => {
+  test("user abort signal → null result + no further calls", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () => embeddingsResponse(["a", "b"], 2),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      batchSize: 1,
+    });
+    const ctrl = new AbortController();
+    ctrl.abort(new Error("user cancelled"));
+    const result = await p.embedBatch(["a", "b"], { signal: ctrl.signal });
+    expect(result).toEqual([null, null]);
+    expect(calls.length).toBe(0); // signal aborted before first call
+  });
+
+  test("per-attempt timeout aborts a slow request", async () => {
+    let aborted = false;
+    const fetchImpl = (async (_url: any, init?: RequestInit) => {
+      return await new Promise<Response>((_resolve, reject) => {
+        const sig = init?.signal;
+        sig?.addEventListener("abort", () => {
+          aborted = true;
+          reject(new DOMException("aborted", "AbortError"));
+        });
+      });
+    }) as typeof fetch;
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      timeoutMs: 50,
+      retryBackoffsMs: [],
+      sleep: async () => {},
+    });
+    const r = await p.embed("hello");
+    expect(r).toBeNull();
+    expect(aborted).toBe(true);
+  });
+});
+
+// ─────────────────────────── Circuit breaker integration ─────────────────────
+
+describe("OpenAIEmbeddingsProvider — circuit breaker integration", () => {
+  test("repeated failures eventually trip breaker → CircuitOpenError", async () => {
+    // 4 chunks of size 1 = 4 sample slots. All fail with 401 → 4 failures → breaker opens.
+    const { fetchImpl } = makeFetchSequence([
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"), // shouldn't be reached
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      batchSize: 1,
+      retryBackoffsMs: [],
+      sleep: async () => {},
+    });
+    // First call: 4 sub-chunks, all fail, breaker opens during 4th
+    const result1 = await p.embedBatch(["a", "b", "c", "d"]);
+    expect(result1.every((x) => x === null)).toBe(true);
+    expect(p.breaker.getState()).toBe("open");
+
+    // Second call: breaker fails fast
+    await expect(p.embedBatch(["e"])).rejects.toBeInstanceOf(CircuitOpenError);
+  });
+
+  test("breaker recovers after openDuration → success closes it", async () => {
+    let now = 1_000_000;
+    const { fetchImpl } = makeFetchSequence([
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"),
+      () => embeddingsResponse(["recovered"], 2),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      batchSize: 1,
+      retryBackoffsMs: [],
+      sleep: async () => {},
+      now: () => now,
+    });
+    // Override breaker with a shorter open duration
+    (p as any).breaker = new CircuitBreaker({
+      minSamples: 4,
+      threshold: 0.5,
+      openDurationMs: 1000,
+      now: () => now,
+    });
+    await p.embedBatch(["a", "b", "c", "d"]);
+    expect((p as any).breaker.getState()).toBe("open");
+    now += 1500;
+    expect((p as any).breaker.getState()).toBe("half-open");
+    const r = await p.embed("recovered");
+    expect(r).not.toBeNull();
+    expect((p as any).breaker.getState()).toBe("closed");
+  });
+});
+
+// ─────────────────────────── Healthcheck ─────────────────────────────────────
+
+describe("OpenAIEmbeddingsProvider — healthcheck", () => {
+  test("healthcheck pings GET /health when available", async () => {
+    const { fetchImpl, calls } = makeFetchSequence([
+      () =>
+        mockResponse(200, {
+          status: "ok",
+          model: "embeddinggemma:300m",
+        }),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+    });
+    const h = await p.healthcheck();
+    expect(h.ok).toBe(true);
+    expect(calls.length).toBe(1);
+    expect(calls[0]!.url).toBe("https://ai.example.com/health");
+    expect(calls[0]!.init!.method).toBe("GET");
+  });
+
+  test("healthcheck failure → falls through to embed probe", async () => {
+    const { fetchImpl } = makeFetchSequence([
+      () => mockResponse(404, "no /health"),
+      // Then fall back to /v1/embeddings probe
+      () => embeddingsResponse(["healthcheck"], 4),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+    });
+    const h = await p.healthcheck();
+    // 404 isn't an exception, it returns ok:false from the /health branch
+    // The fallback probe is only triggered on actual exceptions.
+    expect(h.ok).toBe(false);
+    expect(h.detail).toContain("404");
+  });
+});
+
+// ─────────────────────────── HttpError ───────────────────────────────────────
+
+describe("HttpError", () => {
+  test("preserves status and body preview", () => {
+    const err = new HttpError(429, "rate limit exceeded");
+    expect(err.status).toBe(429);
+    expect(err.bodyPreview).toBe("rate limit exceeded");
+    expect(err.message).toContain("HTTP 429");
+  });
+  test("truncates long bodies in message", () => {
+    const longBody = "x".repeat(500);
+    const err = new HttpError(500, longBody);
+    expect(err.message.length).toBeLessThan(longBody.length + 200);
+  });
+});
+
+// ─────────────────────────── dispose ─────────────────────────────────────────
+
+describe("OpenAIEmbeddingsProvider — dispose", () => {
+  test("dispose resets the breaker", async () => {
+    const { fetchImpl } = makeFetchSequence([
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"),
+      () => mockResponse(401, "fail"),
+    ]);
+    const p = new OpenAIEmbeddingsProvider({
+      endpoint: "https://ai.example.com",
+      fetchImpl,
+      batchSize: 1,
+      retryBackoffsMs: [],
+      sleep: async () => {},
+    });
+    await p.embedBatch(["a", "b", "c", "d"]);
+    expect(p.breaker.getState()).toBe("open");
+    await p.dispose();
+    expect(p.breaker.getState()).toBe("closed");
+  });
+});

+ 63 - 0
test/embedding-provider.test.ts

@@ -0,0 +1,63 @@
+/**
+ * embedding-provider.test.ts - Tests for the EmbeddingProvider abstraction
+ * (interface-level: model-id guard, ModelMismatchError).
+ */
+
+import { describe, test, expect } from "vitest";
+import {
+  ModelMismatchError,
+  assertModelCompatible,
+} from "../src/embedding/provider.js";
+
+describe("assertModelCompatible", () => {
+  test("empty existingModels → no throw (fresh DB)", () => {
+    expect(() => assertModelCompatible("anything", [])).not.toThrow();
+  });
+
+  test("matching model id → no throw", () => {
+    expect(() =>
+      assertModelCompatible("embeddinggemma", ["embeddinggemma"]),
+    ).not.toThrow();
+  });
+
+  test("matching one of several → no throw", () => {
+    expect(() =>
+      assertModelCompatible("embeddinggemma", ["embeddinggemma", "qwen3-embed"]),
+    ).not.toThrow();
+  });
+
+  test("mismatch throws ModelMismatchError", () => {
+    expect(() =>
+      assertModelCompatible("openai-text-embedding", ["embeddinggemma"]),
+    ).toThrow(ModelMismatchError);
+  });
+
+  test("error message lists existing models + provider model", () => {
+    try {
+      assertModelCompatible("provider-x", ["model-a", "model-b"]);
+      throw new Error("should have thrown");
+    } catch (err) {
+      expect(err).toBeInstanceOf(ModelMismatchError);
+      const e = err as ModelMismatchError;
+      expect(e.providerModel).toBe("provider-x");
+      expect(e.existingModels).toEqual(["model-a", "model-b"]);
+      expect(e.message).toContain("provider-x");
+      expect(e.message).toContain("model-a");
+      expect(e.message).toContain("qmd embed -f");
+      expect(e.message).toContain("QMD_EMBED_MODEL_ID");
+    }
+  });
+});
+
+describe("ModelMismatchError", () => {
+  test("name set correctly", () => {
+    const err = new ModelMismatchError("a", ["b"]);
+    expect(err.name).toBe("ModelMismatchError");
+  });
+
+  test("instanceof Error", () => {
+    const err = new ModelMismatchError("a", ["b"]);
+    expect(err).toBeInstanceOf(Error);
+    expect(err).toBeInstanceOf(ModelMismatchError);
+  });
+});

+ 216 - 0
test/embedding-store-integration.test.ts

@@ -0,0 +1,216 @@
+/**
+ * embedding-store-integration.test.ts - Tests for the
+ * generateEmbeddings() / EmbeddingProvider integration in store.ts.
+ *
+ * Uses an in-memory SQLite + a stub EmbeddingProvider to avoid loading
+ * node-llama-cpp models. Verifies:
+ *   - Provider's embedBatch is called when options.embedProvider is set
+ *   - Model-id guard throws ModelMismatchError on mismatch
+ *   - Force re-embed bypasses the guard
+ *   - getDistinctEmbeddingModels reads content_vectors correctly
+ */
+
+import { describe, test, expect, beforeEach, afterEach } from "vitest";
+import { mkdtempSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import {
+  createStore,
+  generateEmbeddings,
+  getDistinctEmbeddingModels,
+  insertEmbedding,
+  type Store,
+} from "../src/store.js";
+import {
+  ModelMismatchError,
+  type EmbeddingProvider,
+  type ProviderEmbedding,
+  type ProviderHealth,
+} from "../src/embedding/provider.js";
+
+// ─────────────────────────── Stub provider ───────────────────────────────────
+
+class StubProvider implements EmbeddingProvider {
+  readonly kind = "openai" as const;
+  readonly modelId: string;
+  readonly dim: number;
+  embedBatchCalls = 0;
+  embedCalls = 0;
+  totalTextsEmbedded = 0;
+
+  constructor(modelId: string, dim = 4) {
+    this.modelId = modelId;
+    this.dim = dim;
+  }
+
+  getModelId(): string {
+    return this.modelId;
+  }
+  getDimensions(): number | undefined {
+    return this.dim;
+  }
+  async healthcheck(): Promise<ProviderHealth> {
+    return { ok: true, model: this.modelId, dimensions: this.dim };
+  }
+  async embed(text: string): Promise<ProviderEmbedding | null> {
+    this.embedCalls++;
+    this.totalTextsEmbedded++;
+    return { embedding: this.fakeEmbed(text), model: this.modelId };
+  }
+  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
+    this.embedBatchCalls++;
+    this.totalTextsEmbedded += texts.length;
+    return texts.map((t) => ({ embedding: this.fakeEmbed(t), model: this.modelId }));
+  }
+  async dispose(): Promise<void> {}
+
+  private fakeEmbed(text: string): number[] {
+    return Array.from({ length: this.dim }, (_, i) => (text.length + i) * 0.01);
+  }
+}
+
+// ─────────────────────────── Test setup ──────────────────────────────────────
+
+let workDir: string;
+let store: Store;
+
+beforeEach(() => {
+  workDir = mkdtempSync(join(tmpdir(), "qmd-store-int-test-"));
+  process.env.INDEX_PATH = join(workDir, "index.sqlite");
+  store = createStore(process.env.INDEX_PATH);
+  // Insert content + documents with the bare-minimum schema. The content
+  // body needs to be non-empty so chunkDocumentByTokens emits at least one
+  // chunk per doc.
+  const now = "2026-04-27T00:00:00Z";
+  store.db
+    .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
+    .run("hash1", "Document one body content here that is long enough to chunk.", now);
+  store.db
+    .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
+    .run("hash2", "Document two body content there with different words to chunk.", now);
+  store.db
+    .prepare(
+      `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
+    )
+    .run("hash1", "test", "one.md", "One", now, now, 1);
+  store.db
+    .prepare(
+      `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
+    )
+    .run("hash2", "test", "two.md", "Two", now, now, 1);
+});
+
+afterEach(() => {
+  try {
+    store.close();
+  } catch { /* ignore */ }
+  delete process.env.INDEX_PATH;
+  rmSync(workDir, { recursive: true, force: true });
+});
+
+// ─────────────────────────── getDistinctEmbeddingModels ──────────────────────
+
+describe("getDistinctEmbeddingModels", () => {
+  test("returns [] when content_vectors is empty", () => {
+    expect(getDistinctEmbeddingModels(store.db)).toEqual([]);
+  });
+
+  test("returns distinct model strings", () => {
+    store.ensureVecTable(4);
+    insertEmbedding(store.db, "hash1", 0, 0, new Float32Array([0.1, 0.2, 0.3, 0.4]), "embeddinggemma", "2026-04-27T00:00:00Z");
+    insertEmbedding(store.db, "hash2", 0, 0, new Float32Array([0.5, 0.6, 0.7, 0.8]), "embeddinggemma", "2026-04-27T00:00:00Z");
+    expect(getDistinctEmbeddingModels(store.db)).toEqual(["embeddinggemma"]);
+  });
+
+  test("returns multiple distinct models when present", () => {
+    store.ensureVecTable(4);
+    insertEmbedding(store.db, "hash1", 0, 0, new Float32Array([0.1, 0.2, 0.3, 0.4]), "model-a", "2026-04-27T00:00:00Z");
+    insertEmbedding(store.db, "hash2", 0, 0, new Float32Array([0.5, 0.6, 0.7, 0.8]), "model-b", "2026-04-27T00:00:00Z");
+    const models = getDistinctEmbeddingModels(store.db).sort();
+    expect(models).toEqual(["model-a", "model-b"]);
+  });
+});
+
+// ─────────────────────────── generateEmbeddings + provider ───────────────────
+
+describe("generateEmbeddings with EmbeddingProvider", () => {
+  test("uses provider.embedBatch when supplied", async () => {
+    const provider = new StubProvider("embeddinggemma", 4);
+    const result = await generateEmbeddings(store, {
+      embedProvider: provider,
+      // Use small batches to keep test fast
+      maxDocsPerBatch: 64,
+    });
+    expect(result.docsProcessed).toBe(2);
+    expect(result.chunksEmbedded).toBeGreaterThan(0);
+    expect(result.errors).toBe(0);
+    expect(provider.embedBatchCalls + provider.embedCalls).toBeGreaterThan(0);
+    expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
+  });
+
+  test("model-id guard throws ModelMismatchError on mismatch", async () => {
+    // Pre-populate content_vectors with a different model id
+    store.ensureVecTable(4);
+    insertEmbedding(
+      store.db,
+      "hash1",
+      0,
+      0,
+      new Float32Array([0.1, 0.2, 0.3, 0.4]),
+      "old-model",
+      "2026-04-27T00:00:00Z",
+    );
+    const provider = new StubProvider("new-model", 4);
+    await expect(
+      generateEmbeddings(store, { embedProvider: provider }),
+    ).rejects.toBeInstanceOf(ModelMismatchError);
+  });
+
+  test("model-id matches → proceeds", async () => {
+    store.ensureVecTable(4);
+    insertEmbedding(
+      store.db,
+      "hash1",
+      0,
+      0,
+      new Float32Array([0.1, 0.2, 0.3, 0.4]),
+      "embeddinggemma",
+      "2026-04-27T00:00:00Z",
+    );
+    const provider = new StubProvider("embeddinggemma", 4);
+    const result = await generateEmbeddings(store, { embedProvider: provider });
+    // Only hash2 needs embedding (hash1 already has one)
+    expect(result.docsProcessed).toBeLessThanOrEqual(2);
+    expect(result.errors).toBe(0);
+  });
+
+  test("force=true bypasses model-id guard", async () => {
+    store.ensureVecTable(4);
+    insertEmbedding(
+      store.db,
+      "hash1",
+      0,
+      0,
+      new Float32Array([0.1, 0.2, 0.3, 0.4]),
+      "old-model",
+      "2026-04-27T00:00:00Z",
+    );
+    const provider = new StubProvider("new-model", 4);
+    // force=true wipes content_vectors first → guard sees empty → no throw
+    const result = await generateEmbeddings(store, {
+      embedProvider: provider,
+      force: true,
+    });
+    expect(result.docsProcessed).toBe(2);
+    expect(result.errors).toBe(0);
+    // Now only "new-model" should be in the DB
+    expect(getDistinctEmbeddingModels(store.db)).toEqual(["new-model"]);
+  });
+
+  test("empty DB → no guard issue, anything goes", async () => {
+    expect(getDistinctEmbeddingModels(store.db)).toEqual([]);
+    const provider = new StubProvider("anything-id", 4);
+    const result = await generateEmbeddings(store, { embedProvider: provider });
+    expect(result.errors).toBe(0);
+  });
+});