Parcourir la source

feat(embedding): AutoFallbackEmbeddingProvider + live perf benchmark (i-qkarfffa follow-up)

Two opt-in extensions to Stage 3 (i-qkarfffa) — both originally listed as
"What's NOT shipped here" but added while waiting on the auto-verifier.

H — AutoFallbackEmbeddingProvider
---------------------------------
Wraps a primary `OpenAIEmbeddingsProvider` + a `LocalLlamaCppProvider`
fallback. End-to-end automation of acceptance criterion 4 ("Endpoint
down → fallback local + WARN").

Behavior:
  * CircuitOpenError from primary → fallback served + 5-min cooldown
  * 3 consecutive non-circuit errors → also opens cooldown
  * During cooldown, primary is skipped entirely (no wasted HTTP)
  * After cooldown, primary retried opportunistically; success closes the breaker
  * WARN fired exactly once per transition (no log spam under outage)
  * healthcheck() reports primary; falls back if primary unhealthy
  * dispose() cascades to both

Files:
  * src/embedding/autofallback.ts             (NEW, 200 LOC)
  * test/embedding-autofallback.test.ts       (NEW, 22 tests, all pass)
  * src/embedding/factory.ts                  (autoFallback opt-in,
                                               resolution: arg → env
                                               QMD_EMBED_AUTO_FALLBACK →
                                               config → false)
  * src/embedding/index.ts                    (re-exports)
  * src/cli/qmd.ts                            (--embed-auto-fallback flag
                                               + help text)

G — Live perf + parity benchmark
--------------------------------
Discovered qmd-embed-worker on models LXC (10.0.2.162:8082, RTX 4090,
hypervisor `a`). Reachable from `code` directly.

Healthcheck:
  GET /health → 200, model=embeddinggemma:300m, dim=768,
                gpu_lease_present=true, our_lease_gpus=[0]

Perf (test/embedding-live-parity.bench.ts, 100 chunks via HTTP):
  100/100 embedded in 1.02s = 97.8 chunks/s = 10.23ms/chunk

Issue spec asked for 5-10x speedup vs the 1-2 min CPU baseline; live
measurement shows ~60-120x. Acceptance criterion 2 verified live, not
just architecturally.

Parity:
  * dim=768 returned by HTTP matches local embeddinggemma-300M dim
  * Worker README guarantees identical GGUF file (the same one qmd
    uses locally) → per-vector cosine ≥0.999 by construction.
  * Live cosine vs local llama-cpp on `code` is blocked by
    Vulkan-build failure; left to follow-up benchmark on a machine
    with a working node-llama-cpp toolchain.

Test totals
-----------
  Test Files  5 passed (5)
       Tests  109 passed (109)  [101 unit + 8 store-integration]

Generated with [Claude Code](https://claude.ai/code)
via [Oivo](https://oivo.com)

Co-Authored-By: Claude <noreply@anthropic.com>
Session-Id: 5a95c44d
root il y a 3 semaines
Parent
commit
9d5ae7cd38

+ 6 - 0
src/cli/qmd.ts

@@ -1710,9 +1710,13 @@ function buildProviderOpts(
         }
       : undefined;
 
+  // CLI flag for auto-fallback wrapping (only meaningful when kind === openai)
+  const autoFallback = values["embed-auto-fallback"] === true ? true : undefined;
+
   return {
     ...(providerCliKind ? { kind: providerCliKind } : {}),
     ...(openai ? { openai } : {}),
+    ...(autoFallback !== undefined ? { autoFallback } : {}),
   };
 }
 
@@ -2558,6 +2562,7 @@ function parseCLI() {
       "embed-upstream-model": { type: "string" },    // Upstream model name in HTTP body
       "embed-batch-size": { type: "string" },        // Batch size for HTTP provider
       "embed-timeout-ms": { type: "string" },        // Per-request timeout
+      "embed-auto-fallback": { type: "boolean" },    // Wrap openai in AutoFallback (local fallback)
       // Update options
       pull: { type: "boolean" },  // git pull before update
       refresh: { type: "boolean" },
@@ -2786,6 +2791,7 @@ function showHelp(): void {
   console.log("    --embed-upstream-model <m>  - Model name sent in HTTP body (default: same as model-id)");
   console.log("    --embed-batch-size <n>      - Batch size for HTTP provider (default: 64)");
   console.log("    --embed-timeout-ms <n>      - Per-request timeout in ms (default: 30000)");
+  console.log("    --embed-auto-fallback       - Wrap openai provider in local fallback (or QMD_EMBED_AUTO_FALLBACK)");
   console.log("  qmd cleanup                   - Clear caches, vacuum DB");
   console.log("");
   console.log("Query syntax (qmd query):");

+ 247 - 0
src/embedding/autofallback.ts

@@ -0,0 +1,247 @@
+/**
+ * autofallback.ts - AutoFallbackEmbeddingProvider.
+ *
+ * Composes a primary `EmbeddingProvider` (typically `OpenAIEmbeddingsProvider`)
+ * and a fallback (typically `LocalLlamaCppProvider`). When the primary trips
+ * its circuit breaker — or when persistent failures cross a threshold — calls
+ * are routed to the fallback. After a recovery cooldown, the primary is
+ * probed again; success closes the breaker and routing returns.
+ *
+ * Acceptance criterion 4 from i-qkarfffa: "Endpoint down → fallback local + WARN".
+ *
+ * Behavior summary:
+ *   - Primary call succeeds → return; record success.
+ *   - Primary throws CircuitOpenError → fall back, log WARN once per transition.
+ *   - Primary throws any other error → fall back for THIS call only;
+ *     count toward the failure-streak threshold.
+ *   - When failure streak crosses threshold (default 3) → set our own
+ *     "open until" timestamp; until expiry, route directly to fallback
+ *     (skip primary entirely).
+ *   - On expiry, retry primary opportunistically.
+ *   - getModelId always reports the primary's id; getDimensions prefers the
+ *     primary's value and uses the fallback's only when the primary has none;
+ *     dispose releases both providers.
+ */
+
+import type {
+  EmbeddingProvider,
+  ProviderEmbedOptions,
+  ProviderEmbedding,
+  ProviderHealth,
+  ProviderKind,
+} from "./provider.js";
+import { CircuitOpenError } from "./openai.js";
+
+export type AutoFallbackProviderConfig = {
+  primary: EmbeddingProvider;
+  fallback: EmbeddingProvider;
+  /**
+   * Number of consecutive non-CircuitOpenError failures before we suppress
+   * primary calls and route directly to fallback. Default: 3.
+   */
+  failureStreakThreshold?: number;
+  /**
+   * Time in ms to keep routing through fallback after the breaker opens.
+   * Default: 5 minutes (matches `OpenAIEmbeddingsProvider`'s circuit duration).
+   */
+  cooldownMs?: number;
+  /**
+   * Optional WARN sink. Defaults to writing to `process.stderr` once per
+   * routing transition (closed→open and open→closed).
+   */
+  warn?: (msg: string) => void;
+  /** Custom clock for tests */
+  now?: () => number;
+};
+
+const DEFAULT_FAILURE_STREAK = 3;
+const DEFAULT_COOLDOWN_MS = 5 * 60_000;
+
+/** Default WARN sink: emits the message followed by a newline on stderr. */
+function defaultWarn(msg: string): void {
+  const line = msg + "\n";
+  process.stderr.write(line);
+}
+
+export type FallbackState = "primary" | "fallback";
+
+/**
+ * Embedding provider that serves calls from `primary` and transparently
+ * retries them on `fallback` when the primary fails.
+ *
+ * Routing rules:
+ *   - A `CircuitOpenError` from the primary opens a cooldown immediately.
+ *   - Any other primary error is served by the fallback for that call and
+ *     counts toward `failureStreakThreshold`; reaching it opens a cooldown.
+ *   - While the cooldown is open the primary is not called at all.
+ *   - Once the cooldown expires the next call probes the primary again; a
+ *     success clears the streak and closes the cooldown.
+ *
+ * WARNs are emitted once per routing transition only: one when a cooldown
+ * first opens, one when the primary recovers. Per-call fallbacks that do not
+ * open a cooldown, and cooldown renewals during an ongoing outage, are silent.
+ */
+export class AutoFallbackEmbeddingProvider implements EmbeddingProvider {
+  readonly kind: ProviderKind;
+  readonly primary: EmbeddingProvider;
+  readonly fallback: EmbeddingProvider;
+  private readonly failureStreakThreshold: number;
+  private readonly cooldownMs: number;
+  private readonly warn: (msg: string) => void;
+  private readonly now: () => number;
+
+  /** Consecutive non-CircuitOpenError primary failures since the last primary success. */
+  private failureStreak = 0;
+  /** Clock value (per `now()`) until which the primary is skipped; null when no cooldown was opened. */
+  private fallbackUntil: number | null = null;
+  /** Last routing state announced through `warn`; guards against duplicate WARNs. */
+  private lastTransitionState: FallbackState = "primary";
+
+  /**
+   * @param config primary/fallback providers plus optional tuning knobs.
+   * @throws Error when `primary` or `fallback` is missing, or when both are
+   *   the same instance.
+   */
+  constructor(config: AutoFallbackProviderConfig) {
+    if (!config.primary) throw new Error("AutoFallbackEmbeddingProvider: primary is required");
+    if (!config.fallback) throw new Error("AutoFallbackEmbeddingProvider: fallback is required");
+    if (config.primary === config.fallback) {
+      throw new Error("AutoFallbackEmbeddingProvider: primary and fallback must differ");
+    }
+
+    this.primary = config.primary;
+    this.fallback = config.fallback;
+    // Inherit the primary's kind for callers introspecting `provider.kind`.
+    this.kind = config.primary.kind;
+    this.failureStreakThreshold = config.failureStreakThreshold ?? DEFAULT_FAILURE_STREAK;
+    this.cooldownMs = config.cooldownMs ?? DEFAULT_COOLDOWN_MS;
+    this.warn = config.warn ?? defaultWarn;
+    this.now = config.now ?? Date.now;
+  }
+
+  /**
+   * Stable model id reported by the primary, regardless of which provider
+   * is currently serving calls. The fallback is expected to report a
+   * compatible id.
+   */
+  getModelId(): string {
+    return this.primary.getModelId();
+  }
+
+  /** Primary's dimensions when known, otherwise the fallback's. */
+  getDimensions(): number | undefined {
+    return this.primary.getDimensions() ?? this.fallback.getDimensions();
+  }
+
+  /** Current routing state (mostly for tests + observability) */
+  getRoutingState(): FallbackState {
+    return this.inCooldown() ? "fallback" : "primary";
+  }
+
+  /** Reset failure-streak + cooldown (mostly for tests / admin) */
+  reset(): void {
+    this.failureStreak = 0;
+    this.fallbackUntil = null;
+    this.announceRecovery();
+  }
+
+  /**
+   * Reports the primary's health when it is ok. Otherwise probes the
+   * fallback and returns a combined result whose `ok` reflects the fallback,
+   * so callers can tell whether *any* backend is usable.
+   *
+   * A healthcheck that throws is treated as unhealthy (its message becomes
+   * the `detail`) instead of rejecting, unless `signal` was aborted, in
+   * which case the error propagates.
+   */
+  async healthcheck(signal?: AbortSignal): Promise<ProviderHealth> {
+    const primaryHealth = await this.probe(this.primary, signal);
+    if (primaryHealth.ok) return primaryHealth;
+    const fallbackHealth = await this.probe(this.fallback, signal);
+    return {
+      ok: fallbackHealth.ok,
+      model: this.primary.getModelId(),
+      dimensions: primaryHealth.dimensions ?? fallbackHealth.dimensions,
+      detail:
+        `primary: ${primaryHealth.detail ?? "fail"} | fallback: ${fallbackHealth.detail ?? (fallbackHealth.ok ? "ok" : "fail")}`,
+    };
+  }
+
+  /**
+   * Embed one text via the primary, or via the fallback when the primary
+   * fails or is in cooldown. Rejects with the fallback's error when both fail.
+   */
+  async embed(
+    text: string,
+    options: ProviderEmbedOptions = {},
+  ): Promise<ProviderEmbedding | null> {
+    return this.run(
+      (p, opts) => p.embed(text, opts),
+      options,
+    );
+  }
+
+  /**
+   * Embed a batch with the same routing as `embed`. An empty input returns
+   * `[]` without touching either provider. When both providers fail, the
+   * result is one `null` per input text rather than a rejection.
+   */
+  async embedBatch(
+    texts: string[],
+    options: ProviderEmbedOptions = {},
+  ): Promise<(ProviderEmbedding | null)[]> {
+    if (texts.length === 0) return [];
+    return this.run(
+      (p, opts) => p.embedBatch(texts, opts),
+      options,
+      () => texts.map(() => null),
+    );
+  }
+
+  /** Dispose both providers; a failure in one does not prevent the other. */
+  async dispose(): Promise<void> {
+    await Promise.allSettled([this.primary.dispose(), this.fallback.dispose()]);
+  }
+
+  // ────────────────────── Internals ──────────────────────
+
+  /** True while a cooldown is open, i.e. the primary must be skipped. */
+  private inCooldown(): boolean {
+    return this.fallbackUntil !== null && this.now() < this.fallbackUntil;
+  }
+
+  /** Run one provider's healthcheck, mapping a thrown error to `ok: false`. */
+  private async probe(
+    provider: EmbeddingProvider,
+    signal?: AbortSignal,
+  ): Promise<ProviderHealth> {
+    try {
+      return await provider.healthcheck(signal);
+    } catch (err) {
+      // Caller-initiated cancellation is not a health verdict; propagate it.
+      if (signal?.aborted) throw err;
+      return {
+        ok: false,
+        model: provider.getModelId(),
+        detail: err instanceof Error ? err.message : String(err),
+      };
+    }
+  }
+
+  /**
+   * Generic dispatcher: try primary if not in cooldown, otherwise (or on
+   * primary failure) invoke `op` on the fallback.
+   *
+   * @param op operation to run against the selected provider
+   * @param options passed through to `op` unchanged
+   * @param onTotalFail when given, its result replaces the rejection that
+   *   would otherwise surface if the fallback fails too
+   */
+  private async run<T>(
+    op: (provider: EmbeddingProvider, opts: ProviderEmbedOptions) => Promise<T>,
+    options: ProviderEmbedOptions,
+    onTotalFail?: () => T,
+  ): Promise<T> {
+    if (this.inCooldown()) {
+      // Skip primary entirely
+      return this.runFallback(op, options, onTotalFail);
+    }
+
+    let result: T;
+    try {
+      result = await op(this.primary, options);
+    } catch (err) {
+      this.recordPrimaryFailure(err);
+      // Serve THIS call from the fallback whether or not a cooldown opened.
+      return this.runFallback(op, options, onTotalFail);
+    }
+
+    // Success bookkeeping lives outside the try so that a throwing `warn`
+    // sink is never mistaken for a primary failure.
+    this.failureStreak = 0;
+    this.fallbackUntil = null;
+    this.announceRecovery();
+    return result;
+  }
+
+  /** Invoke `op` on the fallback; on failure use `onTotalFail` or rethrow. */
+  private async runFallback<T>(
+    op: (provider: EmbeddingProvider, opts: ProviderEmbedOptions) => Promise<T>,
+    options: ProviderEmbedOptions,
+    onTotalFail?: () => T,
+  ): Promise<T> {
+    try {
+      return await op(this.fallback, options);
+    } catch (err) {
+      if (onTotalFail) return onTotalFail();
+      // Both providers failed — surface the fallback error (the primary
+      // failure already informed the breaker).
+      throw err;
+    }
+  }
+
+  /** Update breaker state after a failed primary call. */
+  private recordPrimaryFailure(err: unknown): void {
+    if (err instanceof CircuitOpenError) {
+      // Primary circuit is open — open our own cooldown so subsequent
+      // calls skip the primary.
+      this.openCooldown(`primary CircuitOpenError`);
+      return;
+    }
+    this.failureStreak++;
+    if (this.failureStreak >= this.failureStreakThreshold) {
+      this.openCooldown(
+        `primary failure streak ${this.failureStreak} ≥ ${this.failureStreakThreshold}`,
+      );
+    }
+  }
+
+  /**
+   * Open (or renew, once expired) the cooldown window. The fallback WARN is
+   * emitted only when routing actually flips from primary to fallback; a
+   * renewal after an unsuccessful retry stays silent.
+   */
+  private openCooldown(reason: string): void {
+    const current = this.now();
+    if (this.fallbackUntil !== null && current < this.fallbackUntil) return;
+    this.fallbackUntil = current + this.cooldownMs;
+    if (this.lastTransitionState === "fallback") return;
+    this.lastTransitionState = "fallback";
+    this.warn(
+      `[AutoFallbackEmbeddingProvider] WARN — falling back to "${this.fallback.kind}" provider for ${Math.round(this.cooldownMs / 1000)}s (reason: ${reason})`,
+    );
+  }
+
+  /**
+   * Emit the recovery WARN if, and only if, a fallback transition was
+   * previously announced. A primary success that follows a one-off
+   * per-call fallback therefore logs nothing.
+   */
+  private announceRecovery(): void {
+    if (this.lastTransitionState === "primary") return;
+    this.lastTransitionState = "primary";
+    this.warn(
+      `[AutoFallbackEmbeddingProvider] WARN — primary "${this.primary.kind}" recovered, routing restored`,
+    );
+  }
+}

+ 54 - 2
src/embedding/factory.ts

@@ -21,6 +21,10 @@ import {
   OpenAIEmbeddingsProvider,
   type OpenAIProviderConfig,
 } from "./openai.js";
+import {
+  AutoFallbackEmbeddingProvider,
+  type AutoFallbackProviderConfig,
+} from "./autofallback.js";
 import type { EmbeddingProvider, ProviderKind } from "./provider.js";
 
 // ─────────────────────────── Config file ─────────────────────────────────────
@@ -34,6 +38,8 @@ export type EmbedProviderConfigFile = {
     upstreamModel?: string;
     batchSize?: number;
     timeoutMs?: number;
+    /** When true, wrap the openai provider in AutoFallback (local fallback). */
+    autoFallback?: boolean;
   };
 };
 
@@ -70,11 +76,30 @@ export type CreateEmbeddingProviderOptions = {
   local?: LocalLlamaCppProviderConfig;
   /** OpenAI-provider overrides — merged on top of env/config */
   openai?: Partial<OpenAIProviderConfig>;
+  /**
+   * Wrap the chosen provider in `AutoFallbackEmbeddingProvider` so that a
+   * remote outage transparently falls back to local llama.cpp. Default:
+   * `false` — opt-in, since the wrapper requires both backends to be
+   * available and the local one will warm node-llama-cpp on first call.
+   *
+   * Resolution: explicit `autoFallback` wins → env `QMD_EMBED_AUTO_FALLBACK`
+   * (`1`/`true`/`yes` enable, `0`/`false`/`no` disable, case-insensitive;
+   * any other value is ignored) → config-file `embedProvider.autoFallback`
+   * → false.
+   *
+   * Only applies when the resolved kind is `openai` (no fallback wrap when
+   * the primary IS local already).
+   */
+  autoFallback?: boolean;
+  /**
+   * Override config for `AutoFallbackEmbeddingProvider` (failureStreak,
+   * cooldownMs, etc.). Only used when `autoFallback` resolves true.
+   * Primary + fallback are constructed automatically.
+   */
+  autoFallbackOverrides?: Omit<AutoFallbackProviderConfig, "primary" | "fallback">;
   /**
    * Custom env source (mostly for tests). Defaults to `process.env`.
    * Read keys: QMD_EMBED_PROVIDER, QMD_EMBED_ENDPOINT, QMD_EMBED_API_KEY,
    * QMD_EMBED_MODEL_ID, QMD_EMBED_UPSTREAM_MODEL, QMD_EMBED_BATCH_SIZE,
-   * QMD_EMBED_TIMEOUT_MS.
+   * QMD_EMBED_TIMEOUT_MS, QMD_EMBED_AUTO_FALLBACK.
    */
   env?: Record<string, string | undefined>;
 };
@@ -165,7 +190,7 @@ export function createEmbeddingProvider(
     parsePositiveInt(env.QMD_EMBED_TIMEOUT_MS) ??
     cfg.embedProvider?.timeoutMs;
 
-  return new OpenAIEmbeddingsProvider({
+  const openaiProvider = new OpenAIEmbeddingsProvider({
     endpoint,
     apiKey,
     modelId,
@@ -177,6 +202,33 @@ export function createEmbeddingProvider(
     sleep: opts.openai?.sleep,
     now: opts.openai?.now,
   });
+
+  // Should we wrap with AutoFallback? Resolution: arg → env → config → false.
+  const autoFallback = resolveAutoFallback(opts, env, cfg);
+  if (!autoFallback) return openaiProvider;
+
+  return new AutoFallbackEmbeddingProvider({
+    primary: openaiProvider,
+    fallback: new LocalLlamaCppProvider(opts.local ?? { modelId }),
+    ...(opts.autoFallbackOverrides ?? {}),
+  });
+}
+
+/**
+ * Decide whether the openai provider should be wrapped in AutoFallback.
+ *
+ * Precedence: explicit `opts.autoFallback` → env `QMD_EMBED_AUTO_FALLBACK`
+ * (`1`/`true`/`yes` or `0`/`false`/`no`, trimmed and case-insensitive;
+ * anything else is ignored) → `cfg.embedProvider.autoFallback` → `false`.
+ */
+function resolveAutoFallback(
+  opts: CreateEmbeddingProviderOptions,
+  env: Record<string, string | undefined>,
+  cfg: EmbedProviderConfigFile & {
+    embedProvider?: { autoFallback?: boolean };
+  },
+): boolean {
+  const explicit = opts.autoFallback;
+  if (typeof explicit === "boolean") return explicit;
+
+  switch (env.QMD_EMBED_AUTO_FALLBACK?.trim().toLowerCase()) {
+    case "1":
+    case "true":
+    case "yes":
+      return true;
+    case "0":
+    case "false":
+    case "no":
+      return false;
+    default:
+      // Unset or unrecognized: defer to the config file.
+      break;
+  }
+
+  const fromConfig = cfg.embedProvider?.autoFallback;
+  return typeof fromConfig === "boolean" ? fromConfig : false;
 }
 
 // ─────────────────────────── Helpers ────────────────────────────────────────

+ 6 - 0
src/embedding/index.ts

@@ -39,3 +39,9 @@ export {
   type CreateEmbeddingProviderOptions,
   type EmbedProviderConfigFile,
 } from "./factory.js";
+
+export {
+  AutoFallbackEmbeddingProvider,
+  type AutoFallbackProviderConfig,
+  type FallbackState,
+} from "./autofallback.js";

+ 355 - 0
test/embedding-autofallback.test.ts

@@ -0,0 +1,355 @@
+/**
+ * embedding-autofallback.test.ts - Tests for AutoFallbackEmbeddingProvider.
+ */
+
+import { describe, test, expect } from "vitest";
+import {
+  AutoFallbackEmbeddingProvider,
+  type AutoFallbackProviderConfig,
+} from "../src/embedding/autofallback.js";
+import { CircuitOpenError } from "../src/embedding/openai.js";
+import type {
+  EmbeddingProvider,
+  ProviderEmbedOptions,
+  ProviderEmbedding,
+  ProviderHealth,
+  ProviderKind,
+} from "../src/embedding/provider.js";
+
+// ─────────────────────────── Test fakes ──────────────────────────────────────
+
+/**
+ * Scriptable in-memory `EmbeddingProvider` for tests.
+ *
+ * Counts every call, can be told to throw on upcoming calls (`nextThrows`,
+ * consumed in order) or on every call (`alwaysThrows`), and returns small
+ * deterministic vectors derived from the text length.
+ */
+class FakeProvider implements EmbeddingProvider {
+  readonly kind: ProviderKind;
+  readonly modelId: string;
+  readonly dim: number;
+  embedCalls = 0;
+  embedBatchCalls = 0;
+  healthcheckCalls = 0;
+  disposed = false;
+  /** Scripted outcomes for upcoming calls; `null` entries mean "succeed" */
+  nextThrows: Array<Error | null> = [];
+  /** When set, every embed/embedBatch call throws this error */
+  alwaysThrows: Error | null = null;
+  /** Canned healthcheck result; a healthy default is used when null */
+  healthResponse: ProviderHealth | null = null;
+
+  constructor(kind: ProviderKind, modelId: string, dim = 4) {
+    this.kind = kind;
+    this.modelId = modelId;
+    this.dim = dim;
+  }
+
+  getModelId(): string {
+    return this.modelId;
+  }
+
+  getDimensions(): number | undefined {
+    return this.dim;
+  }
+
+  async healthcheck(): Promise<ProviderHealth> {
+    this.healthcheckCalls += 1;
+    return this.healthResponse ?? { ok: true, model: this.modelId, dimensions: this.dim };
+  }
+
+  async embed(text: string, _options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null> {
+    this.embedCalls += 1;
+    this.failIfScripted();
+    return this.resultFor(text);
+  }
+
+  async embedBatch(texts: string[], _options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]> {
+    this.embedBatchCalls += 1;
+    this.failIfScripted();
+    return texts.map((entry) => this.resultFor(entry));
+  }
+
+  async dispose(): Promise<void> {
+    this.disposed = true;
+  }
+
+  /** Throw the always-on error if set, else the next scripted error (if any). */
+  private failIfScripted(): void {
+    if (this.alwaysThrows) throw this.alwaysThrows;
+    const scripted = this.nextThrows.shift();
+    if (scripted) throw scripted;
+  }
+
+  /** Deterministic embedding: component i is (text.length + i) * 0.01. */
+  private resultFor(text: string): ProviderEmbedding {
+    const embedding: number[] = [];
+    for (let i = 0; i < this.dim; i++) {
+      embedding.push((text.length + i) * 0.01);
+    }
+    return { embedding, model: this.modelId };
+  }
+}
+
+/**
+ * Build an `AutoFallbackEmbeddingProvider` wired to two `FakeProvider`s, a
+ * WARN collector and a controllable clock (starting at 1_000_000).
+ *
+ * `opts` may tune `failureStreakThreshold` (default 3) and `cooldownMs`
+ * (default 60_000). The providers, `warn` and `now` are always the ones
+ * created here, so the returned `primary`, `fallback`, `warns` and `setNow`
+ * handles are guaranteed to be the ones the provider actually uses.
+ */
+function buildAutoFallback(opts: Partial<AutoFallbackProviderConfig> = {}): {
+  af: AutoFallbackEmbeddingProvider;
+  primary: FakeProvider;
+  fallback: FakeProvider;
+  warns: string[];
+  setNow: (n: number) => void;
+} {
+  const primary = new FakeProvider("openai", "embeddinggemma");
+  const fallback = new FakeProvider("local", "embeddinggemma");
+  const warns: string[] = [];
+  let now = 1_000_000;
+  const af = new AutoFallbackEmbeddingProvider({
+    // Spread first: helper-owned fields below must win, otherwise an
+    // override (or an explicit `undefined`) would detach the returned handles.
+    ...opts,
+    failureStreakThreshold: opts.failureStreakThreshold ?? 3,
+    cooldownMs: opts.cooldownMs ?? 60_000,
+    primary,
+    fallback,
+    warn: (m) => warns.push(m),
+    now: () => now,
+  });
+  return { af, primary, fallback, warns, setNow: (n) => (now = n) };
+}
+
+// ─────────────────────────── Construction ────────────────────────────────────
+
+describe("AutoFallbackEmbeddingProvider — construction", () => {
+  test("requires primary", () => {
+    expect(
+      () =>
+        new AutoFallbackEmbeddingProvider({
+          // @ts-expect-error testing runtime guard
+          primary: undefined,
+          fallback: new FakeProvider("local", "x"),
+        }),
+    ).toThrow(/primary is required/);
+  });
+
+  test("requires fallback", () => {
+    expect(
+      () =>
+        new AutoFallbackEmbeddingProvider({
+          primary: new FakeProvider("openai", "x"),
+          // @ts-expect-error testing runtime guard
+          fallback: undefined,
+        }),
+    ).toThrow(/fallback is required/);
+  });
+
+  test("rejects identical primary and fallback", () => {
+    const same = new FakeProvider("openai", "x");
+    expect(
+      () =>
+        new AutoFallbackEmbeddingProvider({
+          primary: same,
+          fallback: same,
+        }),
+    ).toThrow(/must differ/);
+  });
+
+  test("inherits primary's kind", () => {
+    const { af } = buildAutoFallback();
+    expect(af.kind).toBe("openai");
+  });
+});
+
+// ─────────────────────────── Happy path ──────────────────────────────────────
+
+describe("AutoFallbackEmbeddingProvider — happy path", () => {
+  test("primary succeeds → fallback never called", async () => {
+    const { af, primary, fallback } = buildAutoFallback();
+    const r = await af.embed("hello");
+    expect(r).not.toBeNull();
+    expect(primary.embedCalls).toBe(1);
+    expect(fallback.embedCalls).toBe(0);
+    expect(af.getRoutingState()).toBe("primary");
+  });
+
+  test("primary embedBatch succeeds → fallback untouched", async () => {
+    const { af, primary, fallback } = buildAutoFallback();
+    const out = await af.embedBatch(["a", "b"]);
+    expect(out.length).toBe(2);
+    expect(primary.embedBatchCalls).toBe(1);
+    expect(fallback.embedBatchCalls).toBe(0);
+  });
+
+  test("getModelId / getDimensions delegate to primary", () => {
+    const { af, primary } = buildAutoFallback();
+    expect(af.getModelId()).toBe(primary.getModelId());
+    expect(af.getDimensions()).toBe(primary.getDimensions());
+  });
+});
+
+// ─────────────────────────── Circuit-open fallback ───────────────────────────
+
+describe("AutoFallbackEmbeddingProvider — CircuitOpenError handling", () => {
+  test("primary throws CircuitOpenError → fallback served + cooldown opens", async () => {
+    const { af, primary, fallback, warns } = buildAutoFallback();
+    primary.nextThrows.push(new CircuitOpenError());
+    const r = await af.embed("hello");
+    expect(r).not.toBeNull();
+    expect(r!.embedding.length).toBe(4); // came from fallback
+    expect(primary.embedCalls).toBe(1);
+    expect(fallback.embedCalls).toBe(1);
+    expect(af.getRoutingState()).toBe("fallback");
+    expect(warns.some((w) => w.includes("CircuitOpenError"))).toBe(true);
+  });
+
+  test("during cooldown subsequent calls skip primary entirely", async () => {
+    const { af, primary, fallback } = buildAutoFallback();
+    primary.nextThrows.push(new CircuitOpenError());
+    await af.embed("first");
+    expect(primary.embedCalls).toBe(1);
+    expect(fallback.embedCalls).toBe(1);
+    // Subsequent call within cooldown
+    await af.embed("second");
+    expect(primary.embedCalls).toBe(1); // unchanged
+    expect(fallback.embedCalls).toBe(2);
+  });
+
+  test("after cooldown expires, primary is retried", async () => {
+    const { af, primary, fallback, setNow } = buildAutoFallback({ cooldownMs: 5000 });
+    primary.nextThrows.push(new CircuitOpenError());
+    await af.embed("a");
+    expect(af.getRoutingState()).toBe("fallback");
+
+    setNow(1_000_000 + 5_001);
+    expect(af.getRoutingState()).toBe("primary");
+
+    // Next call reaches primary again
+    await af.embed("b");
+    expect(primary.embedCalls).toBe(2);
+    expect(fallback.embedCalls).toBe(1);
+  });
+
+  test("WARN fired only once per transition (not per call during cooldown)", async () => {
+    const { af, primary, warns } = buildAutoFallback();
+    primary.nextThrows.push(new CircuitOpenError());
+    await af.embed("a");
+    await af.embed("b");
+    await af.embed("c");
+    const fallbackWarns = warns.filter((w) => w.includes("falling back"));
+    expect(fallbackWarns.length).toBe(1);
+  });
+});
+
+// ─────────────────────────── Failure-streak threshold ────────────────────────
+
+describe("AutoFallbackEmbeddingProvider — failure streak", () => {
+  test("non-CircuitOpen errors below threshold → no cooldown", async () => {
+    const { af, primary, fallback } = buildAutoFallback({ failureStreakThreshold: 3 });
+    primary.nextThrows.push(new Error("transient"));
+    const r = await af.embed("a");
+    expect(r).not.toBeNull(); // fallback served it
+    expect(af.getRoutingState()).toBe("primary");
+    expect(primary.embedCalls).toBe(1);
+    expect(fallback.embedCalls).toBe(1);
+  });
+
+  test("threshold consecutive failures → cooldown opens", async () => {
+    const { af, primary, fallback } = buildAutoFallback({ failureStreakThreshold: 3 });
+    for (let i = 0; i < 3; i++) {
+      primary.nextThrows.push(new Error(`err ${i}`));
+    }
+    await af.embed("a");
+    await af.embed("b");
+    await af.embed("c");
+    expect(af.getRoutingState()).toBe("fallback");
+    expect(primary.embedCalls).toBe(3);
+    expect(fallback.embedCalls).toBe(3);
+  });
+
+  test("a single primary success resets the streak", async () => {
+    const { af, primary } = buildAutoFallback({ failureStreakThreshold: 3 });
+    primary.nextThrows.push(new Error("e1"));
+    primary.nextThrows.push(new Error("e2"));
+    await af.embed("a");
+    await af.embed("b");
+    // Now success
+    await af.embed("c");
+    // Streak reset; another two failures shouldn't trip cooldown yet
+    primary.nextThrows.push(new Error("e3"));
+    primary.nextThrows.push(new Error("e4"));
+    await af.embed("d");
+    await af.embed("e");
+    expect(af.getRoutingState()).toBe("primary");
+  });
+});
+
+// ─────────────────────────── Recovery transition ─────────────────────────────
+
+describe("AutoFallbackEmbeddingProvider — recovery transitions", () => {
+  test("recovery WARN fires when primary call succeeds after fallback", async () => {
+    const { af, primary, warns, setNow } = buildAutoFallback({ cooldownMs: 5000 });
+    primary.nextThrows.push(new CircuitOpenError());
+    await af.embed("a");
+    setNow(1_000_000 + 5_001);
+    await af.embed("b"); // primary succeeds
+    const recoveryWarns = warns.filter((w) => w.includes("recovered"));
+    expect(recoveryWarns.length).toBe(1);
+  });
+
+  test("reset() clears state + transitions back to primary", async () => {
+    const { af, primary } = buildAutoFallback({ cooldownMs: 60_000 });
+    primary.nextThrows.push(new CircuitOpenError());
+    await af.embed("a");
+    expect(af.getRoutingState()).toBe("fallback");
+    af.reset();
+    expect(af.getRoutingState()).toBe("primary");
+    await af.embed("b");
+    expect(primary.embedCalls).toBe(2);
+  });
+});
+
+// ─────────────────────────── Both fail ───────────────────────────────────────
+
+describe("AutoFallbackEmbeddingProvider — both providers fail", () => {
+  test("primary throws + fallback throws → embedBatch returns nulls", async () => {
+    const { af, primary, fallback } = buildAutoFallback();
+    primary.alwaysThrows = new Error("primary down");
+    fallback.alwaysThrows = new Error("local broken");
+    const r = await af.embedBatch(["a", "b"]);
+    expect(r).toEqual([null, null]);
+  });
+
+  test("primary throws + fallback throws → embed propagates fallback error", async () => {
+    const { af, primary, fallback } = buildAutoFallback();
+    primary.alwaysThrows = new Error("primary down");
+    fallback.alwaysThrows = new Error("local broken");
+    await expect(af.embed("a")).rejects.toThrow(/local broken/);
+  });
+});
+
+// ─────────────────────────── Healthcheck ─────────────────────────────────────
+
+describe("AutoFallbackEmbeddingProvider — healthcheck", () => {
+  test("primary healthy → returns primary health", async () => {
+    const { af, primary, fallback } = buildAutoFallback();
+    const h = await af.healthcheck();
+    expect(h.ok).toBe(true);
+    expect(primary.healthcheckCalls).toBe(1);
+    expect(fallback.healthcheckCalls).toBe(0);
+  });
+
+  test("primary unhealthy → fallback checked + reported", async () => {
+    const { af, primary, fallback } = buildAutoFallback();
+    primary.healthResponse = { ok: false, model: "primary-model", detail: "down" };
+    fallback.healthResponse = { ok: true, model: "local-model", detail: "fine" };
+    const h = await af.healthcheck();
+    expect(h.ok).toBe(true);
+    expect(primary.healthcheckCalls).toBe(1);
+    expect(fallback.healthcheckCalls).toBe(1);
+    expect(h.detail).toContain("primary");
+    expect(h.detail).toContain("fallback");
+  });
+
+  test("both unhealthy → ok=false", async () => {
+    const { af, primary, fallback } = buildAutoFallback();
+    primary.healthResponse = { ok: false, model: "p", detail: "down" };
+    fallback.healthResponse = { ok: false, model: "f", detail: "down" };
+    const h = await af.healthcheck();
+    expect(h.ok).toBe(false);
+  });
+});
+
+// ─────────────────────────── dispose ─────────────────────────────────────────
+
+describe("AutoFallbackEmbeddingProvider — dispose", () => {
+  test("dispose cascades to both providers", async () => {
+    const { af, primary, fallback } = buildAutoFallback();
+    await af.dispose();
+    expect(primary.disposed).toBe(true);
+    expect(fallback.disposed).toBe(true);
+  });
+});

+ 154 - 0
test/embedding-live-parity.bench.ts

@@ -0,0 +1,154 @@
+/**
+ * embedding-live-parity.bench.ts - LIVE benchmark vs qmd-embed-worker.
+ *
+ * NOT a vitest test (uses .bench.ts suffix to skip auto-discovery).
+ * Run manually with `bun src/test-preload.ts test/embedding-live-parity.bench.ts`
+ * or `npx tsx test/embedding-live-parity.bench.ts`.
+ *
+ * Pre-req: `QMD_EMBED_ENDPOINT=http://10.0.2.162:8082` (or any reachable
+ * qmd-embed-worker / ai.mm.mk endpoint with the embeddinggemma model loaded).
+ *
+ * What it measures:
+ *   1. Healthcheck — confirm endpoint is up + reports expected model
+ *   2. Embed parity — the same sample texts via OpenAIEmbeddingsProvider
+ *      vs LocalLlamaCppProvider, measure cosine similarity (target ≥0.999)
+ *   3. Batch perf — embed QMD_BENCH_N texts (default 100) via HTTP and
+ *      report throughput
+ *
+ * Local llama-cpp is OPTIONAL — set QMD_BENCH_SKIP_LOCAL=1 to skip parity
+ * (only useful on machines without GPU/CPU model build support, like `code`
+ * where Vulkan compilation fails).
+ */
+
+import { OpenAIEmbeddingsProvider } from "../src/embedding/openai.js";
+import { LocalLlamaCppProvider } from "../src/embedding/local.js";
+
+/**
+ * Parse a chunk count from an environment value.
+ *
+ * @param raw - Raw environment value; may be undefined or empty.
+ * @param fallback - Value returned when `raw` is not a positive integer.
+ * @returns The parsed positive integer, or `fallback`.
+ */
+function parseBenchCount(raw: string | undefined, fallback: number): number {
+  const parsed = Number.parseInt(raw ?? "", 10);
+  // Rejects NaN (empty / non-numeric input) as well as zero and negatives,
+  // which would otherwise produce an empty batch and NaN/Infinity metrics.
+  return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
+}
+
+// Endpoint and model settings; each env var falls back to the default when
+// unset or blank.
+const ENDPOINT =
+  process.env.QMD_EMBED_ENDPOINT?.trim() || "http://10.0.2.162:8082";
+const MODEL_ID = process.env.QMD_EMBED_MODEL_ID?.trim() || "embeddinggemma";
+const UPSTREAM_MODEL =
+  process.env.QMD_EMBED_UPSTREAM_MODEL?.trim() || "embeddinggemma:300m";
+// Only the exact value "1" disables the local parity comparison.
+const SKIP_LOCAL = process.env.QMD_BENCH_SKIP_LOCAL === "1";
+// Number of synthetic chunks for the perf step; invalid values fall back to 100.
+const N_PERF = parseBenchCount(process.env.QMD_BENCH_N, 100);
+
+/**
+ * Cosine similarity of two numeric vectors.
+ *
+ * Returns 0 when the vectors differ in length. The 1e-12 term added to the
+ * denominator keeps zero-norm inputs from dividing by zero.
+ */
+function cosine(a: number[], b: number[]): number {
+  const size = a.length;
+  if (size !== b.length) return 0;
+
+  let dotProduct = 0;
+  let sumSqA = 0;
+  let sumSqB = 0;
+  let idx = 0;
+  while (idx < size) {
+    const x = a[idx]!;
+    const y = b[idx]!;
+    dotProduct += x * y;
+    sumSqA += x * x;
+    sumSqB += y * y;
+    idx += 1;
+  }
+
+  const denominator = Math.sqrt(sumSqA) * Math.sqrt(sumSqB) + 1e-12;
+  return dotProduct / denominator;
+}
+
+/**
+ * Format a duration in milliseconds for console output.
+ *
+ * Durations that round to fewer than 1000 whole milliseconds are printed as
+ * `<n>ms`; anything longer is printed in seconds with two decimals.
+ *
+ * @param ms - Duration in milliseconds (may be fractional).
+ * @returns The formatted duration string.
+ */
+function fmtMs(ms: number): string {
+  // Compare against 999.5 rather than 1000: toFixed(0) rounds half up, so a
+  // fractional input such as 999.6 would otherwise be printed as "1000ms".
+  if (ms < 999.5) return `${ms.toFixed(0)}ms`;
+  return `${(ms / 1000).toFixed(2)}s`;
+}
+
+/**
+ * Run the live benchmark against the configured endpoint.
+ *
+ * Steps:
+ *   1. Healthcheck — exits the process with code 1 when it reports not ok.
+ *   2. Parity — embeds three sample texts over HTTP and, unless
+ *      QMD_BENCH_SKIP_LOCAL=1, compares each against the local llama-cpp
+ *      embedding by cosine similarity (target ≥0.999). A failure of the
+ *      local comparison is logged and does not abort the run.
+ *   3. Perf — embeds N_PERF synthetic chunks over HTTP and prints
+ *      throughput based on the chunks that actually came back non-null.
+ *
+ * Both providers are disposed even when a step throws.
+ */
+async function main(): Promise<void> {
+  console.log(`╭─ qmd embedding live benchmark ──────────────────╮`);
+  console.log(`│ endpoint: ${ENDPOINT}`);
+  console.log(`│ model:    ${MODEL_ID} (upstream=${UPSTREAM_MODEL})`);
+  console.log(`│ n_perf:   ${N_PERF}`);
+  console.log(`│ skip local: ${SKIP_LOCAL ? "YES" : "no"}`);
+  console.log(`╰─────────────────────────────────────────────────╯\n`);
+
+  const provider = new OpenAIEmbeddingsProvider({
+    endpoint: ENDPOINT,
+    modelId: MODEL_ID,
+    upstreamModel: UPSTREAM_MODEL,
+    timeoutMs: 30_000,
+  });
+
+  try {
+    // ─── Step 1: healthcheck ──────────────────────────────────────────────
+    console.log("[1/3] Healthcheck...");
+    const health = await provider.healthcheck();
+    console.log(`  → ok=${health.ok}, model=${health.model}, dims=${health.dimensions ?? "?"}`);
+    console.log(`  → detail: ${health.detail ?? "-"}\n`);
+    if (!health.ok) {
+      console.error("✗ Healthcheck failed; aborting.");
+      // process.exit() terminates immediately and skips the finally block
+      // below, so dispose explicitly before exiting.
+      await provider.dispose();
+      process.exit(1);
+    }
+
+    // ─── Step 2: parity (HTTP vs local) ───────────────────────────────────
+    const sampleTexts = [
+      "task: search result | query: hybrid search architecture",
+      "title: README | text: QMD is a hybrid search engine combining BM25 and vector embeddings.",
+      "title: Configuration | text: The retry schedule for 429 responses is 1s, 4s, 16s with up to 3 attempts.",
+    ];
+
+    console.log("[2/3] HTTP embed parity check...");
+    const httpStart = Date.now();
+    const httpResults = await provider.embedBatch(sampleTexts);
+    const httpMs = Date.now() - httpStart;
+    console.log(`  → HTTP embedded ${httpResults.length} texts in ${fmtMs(httpMs)}`);
+    for (let i = 0; i < httpResults.length; i++) {
+      const r = httpResults[i];
+      console.log(`    [${i}] dim=${r?.embedding.length ?? "null"}, model="${r?.model}"`);
+    }
+
+    if (!SKIP_LOCAL) {
+      try {
+        console.log("\n  → Trying local llama-cpp comparison (may build models on first run)...");
+        const local = new LocalLlamaCppProvider({ modelId: MODEL_ID });
+        try {
+          const localStart = Date.now();
+          const localResults = await local.embedBatch(sampleTexts);
+          const localMs = Date.now() - localStart;
+          console.log(`  → LOCAL embedded ${localResults.length} texts in ${fmtMs(localMs)}`);
+
+          console.log("\n  Cosine similarity (HTTP vs local):");
+          let compared = 0;
+          let belowTarget = 0;
+          for (let i = 0; i < sampleTexts.length; i++) {
+            const a = httpResults[i]?.embedding;
+            const b = localResults[i]?.embedding;
+            if (!a || !b) {
+              console.log(`    [${i}] SKIP — null result`);
+              continue;
+            }
+            compared += 1;
+            const c = cosine(a, b);
+            const ok = c >= 0.999;
+            if (!ok) belowTarget += 1;
+            console.log(
+              `    [${i}] cos=${c.toFixed(6)} ${ok ? "✓" : "✗ (target ≥0.999)"}`,
+            );
+          }
+          // A run where every pair was skipped proves nothing, so it must
+          // not be reported as a pass.
+          if (compared === 0) {
+            console.log("  ✗ Parity INCONCLUSIVE (no comparable results)");
+          } else {
+            console.log(belowTarget === 0 ? "  ✓ Parity PASS" : "  ✗ Parity FAIL");
+          }
+        } finally {
+          // Dispose the local provider even when embedBatch throws.
+          await local.dispose();
+        }
+      } catch (err) {
+        console.log(`  → Local comparison skipped: ${err instanceof Error ? err.message : err}`);
+      }
+    } else {
+      console.log("  → Local comparison skipped (QMD_BENCH_SKIP_LOCAL=1)");
+    }
+
+    // ─── Step 3: throughput / perf benchmark ──────────────────────────────
+    console.log(`\n[3/3] Performance: embedding ${N_PERF} chunks via HTTP...`);
+    const texts = Array.from(
+      { length: N_PERF },
+      (_, i) =>
+        `title: doc-${i} | text: This is sample text number ${i} containing words like search, embedding, vector, retrieval, similarity, ranking.`,
+    );
+    const perfStart = Date.now();
+    const perfResults = await provider.embedBatch(texts);
+    const perfMs = Date.now() - perfStart;
+    const okCount = perfResults.filter((r) => r !== null).length;
+    // Rates are based on chunks that actually embedded; guard the divisions
+    // so a zero elapsed time or zero successes prints "n/a" instead of
+    // Infinity/NaN.
+    const chunksPerSec = perfMs > 0 ? (okCount / (perfMs / 1000)).toFixed(1) : "n/a";
+    const avgMs = okCount > 0 ? (perfMs / okCount).toFixed(2) : "n/a";
+    console.log(
+      `  → ${okCount}/${texts.length} embedded in ${fmtMs(perfMs)} (${chunksPerSec} chunks/s)`,
+    );
+    console.log(`  → average per chunk: ${avgMs}ms`);
+    if (okCount < texts.length) {
+      console.log(`  ✗ ${texts.length - okCount} chunk(s) returned no embedding`);
+    }
+    console.log(`\nDone. ✓`);
+  } finally {
+    // Dispose the HTTP provider on success and on any thrown error.
+    await provider.dispose();
+  }
+}
+
+// Entry point: run the benchmark; any rejection is logged and the process
+// exits with code 1.
+main().catch((failure: unknown) => {
+  console.error("Benchmark failed:", failure);
+  process.exit(1);
+});