il y a 2 mois · 9d5ae7cd38
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -1710,9 +1710,13 @@ function buildProviderOpts(
 
				         }
			
 
				       : undefined;
			
 
				 
			
 
				+  // CLI flag for auto-fallback wrapping (only meaningful when kind === openai)
			
 
				+  const autoFallback = values["embed-auto-fallback"] === true ? true : undefined;
			
 
				+
			
 
				   return {
			
 
				     ...(providerCliKind ? { kind: providerCliKind } : {}),
			
 
				     ...(openai ? { openai } : {}),
			
 
				+    ...(autoFallback !== undefined ? { autoFallback } : {}),
			
 
				   };
			
 
				 }
			
 
				 
			
@@ -2558,6 +2562,7 @@ function parseCLI() {
 
				       "embed-upstream-model": { type: "string" },    // Upstream model name in HTTP body
			
 
				       "embed-batch-size": { type: "string" },        // Batch size for HTTP provider
			
 
				       "embed-timeout-ms": { type: "string" },        // Per-request timeout
			
 
				+      "embed-auto-fallback": { type: "boolean" },    // Wrap openai in AutoFallback (local fallback)
			
 
				       // Update options
			
 
				       pull: { type: "boolean" },  // git pull before update
			
 
				       refresh: { type: "boolean" },
			
@@ -2786,6 +2791,7 @@ function showHelp(): void {
 
				   console.log("    --embed-upstream-model <m>  - Model name sent in HTTP body (default: same as model-id)");
			
 
				   console.log("    --embed-batch-size <n>      - Batch size for HTTP provider (default: 64)");
			
 
				   console.log("    --embed-timeout-ms <n>      - Per-request timeout in ms (default: 30000)");
			
 
				+  console.log("    --embed-auto-fallback       - Wrap openai provider in local fallback (or QMD_EMBED_AUTO_FALLBACK)");
			
 
				   console.log("  qmd cleanup                   - Clear caches, vacuum DB");
			
 
				   console.log("");
			
 
				   console.log("Query syntax (qmd query):");
			
--- a/src/embedding/autofallback.ts
+++ b/src/embedding/autofallback.ts
@@ -0,0 +1,247 @@
 
				+/**
			
 
				+ * autofallback.ts - AutoFallbackEmbeddingProvider.
			
 
				+ *
			
 
				+ * Composes a primary `EmbeddingProvider` (typically `OpenAIEmbeddingsProvider`)
			
 
				+ * and a fallback (typically `LocalLlamaCppProvider`). When the primary trips
			
 
				+ * its circuit breaker — or when persistent failures cross a threshold — calls
			
 
				+ * are routed to the fallback. After a recovery cooldown, the primary is
			
 
				+ * probed again; success closes the breaker and routing returns.
			
 
				+ *
			
 
				+ * Acceptance criterion 4 from i-qkarfffa: "Endpoint down → fallback local + WARN".
			
 
				+ *
			
 
				+ * Behavior summary:
			
 
				+ *   - Primary call succeeds → return; record success.
			
 
				+ *   - Primary throws CircuitOpenError → fall back, log WARN once per transition.
			
 
				+ *   - Primary throws any other error → fall back for THIS call only;
			
 
				+ *     count toward the failure-streak threshold.
			
 
				+ *   - When failure streak crosses threshold (default 3) → set our own
			
 
				+ *     "open until" timestamp; until expiry, route directly to fallback
			
 
				+ *     (skip primary entirely).
			
 
				+ *   - On expiry, retry primary opportunistically.
			
 
				+ *   - getModelId always reports the primary's id; getDimensions prefers the
			
 
				+ *     primary's value and otherwise uses the fallback's; dispose disposes both.
			
 
				+ */
			
 
				+
			
 
				+import type {
			
 
				+  EmbeddingProvider,
			
 
				+  ProviderEmbedOptions,
			
 
				+  ProviderEmbedding,
			
 
				+  ProviderHealth,
			
 
				+  ProviderKind,
			
 
				+} from "./provider.js";
			
 
				+import { CircuitOpenError } from "./openai.js";
			
 
				+
			
 
				+export type AutoFallbackProviderConfig = {
			
 
				+  primary: EmbeddingProvider;
			
 
				+  fallback: EmbeddingProvider;
			
 
				+  /**
			
 
				+   * Number of consecutive non-CircuitOpenError failures before we suppress
			
 
				+   * primary calls and route directly to fallback. Default: 3.
			
 
				+   */
			
 
				+  failureStreakThreshold?: number;
			
 
				+  /**
			
 
				+   * Time in ms to keep routing through fallback after the breaker opens.
			
 
				+   * Default: 5 minutes (matches `OpenAIEmbeddingsProvider`'s circuit duration).
			
 
				+   */
			
 
				+  cooldownMs?: number;
			
 
				+  /**
			
 
				+   * Optional WARN sink. Defaults to writing to `process.stderr` once per
			
 
				+   * routing transition (closed→open and open→closed).
			
 
				+   */
			
 
				+  warn?: (msg: string) => void;
			
 
				+  /** Custom clock for tests */
			
 
				+  now?: () => number;
			
 
				+};
			
 
				+
			
 
				+const DEFAULT_FAILURE_STREAK = 3;
			
 
				+const DEFAULT_COOLDOWN_MS = 5 * 60_000;
			
 
				+
			
 
				/**
				 * Default WARN sink: emits the message on stderr, terminated by a newline.
				 *
				 * @param msg text to write
				 */
				function defaultWarn(msg: string): void {
				  const line = `${msg}\n`;
				  process.stderr.write(line);
				}
			
 
				+
			
 
				+export type FallbackState = "primary" | "fallback";
			
 
				+
			
 
				/**
				 * Embedding provider that serves calls from `primary` and reroutes them to
				 * `fallback` while the primary is failing.
				 *
				 * Routing rules (implemented in `run`):
				 *   - primary succeeds → result returned; failure streak and cooldown cleared;
				 *   - primary throws `CircuitOpenError` → a cooldown opens immediately;
				 *   - primary throws anything else → the failure streak grows and a cooldown
				 *     opens once it reaches `failureStreakThreshold`;
				 *   - in every failure case the current call is retried on the fallback;
				 *   - while a cooldown is active the primary is not called at all.
				 *
				 * WARN messages come in pairs: "falling back" when a cooldown opens and
				 * "recovered" when routing returns to the primary afterwards. A one-off
				 * fallback that does not open a cooldown emits neither.
				 */
				export class AutoFallbackEmbeddingProvider implements EmbeddingProvider {
				  /** Copied from the primary so callers introspecting `provider.kind` see its kind. */
				  readonly kind: ProviderKind;
				  readonly primary: EmbeddingProvider;
				  readonly fallback: EmbeddingProvider;
				  private readonly failureStreakThreshold: number;
				  private readonly cooldownMs: number;
				  private readonly warn: (msg: string) => void;
				  private readonly now: () => number;

				  /** Consecutive non-`CircuitOpenError` primary failures since the last primary success. */
				  private failureStreak = 0;
				  /** Clock value at which the cooldown ends; `null` when none has been opened. */
				  private fallbackUntil: number | null = null;
				  /** Last state announced through `transition`; keeps each WARN to once per change. */
				  private lastTransitionState: FallbackState = "primary";

				  /**
				   * @param config primary and fallback providers plus optional tuning values
				   *   (`failureStreakThreshold`, `cooldownMs`, `warn`, `now`).
				   * @throws Error when `primary` or `fallback` is missing, or when both are
				   *   the same instance.
				   */
				  constructor(config: AutoFallbackProviderConfig) {
				    if (!config.primary) throw new Error("AutoFallbackEmbeddingProvider: primary is required");
				    if (!config.fallback) throw new Error("AutoFallbackEmbeddingProvider: fallback is required");
				    if (config.primary === config.fallback) {
				      throw new Error("AutoFallbackEmbeddingProvider: primary and fallback must differ");
				    }

				    this.primary = config.primary;
				    this.fallback = config.fallback;
				    // Inherit the primary's kind for callers introspecting `provider.kind`.
				    this.kind = config.primary.kind;
				    this.failureStreakThreshold = config.failureStreakThreshold ?? DEFAULT_FAILURE_STREAK;
				    this.cooldownMs = config.cooldownMs ?? DEFAULT_COOLDOWN_MS;
				    this.warn = config.warn ?? defaultWarn;
				    this.now = config.now ?? Date.now;
				  }

				  /** Model id of the primary; the fallback's id is never reported here. */
				  getModelId(): string {
				    return this.primary.getModelId();
				  }

				  /** Dimensions reported by the primary, or by the fallback when the primary has none. */
				  getDimensions(): number | undefined {
				    return this.primary.getDimensions() ?? this.fallback.getDimensions();
				  }

				  /** Current routing state: "fallback" while a cooldown is active, otherwise "primary". */
				  getRoutingState(): FallbackState {
				    return this.isInCooldown() ? "fallback" : "primary";
				  }

				  /**
				   * Clears the failure streak and any cooldown. Emits the "recovered" WARN
				   * if the last announced state was "fallback".
				   */
				  reset(): void {
				    this.failureStreak = 0;
				    this.fallbackUntil = null;
				    this.transition("primary");
				  }

				  /**
				   * Checks the primary first and only probes the fallback when the primary
				   * reports `ok: false`. If the primary's healthcheck rejects, that
				   * rejection propagates and the fallback is not probed.
				   *
				   * @param signal forwarded unchanged to both providers' healthchecks
				   * @returns the primary's health when ok; otherwise a combined report whose
				   *   `ok` reflects the fallback and whose `model` is the primary's id.
				   */
				  async healthcheck(signal?: AbortSignal): Promise<ProviderHealth> {
				    const primaryHealth = await this.primary.healthcheck(signal);
				    if (primaryHealth.ok) return primaryHealth;
				    const fallbackHealth = await this.fallback.healthcheck(signal);
				    return {
				      ok: fallbackHealth.ok,
				      model: this.primary.getModelId(),
				      dimensions: primaryHealth.dimensions ?? fallbackHealth.dimensions,
				      detail:
				        `primary: ${primaryHealth.detail ?? "fail"} | fallback: ${fallbackHealth.detail ?? (fallbackHealth.ok ? "ok" : "fail")}`,
				    };
				  }

				  /**
				   * Embeds one text through the routing logic.
				   *
				   * @throws the fallback's error when the fallback fails (after the primary
				   *   failed or was skipped).
				   */
				  async embed(
				    text: string,
				    options: ProviderEmbedOptions = {},
				  ): Promise<ProviderEmbedding | null> {
				    return this.run((p, opts) => p.embed(text, opts), options);
				  }

				  /**
				   * Embeds a batch through the routing logic. An empty input returns `[]`
				   * without touching either provider. When the fallback fails as well, the
				   * result is one `null` per input text instead of a rejection.
				   */
				  async embedBatch(
				    texts: string[],
				    options: ProviderEmbedOptions = {},
				  ): Promise<(ProviderEmbedding | null)[]> {
				    if (texts.length === 0) return [];
				    return this.run(
				      (p, opts) => p.embedBatch(texts, opts),
				      options,
				      () => texts.map(() => null),
				    );
				  }

				  /** Disposes both providers; a failure in one does not prevent disposing the other. */
				  async dispose(): Promise<void> {
				    await Promise.allSettled([this.primary.dispose(), this.fallback.dispose()]);
				  }

				  // ────────────────────── Internals ──────────────────────

				  /** True while a cooldown has been opened and the clock has not reached its end. */
				  private isInCooldown(): boolean {
				    return this.fallbackUntil !== null && this.now() < this.fallbackUntil;
				  }

				  /**
				   * Generic dispatcher: runs `op` on the primary unless a cooldown is
				   * active, and on the fallback whenever the primary fails or is skipped.
				   *
				   * @param op operation to run against the selected provider
				   * @param options passed through to `op` unchanged
				   * @param onTotalFail when given, its result replaces the fallback's error
				   */
				  private async run<T>(
				    op: (provider: EmbeddingProvider, opts: ProviderEmbedOptions) => Promise<T>,
				    options: ProviderEmbedOptions,
				    onTotalFail?: () => T,
				  ): Promise<T> {
				    if (this.isInCooldown()) {
				      // Cooldown active: skip the primary entirely.
				      this.transition("fallback");
				      return this.runFallback(op, options, onTotalFail);
				    }

				    try {
				      const result = await op(this.primary, options);
				      // Success: clear the streak and make sure routing reads "primary".
				      this.failureStreak = 0;
				      this.fallbackUntil = null;
				      this.transition("primary");
				      return result;
				    } catch (err) {
				      if (err instanceof CircuitOpenError) {
				        // The primary's own breaker is open: open our cooldown so that
				        // subsequent calls skip the primary.
				        this.openCooldown(`primary CircuitOpenError`);
				      } else {
				        this.failureStreak++;
				        if (this.failureStreak >= this.failureStreakThreshold) {
				          this.openCooldown(
				            `primary failure streak ${this.failureStreak} ≥ ${this.failureStreakThreshold}`,
				          );
				        }
				      }
				    }

				    // Record the "fallback" state only when a cooldown is really open.
				    // A one-off fallback below the threshold emitted no "falling back"
				    // WARN, so it must not cause an unpaired "recovered" WARN on the next
				    // primary success.
				    if (this.isInCooldown()) this.transition("fallback");

				    // Serve THIS call from the fallback regardless of the breaker state.
				    return this.runFallback(op, options, onTotalFail);
				  }

				  /**
				   * Runs `op` on the fallback. On failure returns `onTotalFail()` when it is
				   * provided, otherwise rethrows the fallback's error.
				   */
				  private async runFallback<T>(
				    op: (provider: EmbeddingProvider, opts: ProviderEmbedOptions) => Promise<T>,
				    options: ProviderEmbedOptions,
				    onTotalFail?: () => T,
				  ): Promise<T> {
				    try {
				      return await op(this.fallback, options);
				    } catch (fallbackErr) {
				      if (onTotalFail) return onTotalFail();
				      throw fallbackErr;
				    }
				  }

				  /**
				   * Opens a cooldown of `cooldownMs` and emits the "falling back" WARN.
				   * Does nothing while a previously opened cooldown is still running.
				   */
				  private openCooldown(reason: string): void {
				    if (this.fallbackUntil === null || this.now() >= this.fallbackUntil) {
				      this.fallbackUntil = this.now() + this.cooldownMs;
				      this.warn(
				        `[AutoFallbackEmbeddingProvider] WARN — falling back to "${this.fallback.kind}" provider for ${Math.round(this.cooldownMs / 1000)}s (reason: ${reason})`,
				      );
				    }
				  }

				  /**
				   * Records a routing-state change. Only the change to "primary" warns here;
				   * the change to "fallback" is announced by `openCooldown`.
				   */
				  private transition(to: FallbackState): void {
				    if (this.lastTransitionState === to) return;
				    this.lastTransitionState = to;
				    if (to === "primary") {
				      this.warn(
				        `[AutoFallbackEmbeddingProvider] WARN — primary "${this.primary.kind}" recovered, routing restored`,
				      );
				    }
				  }
				}
			
--- a/src/embedding/factory.ts
+++ b/src/embedding/factory.ts
@@ -21,6 +21,10 @@ import {
 
				   OpenAIEmbeddingsProvider,
			
 
				   type OpenAIProviderConfig,
			
 
				 } from "./openai.js";
			
 
				+import {
			
 
				+  AutoFallbackEmbeddingProvider,
			
 
				+  type AutoFallbackProviderConfig,
			
 
				+} from "./autofallback.js";
			
 
				 import type { EmbeddingProvider, ProviderKind } from "./provider.js";
			
 
				 
			
 
				 // ─────────────────────────── Config file ─────────────────────────────────────
			
@@ -34,6 +38,8 @@ export type EmbedProviderConfigFile = {
 
				     upstreamModel?: string;
			
 
				     batchSize?: number;
			
 
				     timeoutMs?: number;
			
 
				+    /** When true, wrap the openai provider in AutoFallback (local fallback). */
			
 
				+    autoFallback?: boolean;
			
 
				   };
			
 
				 };
			
 
				 
			
@@ -70,11 +76,30 @@ export type CreateEmbeddingProviderOptions = {
 
				   local?: LocalLlamaCppProviderConfig;
			
 
				   /** OpenAI-provider overrides — merged on top of env/config */
			
 
				   openai?: Partial<OpenAIProviderConfig>;
			
 
				+  /**
			
 
				+   * Wrap the chosen provider in `AutoFallbackEmbeddingProvider` so that a
			
 
				+   * remote outage transparently falls back to local llama.cpp. Default:
			
 
				+   * `false` — opt-in, since the wrapper requires both backends to be
			
 
				+   * available and the local one will warm node-llama-cpp on first call.
			
 
				+   *
			
 
				+   * Resolution: explicit `autoFallback` wins → env `QMD_EMBED_AUTO_FALLBACK`
			
 
				+   * (`1`/`true`/`yes` enable, `0`/`false`/`no` disable) → config-file `embedProvider.autoFallback` → false.
			
 
				+   *
			
 
				+   * Only applies when the resolved kind is `openai` (no fallback wrap when
			
 
				+   * the primary IS local already).
			
 
				+   */
			
 
				+  autoFallback?: boolean;
			
 
				+  /**
			
 
				+   * Override config for `AutoFallbackEmbeddingProvider` (failureStreak,
			
 
				+   * cooldownMs, etc.). Only used when `autoFallback` resolves true.
			
 
				+   * Primary + fallback are constructed automatically.
			
 
				+   */
			
 
				+  autoFallbackOverrides?: Omit<AutoFallbackProviderConfig, "primary" | "fallback">;
			
 
				   /**
			
 
				    * Custom env source (mostly for tests). Defaults to `process.env`.
			
 
				    * Read keys: QMD_EMBED_PROVIDER, QMD_EMBED_ENDPOINT, QMD_EMBED_API_KEY,
			
 
				    * QMD_EMBED_MODEL_ID, QMD_EMBED_UPSTREAM_MODEL, QMD_EMBED_BATCH_SIZE,
			
 
				-   * QMD_EMBED_TIMEOUT_MS.
			
 
				+   * QMD_EMBED_TIMEOUT_MS, QMD_EMBED_AUTO_FALLBACK.
			
 
				    */
			
 
				   env?: Record<string, string | undefined>;
			
 
				 };
			
@@ -165,7 +190,7 @@ export function createEmbeddingProvider(
 
				     parsePositiveInt(env.QMD_EMBED_TIMEOUT_MS) ??
			
 
				     cfg.embedProvider?.timeoutMs;
			
 
				 
			
 
				-  return new OpenAIEmbeddingsProvider({
			
 
				+  const openaiProvider = new OpenAIEmbeddingsProvider({
			
 
				     endpoint,
			
 
				     apiKey,
			
 
				     modelId,
			
@@ -177,6 +202,33 @@ export function createEmbeddingProvider(
 
				     sleep: opts.openai?.sleep,
			
 
				     now: opts.openai?.now,
			
 
				   });
			
 
				+
			
 
				+  // Should we wrap with AutoFallback? Resolution: arg → env → config → false.
			
 
				+  const autoFallback = resolveAutoFallback(opts, env, cfg);
			
 
				+  if (!autoFallback) return openaiProvider;
			
 
				+
			
 
				+  return new AutoFallbackEmbeddingProvider({
			
 
				+    primary: openaiProvider,
			
 
				+    fallback: new LocalLlamaCppProvider(opts.local ?? { modelId }),
			
 
				+    ...(opts.autoFallbackOverrides ?? {}),
			
 
				+  });
			
 
				+}
			
 
				+
			
 
				/**
				 * Decides whether the openai provider should be wrapped in AutoFallback.
				 *
				 * Precedence, first match wins:
				 *   1. `opts.autoFallback` when it is a boolean;
				 *   2. env `QMD_EMBED_AUTO_FALLBACK` (trimmed, case-insensitive):
				 *      `1`/`true`/`yes` → true, `0`/`false`/`no` → false, anything else is
				 *      ignored;
				 *   3. `cfg.embedProvider.autoFallback` when it is a boolean;
				 *   4. `false`.
				 *
				 * The parameters are typed structurally with only the fields read here, so
				 * any options, env, or config object carrying those fields is accepted.
				 *
				 * @param opts explicit caller options
				 * @param env environment variable map
				 * @param cfg parsed config file
				 * @returns whether auto-fallback wrapping is enabled
				 */
				function resolveAutoFallback(
				  opts: { autoFallback?: boolean },
				  env: Record<string, string | undefined>,
				  cfg: { embedProvider?: { autoFallback?: boolean } },
				): boolean {
				  if (typeof opts.autoFallback === "boolean") return opts.autoFallback;

				  switch (env.QMD_EMBED_AUTO_FALLBACK?.trim().toLowerCase()) {
				    case "1":
				    case "true":
				    case "yes":
				      return true;
				    case "0":
				    case "false":
				    case "no":
				      return false;
				    default:
				      // Unset or unrecognised value: defer to the config file.
				      break;
				  }

				  const fromConfig = cfg.embedProvider?.autoFallback;
				  return typeof fromConfig === "boolean" ? fromConfig : false;
				}
			
 
				 
			
 
				 // ─────────────────────────── Helpers ────────────────────────────────────────
			
--- a/src/embedding/index.ts
+++ b/src/embedding/index.ts
@@ -39,3 +39,9 @@ export {
 
				   type CreateEmbeddingProviderOptions,
			
 
				   type EmbedProviderConfigFile,
			
 
				 } from "./factory.js";
			
 
				+
			
 
				+export {
			
 
				+  AutoFallbackEmbeddingProvider,
			
 
				+  type AutoFallbackProviderConfig,
			
 
				+  type FallbackState,
			
 
				+} from "./autofallback.js";
			
--- a/test/embedding-autofallback.test.ts
+++ b/test/embedding-autofallback.test.ts
@@ -0,0 +1,355 @@
 
				+/**
			
 
				+ * embedding-autofallback.test.ts - Tests for AutoFallbackEmbeddingProvider.
			
 
				+ */
			
 
				+
			
 
				+import { describe, test, expect } from "vitest";
			
 
				+import {
			
 
				+  AutoFallbackEmbeddingProvider,
			
 
				+  type AutoFallbackProviderConfig,
			
 
				+} from "../src/embedding/autofallback.js";
			
 
				+import { CircuitOpenError } from "../src/embedding/openai.js";
			
 
				+import type {
			
 
				+  EmbeddingProvider,
			
 
				+  ProviderEmbedOptions,
			
 
				+  ProviderEmbedding,
			
 
				+  ProviderHealth,
			
 
				+  ProviderKind,
			
 
				+} from "../src/embedding/provider.js";
			
 
				+
			
 
				+// ─────────────────────────── Test fakes ──────────────────────────────────────
			
 
				+
			
 
				/**
				 * Scriptable in-memory `EmbeddingProvider` for the AutoFallback tests.
				 * Counts every call, and can be told to throw on upcoming calls
				 * (`nextThrows`), on every call (`alwaysThrows`), or to answer healthchecks
				 * with a canned response (`healthResponse`).
				 */
				class FakeProvider implements EmbeddingProvider {
				  embedCalls = 0;
				  embedBatchCalls = 0;
				  healthcheckCalls = 0;
				  disposed = false;
				  /** Queue consumed one entry per call; an Error entry is thrown, `null` lets the call pass. */
				  nextThrows: Array<Error | null> = [];
				  /** When set, every embed/embedBatch call throws this error. */
				  alwaysThrows: Error | null = null;
				  /** When set, returned verbatim from `healthcheck`. */
				  healthResponse: ProviderHealth | null = null;

				  constructor(
				    readonly kind: ProviderKind,
				    readonly modelId: string,
				    readonly dim = 4,
				  ) {}

				  getModelId(): string {
				    return this.modelId;
				  }

				  getDimensions(): number | undefined {
				    return this.dim;
				  }

				  async healthcheck(): Promise<ProviderHealth> {
				    this.healthcheckCalls += 1;
				    return this.healthResponse ?? { ok: true, model: this.modelId, dimensions: this.dim };
				  }

				  async embed(text: string, _options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null> {
				    this.embedCalls += 1;
				    this.maybeThrow();
				    return this.toEmbedding(text);
				  }

				  async embedBatch(texts: string[], _options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]> {
				    this.embedBatchCalls += 1;
				    this.maybeThrow();
				    return texts.map((entry) => this.toEmbedding(entry));
				  }

				  async dispose(): Promise<void> {
				    this.disposed = true;
				  }

				  /** Throws `alwaysThrows` if set; otherwise consumes and throws the next queued error, if any. */
				  private maybeThrow(): void {
				    // `||` short-circuits, so the queue is left untouched in always-throw mode.
				    const failure = this.alwaysThrows || this.nextThrows.shift();
				    if (failure) throw failure;
				  }

				  /** Wraps the deterministic fake vector for `text` in a provider result. */
				  private toEmbedding(text: string): ProviderEmbedding {
				    return { embedding: this.fakeEmbed(text), model: this.modelId };
				  }

				  /** Deterministic vector of `dim` values derived from the text length. */
				  private fakeEmbed(text: string): number[] {
				    const vector: number[] = [];
				    for (let i = 0; i < this.dim; i++) {
				      vector.push((text.length + i) * 0.01);
				    }
				    return vector;
				  }
				}
			
 
				+
			
 
				/**
				 * Builds an AutoFallback provider wired to two `FakeProvider`s, a captured
				 * WARN list and a controllable clock that starts at 1_000_000.
				 *
				 * Defaults: threshold 3, cooldown 60_000 ms. Every key present in `opts` is
				 * spread last and therefore overrides the value set here.
				 *
				 * @returns the provider, both fakes, the collected warnings and a clock setter
				 */
				function buildAutoFallback(opts: Partial<AutoFallbackProviderConfig> = {}): {
				  af: AutoFallbackEmbeddingProvider;
				  primary: FakeProvider;
				  fallback: FakeProvider;
				  warns: string[];
				  setNow: (n: number) => void;
				} {
				  const warns: string[] = [];
				  let clock = 1_000_000;

				  const primary = new FakeProvider("openai", "embeddinggemma");
				  const fallback = new FakeProvider("local", "embeddinggemma");

				  const config: AutoFallbackProviderConfig = {
				    primary,
				    fallback,
				    failureStreakThreshold: opts.failureStreakThreshold ?? 3,
				    cooldownMs: opts.cooldownMs ?? 60_000,
				    warn: (message) => warns.push(message),
				    now: () => clock,
				    ...opts,
				  };
				  const af = new AutoFallbackEmbeddingProvider(config);

				  const setNow = (n: number) => (clock = n);
				  return { af, primary, fallback, warns, setNow };
				}
			
 
				+
			
 
				+// ─────────────────────────── Construction ────────────────────────────────────
			
 
				+
			
 
				// Constructor guards and the `kind` inherited from the primary.
				describe("AutoFallbackEmbeddingProvider — construction", () => {
				  test("requires primary", () => {
				    const construct = () =>
				      new AutoFallbackEmbeddingProvider({
				        // @ts-expect-error testing runtime guard
				        primary: undefined,
				        fallback: new FakeProvider("local", "x"),
				      });
				    expect(construct).toThrow(/primary is required/);
				  });

				  test("requires fallback", () => {
				    const construct = () =>
				      new AutoFallbackEmbeddingProvider({
				        primary: new FakeProvider("openai", "x"),
				        // @ts-expect-error testing runtime guard
				        fallback: undefined,
				      });
				    expect(construct).toThrow(/fallback is required/);
				  });

				  test("rejects identical primary and fallback", () => {
				    const shared = new FakeProvider("openai", "x");
				    const construct = () =>
				      new AutoFallbackEmbeddingProvider({ primary: shared, fallback: shared });
				    expect(construct).toThrow(/must differ/);
				  });

				  test("inherits primary's kind", () => {
				    const harness = buildAutoFallback();
				    expect(harness.af.kind).toBe("openai");
				  });
				});
			
 
				+
			
 
				+// ─────────────────────────── Happy path ──────────────────────────────────────
			
 
				+
			
 
				// With a healthy primary, the fallback must never be touched.
				describe("AutoFallbackEmbeddingProvider — happy path", () => {
				  test("primary succeeds → fallback never called", async () => {
				    const harness = buildAutoFallback();
				    const embedded = await harness.af.embed("hello");
				    expect(embedded).not.toBeNull();
				    expect(harness.primary.embedCalls).toBe(1);
				    expect(harness.fallback.embedCalls).toBe(0);
				    expect(harness.af.getRoutingState()).toBe("primary");
				  });

				  test("primary embedBatch succeeds → fallback untouched", async () => {
				    const harness = buildAutoFallback();
				    const batch = await harness.af.embedBatch(["a", "b"]);
				    expect(batch).toHaveLength(2);
				    expect(harness.primary.embedBatchCalls).toBe(1);
				    expect(harness.fallback.embedBatchCalls).toBe(0);
				  });

				  test("getModelId / getDimensions delegate to primary", () => {
				    const harness = buildAutoFallback();
				    const expectedModel = harness.primary.getModelId();
				    const expectedDims = harness.primary.getDimensions();
				    expect(harness.af.getModelId()).toBe(expectedModel);
				    expect(harness.af.getDimensions()).toBe(expectedDims);
				  });
				});
			
 
				+
			
 
				+// ─────────────────────────── Circuit-open fallback ───────────────────────────
			
 
				+
			
 
				// A CircuitOpenError from the primary opens the cooldown straight away.
				describe("AutoFallbackEmbeddingProvider — CircuitOpenError handling", () => {
				  test("primary throws CircuitOpenError → fallback served + cooldown opens", async () => {
				    const harness = buildAutoFallback();
				    harness.primary.nextThrows.push(new CircuitOpenError());

				    const served = await harness.af.embed("hello");

				    expect(served).not.toBeNull();
				    expect(served?.embedding.length).toBe(4); // came from fallback
				    expect(harness.primary.embedCalls).toBe(1);
				    expect(harness.fallback.embedCalls).toBe(1);
				    expect(harness.af.getRoutingState()).toBe("fallback");
				    const mentionsCircuit = harness.warns.some((line) => line.includes("CircuitOpenError"));
				    expect(mentionsCircuit).toBe(true);
				  });

				  test("during cooldown subsequent calls skip primary entirely", async () => {
				    const harness = buildAutoFallback();
				    harness.primary.nextThrows.push(new CircuitOpenError());

				    await harness.af.embed("first");
				    expect(harness.primary.embedCalls).toBe(1);
				    expect(harness.fallback.embedCalls).toBe(1);

				    // Second call happens inside the cooldown window.
				    await harness.af.embed("second");
				    expect(harness.primary.embedCalls).toBe(1); // unchanged
				    expect(harness.fallback.embedCalls).toBe(2);
				  });

				  test("after cooldown expires, primary is retried", async () => {
				    const harness = buildAutoFallback({ cooldownMs: 5000 });
				    harness.primary.nextThrows.push(new CircuitOpenError());

				    await harness.af.embed("a");
				    expect(harness.af.getRoutingState()).toBe("fallback");

				    harness.setNow(1_000_000 + 5_001);
				    expect(harness.af.getRoutingState()).toBe("primary");

				    // The next call reaches the primary again.
				    await harness.af.embed("b");
				    expect(harness.primary.embedCalls).toBe(2);
				    expect(harness.fallback.embedCalls).toBe(1);
				  });

				  test("WARN fired only once per transition (not per call during cooldown)", async () => {
				    const harness = buildAutoFallback();
				    harness.primary.nextThrows.push(new CircuitOpenError());

				    for (const text of ["a", "b", "c"]) {
				      await harness.af.embed(text);
				    }

				    const fallbackWarns = harness.warns.filter((line) => line.includes("falling back"));
				    expect(fallbackWarns).toHaveLength(1);
				  });
				});
			
 
				+
			
 
				+// ─────────────────────────── Failure-streak threshold ────────────────────────
			
 
				+
			
 
				// Ordinary errors only open the cooldown after `failureStreakThreshold` in a row.
				describe("AutoFallbackEmbeddingProvider — failure streak", () => {
				  test("non-CircuitOpen errors below threshold → no cooldown", async () => {
				    const harness = buildAutoFallback({ failureStreakThreshold: 3 });
				    harness.primary.nextThrows.push(new Error("transient"));

				    const served = await harness.af.embed("a");

				    expect(served).not.toBeNull(); // fallback served it
				    expect(harness.af.getRoutingState()).toBe("primary");
				    expect(harness.primary.embedCalls).toBe(1);
				    expect(harness.fallback.embedCalls).toBe(1);
				  });

				  test("threshold consecutive failures → cooldown opens", async () => {
				    const harness = buildAutoFallback({ failureStreakThreshold: 3 });
				    harness.primary.nextThrows.push(...[0, 1, 2].map((i) => new Error(`err ${i}`)));

				    for (const text of ["a", "b", "c"]) {
				      await harness.af.embed(text);
				    }

				    expect(harness.af.getRoutingState()).toBe("fallback");
				    expect(harness.primary.embedCalls).toBe(3);
				    expect(harness.fallback.embedCalls).toBe(3);
				  });

				  test("a single primary success resets the streak", async () => {
				    const harness = buildAutoFallback({ failureStreakThreshold: 3 });

				    // Two failures, then one success.
				    harness.primary.nextThrows.push(new Error("e1"), new Error("e2"));
				    await harness.af.embed("a");
				    await harness.af.embed("b");
				    await harness.af.embed("c");

				    // Streak was reset, so two more failures stay below the threshold.
				    harness.primary.nextThrows.push(new Error("e3"), new Error("e4"));
				    await harness.af.embed("d");
				    await harness.af.embed("e");

				    expect(harness.af.getRoutingState()).toBe("primary");
				  });
				});
			
 
				+
			
 
				+// ─────────────────────────── Recovery transition ─────────────────────────────
			
 
				+
			
 
				// Returning to the primary, either through expiry plus success or through reset().
				describe("AutoFallbackEmbeddingProvider — recovery transitions", () => {
				  test("recovery WARN fires when primary call succeeds after fallback", async () => {
				    const harness = buildAutoFallback({ cooldownMs: 5000 });
				    harness.primary.nextThrows.push(new CircuitOpenError());

				    await harness.af.embed("a");
				    harness.setNow(1_000_000 + 5_001);
				    await harness.af.embed("b"); // primary succeeds

				    const recoveryWarns = harness.warns.filter((line) => line.includes("recovered"));
				    expect(recoveryWarns).toHaveLength(1);
				  });

				  test("reset() clears state + transitions back to primary", async () => {
				    const harness = buildAutoFallback({ cooldownMs: 60_000 });
				    harness.primary.nextThrows.push(new CircuitOpenError());

				    await harness.af.embed("a");
				    expect(harness.af.getRoutingState()).toBe("fallback");

				    harness.af.reset();
				    expect(harness.af.getRoutingState()).toBe("primary");

				    await harness.af.embed("b");
				    expect(harness.primary.embedCalls).toBe(2);
				  });
				});
			
 
				+
			
 
				+// ─────────────────────────── Both fail ───────────────────────────────────────
			
 
				+
			
 
				// When both backends fail: embedBatch yields nulls, embed rejects.
				describe("AutoFallbackEmbeddingProvider — both providers fail", () => {
				  test("primary throws + fallback throws → embedBatch returns nulls", async () => {
				    const harness = buildAutoFallback();
				    harness.primary.alwaysThrows = new Error("primary down");
				    harness.fallback.alwaysThrows = new Error("local broken");

				    const batch = await harness.af.embedBatch(["a", "b"]);

				    expect(batch).toEqual([null, null]);
				  });

				  test("primary throws + fallback throws → embed propagates fallback error", async () => {
				    const harness = buildAutoFallback();
				    harness.primary.alwaysThrows = new Error("primary down");
				    harness.fallback.alwaysThrows = new Error("local broken");

				    const pending = harness.af.embed("a");

				    await expect(pending).rejects.toThrow(/local broken/);
				  });
				});
			
 
				+
			
 
				+// ─────────────────────────── Healthcheck ─────────────────────────────────────
			
 
				+
			
 
				// healthcheck probes the fallback only when the primary reports ok=false.
				describe("AutoFallbackEmbeddingProvider — healthcheck", () => {
				  test("primary healthy → returns primary health", async () => {
				    const harness = buildAutoFallback();

				    const health = await harness.af.healthcheck();

				    expect(health.ok).toBe(true);
				    expect(harness.primary.healthcheckCalls).toBe(1);
				    expect(harness.fallback.healthcheckCalls).toBe(0);
				  });

				  test("primary unhealthy → fallback checked + reported", async () => {
				    const harness = buildAutoFallback();
				    harness.primary.healthResponse = { ok: false, model: "primary-model", detail: "down" };
				    harness.fallback.healthResponse = { ok: true, model: "local-model", detail: "fine" };

				    const health = await harness.af.healthcheck();

				    expect(health.ok).toBe(true);
				    expect(harness.primary.healthcheckCalls).toBe(1);
				    expect(harness.fallback.healthcheckCalls).toBe(1);
				    for (const label of ["primary", "fallback"]) {
				      expect(health.detail).toContain(label);
				    }
				  });

				  test("both unhealthy → ok=false", async () => {
				    const harness = buildAutoFallback();
				    harness.primary.healthResponse = { ok: false, model: "p", detail: "down" };
				    harness.fallback.healthResponse = { ok: false, model: "f", detail: "down" };

				    const health = await harness.af.healthcheck();

				    expect(health.ok).toBe(false);
				  });
				});
			
 
				+
			
 
				+// ─────────────────────────── dispose ─────────────────────────────────────────
			
 
				+
			
 
				// dispose() must reach both wrapped providers.
				describe("AutoFallbackEmbeddingProvider — dispose", () => {
				  test("dispose cascades to both providers", async () => {
				    const harness = buildAutoFallback();

				    await harness.af.dispose();

				    const disposedFlags = [harness.primary.disposed, harness.fallback.disposed];
				    expect(disposedFlags[0]).toBe(true);
				    expect(disposedFlags[1]).toBe(true);
				  });
				});
			
--- a/test/embedding-live-parity.bench.ts
+++ b/test/embedding-live-parity.bench.ts
@@ -0,0 +1,154 @@
 
				+/**
			
 
				+ * embedding-live-parity.bench.ts - LIVE benchmark vs qmd-embed-worker.
			
 
				+ *
			
 
				+ * NOT a vitest test (uses .bench.ts suffix to skip auto-discovery).
			
 
				+ * Run manually with `bun src/test-preload.ts test/embedding-live-parity.bench.ts`
			
 
				+ * or `npx tsx test/embedding-live-parity.bench.ts`.
			
 
				+ *
			
 
				+ * Pre-req: `QMD_EMBED_ENDPOINT=http://10.0.2.162:8082` (or any reachable
			
 
				+ * qmd-embed-worker / ai.mm.mk endpoint with the embeddinggemma model loaded).
			
 
				+ *
			
 
				+ * What it measures:
			
 
				+ *   1. Healthcheck — confirm endpoint is up + reports expected model
			
 
				+ *   2. Single-text embed parity — same text via OpenAIEmbeddingsProvider
			
 
				+ *      vs LocalLlamaCppProvider, measure cosine similarity (target ≥0.999)
			
 
				+ *   3. Batch perf — embed 100 texts via HTTP and report throughput
			
 
				+ *
			
 
				+ * Local llama-cpp is OPTIONAL — set QMD_BENCH_SKIP_LOCAL=1 to skip parity
			
 
				+ * (only useful on machines without GPU/CPU model build support, like `code`
			
 
				+ * where Vulkan compilation fails).
			
 
				+ */
			
 
				+
			
 
				+import { OpenAIEmbeddingsProvider } from "../src/embedding/openai.js";
			
 
				+import { LocalLlamaCppProvider } from "../src/embedding/local.js";
			
 
				+
			
 
				+const ENDPOINT =
			
 
				+  process.env.QMD_EMBED_ENDPOINT?.trim() || "http://10.0.2.162:8082";
			
 
				+const MODEL_ID = process.env.QMD_EMBED_MODEL_ID?.trim() || "embeddinggemma";
			
 
				+const UPSTREAM_MODEL =
			
 
				+  process.env.QMD_EMBED_UPSTREAM_MODEL?.trim() || "embeddinggemma:300m";
			
 
				+const SKIP_LOCAL = process.env.QMD_BENCH_SKIP_LOCAL === "1";
			
 
				+const N_PERF = Number.parseInt(process.env.QMD_BENCH_N ?? "100", 10);
			
 
				+
			
 
				+function cosine(a: number[], b: number[]): number {
			
 
				+  if (a.length !== b.length) return 0;
			
 
				+  let dot = 0;
			
 
				+  let na = 0;
			
 
				+  let nb = 0;
			
 
				+  for (let i = 0; i < a.length; i++) {
			
 
				+    dot += a[i]! * b[i]!;
			
 
				+    na += a[i]! * a[i]!;
			
 
				+    nb += b[i]! * b[i]!;
			
 
				+  }
			
 
				+  return dot / (Math.sqrt(na) * Math.sqrt(nb) + 1e-12);
			
 
				+}
			
 
				+
			
 
				+function fmtMs(ms: number): string {
			
 
				+  if (ms < 1000) return `${ms.toFixed(0)}ms`;
			
 
				+  return `${(ms / 1000).toFixed(2)}s`;
			
 
				+}
			
 
				+
			
 
				+async function main() {
			
 
				+  console.log(`╭─ qmd embedding live benchmark ──────────────────╮`);
			
 
				+  console.log(`│ endpoint: ${ENDPOINT}`);
			
 
				+  console.log(`│ model:    ${MODEL_ID} (upstream=${UPSTREAM_MODEL})`);
			
 
				+  console.log(`│ n_perf:   ${N_PERF}`);
			
 
				+  console.log(`│ skip local: ${SKIP_LOCAL ? "YES" : "no"}`);
			
 
				+  console.log(`╰─────────────────────────────────────────────────╯\n`);
			
 
				+
			
 
				+  // ─── Step 1: healthcheck ────────────────────────────────────────────────
			
 
				+  const provider = new OpenAIEmbeddingsProvider({
			
 
				+    endpoint: ENDPOINT,
			
 
				+    modelId: MODEL_ID,
			
 
				+    upstreamModel: UPSTREAM_MODEL,
			
 
				+    timeoutMs: 30_000,
			
 
				+  });
			
 
				+
			
 
				+  console.log("[1/3] Healthcheck...");
			
 
				+  const health = await provider.healthcheck();
			
 
				+  console.log(`  → ok=${health.ok}, model=${health.model}, dims=${health.dimensions ?? "?"}`);
			
 
				+  console.log(`  → detail: ${health.detail ?? "-"}\n`);
			
 
				+  if (!health.ok) {
			
 
				+    console.error("✗ Healthcheck failed; aborting.");
			
 
				+    process.exit(1);
			
 
				+  }
			
 
				+
			
 
				+  // ─── Step 2: parity (HTTP vs local) ─────────────────────────────────────
			
 
				+  const sampleTexts = [
			
 
				+    "task: search result | query: hybrid search architecture",
			
 
				+    "title: README | text: QMD is a hybrid search engine combining BM25 and vector embeddings.",
			
 
				+    "title: Configuration | text: The retry schedule for 429 responses is 1s, 4s, 16s with up to 3 attempts.",
			
 
				+  ];
			
 
				+
			
 
				+  console.log("[2/3] HTTP embed parity check...");
			
 
				+  const httpStart = Date.now();
			
 
				+  const httpResults = await provider.embedBatch(sampleTexts);
			
 
				+  const httpMs = Date.now() - httpStart;
			
 
				+  console.log(`  → HTTP embedded ${httpResults.length} texts in ${fmtMs(httpMs)}`);
			
 
				+  for (let i = 0; i < httpResults.length; i++) {
			
 
				+    const r = httpResults[i];
			
 
				+    console.log(`    [${i}] dim=${r?.embedding.length ?? "null"}, model="${r?.model}"`);
			
 
				+  }
			
 
				+
			
 
				+  if (!SKIP_LOCAL) {
			
 
				+    try {
			
 
				+      console.log("\n  → Trying local llama-cpp comparison (may build models on first run)...");
			
 
				+      const local = new LocalLlamaCppProvider({ modelId: MODEL_ID });
			
 
				+      const localStart = Date.now();
			
 
				+      const localResults = await local.embedBatch(sampleTexts);
			
 
				+      const localMs = Date.now() - localStart;
			
 
				+      console.log(`  → LOCAL embedded ${localResults.length} texts in ${fmtMs(localMs)}`);
			
 
				+
			
 
				+      console.log("\n  Cosine similarity (HTTP vs local):");
			
 
				+      let allPass = true;
			
 
				+      for (let i = 0; i < sampleTexts.length; i++) {
			
 
				+        const a = httpResults[i]?.embedding;
			
 
				+        const b = localResults[i]?.embedding;
			
 
				+        if (!a || !b) {
			
 
				+          console.log(`    [${i}] SKIP — null result`);
			
 
				+          continue;
			
 
				+        }
			
 
				+        const c = cosine(a, b);
			
 
				+        const ok = c >= 0.999;
			
 
				+        if (!ok) allPass = false;
			
 
				+        console.log(
			
 
				+          `    [${i}] cos=${c.toFixed(6)} ${ok ? "✓" : "✗ (target ≥0.999)"}`,
			
 
				+        );
			
 
				+      }
			
 
				+      console.log(allPass ? "  ✓ Parity PASS" : "  ✗ Parity FAIL");
			
 
				+
			
 
				+      await local.dispose();
			
 
				+    } catch (err) {
			
 
				+      console.log(`  → Local comparison skipped: ${err instanceof Error ? err.message : err}`);
			
 
				+    }
			
 
				+  } else {
			
 
				+    console.log("  → Local comparison skipped (QMD_BENCH_SKIP_LOCAL=1)");
			
 
				+  }
			
 
				+
			
 
				+  // ─── Step 3: throughput / perf benchmark ────────────────────────────────
			
 
				+  console.log(`\n[3/3] Performance: embedding ${N_PERF} chunks via HTTP...`);
			
 
				+  const texts: string[] = [];
			
 
				+  for (let i = 0; i < N_PERF; i++) {
			
 
				+    texts.push(
			
 
				+      `title: doc-${i} | text: This is sample text number ${i} containing words like search, embedding, vector, retrieval, similarity, ranking.`,
			
 
				+    );
			
 
				+  }
			
 
				+  const perfStart = Date.now();
			
 
				+  const perfResults = await provider.embedBatch(texts);
			
 
				+  const perfMs = Date.now() - perfStart;
			
 
				+  const okCount = perfResults.filter((r) => r !== null).length;
			
 
				+  console.log(
			
 
				+    `  → ${okCount}/${N_PERF} embedded in ${fmtMs(perfMs)} (${(N_PERF / (perfMs / 1000)).toFixed(1)} chunks/s)`,
			
 
				+  );
			
 
				+  console.log(
			
 
				+    `  → average per chunk: ${(perfMs / N_PERF).toFixed(2)}ms`,
			
 
				+  );
			
 
				+  console.log(`\nDone. ✓`);
			
 
				+
			
 
				+  await provider.dispose();
			
 
				+}
			
 
				+
			
 
				+main().catch((err) => {
			
 
				+  console.error("Benchmark failed:", err);
			
 
				+  process.exit(1);
			
 
				+});