/**
 * openai.ts - OpenAI-compatible HTTP embedding provider
 *
 * Talks to any endpoint that implements `POST /v1/embeddings` with the OpenAI
 * shape: request `{model, input: string|string[]}`, response
 * `{data: [{embedding: number[], index: number}, ...]}`.
 *
 * Used by qmd to delegate embeddings to a GPU worker (e.g. ai.mm.mk →
 * qmd-embed-worker on `models` LXC, RTX 4090) instead of running
 * node-llama-cpp locally.
 *
 * Features:
 *  - Batches input in groups of ≤64 (configurable via QMD_EMBED_BATCH_SIZE)
 *  - Retries 429 / 503 with exponential backoff (1s, 4s, 16s)
 *  - 4xx (non-429) → no retry, count as failure
 *  - Circuit breaker: >50% failures in a 60s window → OPEN for 5 min,
 *    callers can use this to fall back to a local provider
 *  - Per-call timeout via AbortSignal (default QMD_EMBED_TIMEOUT_MS=30000)
 *  - Healthcheck via `GET /health` if available, else a probe embed call
 */
import type { EmbeddingProvider, ProviderEmbedOptions, ProviderEmbedding, ProviderHealth, ProviderKind } from "./provider.js";

/**
 * Default batch size — most OpenAI-compatible embedding endpoints accept up to
 * 2048 inputs per call, but for memory and latency we cap at 64.
 */
export declare const DEFAULT_BATCH_SIZE = 64;

/**
 * Default in-flight concurrency cap for `embedBatch`. The qmd-embed-worker
 * exposes a 4-way semaphore (`MAX_CONCURRENT_REQUESTS=4`) and idles at
 * queue-depth 1.0 under sequential clients (i-fkpnar9i baseline). Defaulting
 * to 4 matches the worker's advertised concurrency without overshooting the
 * GPU. Override per-deploy via `QMD_EMBED_CONCURRENCY`. Setting to 1 reverts
 * to the legacy sequential dispatch.
 */
export declare const DEFAULT_CONCURRENCY = 4;

/**
 * Default per-request timeout (30 s). embeddinggemma-300M on RTX 4090 takes
 * <500ms per batch of 64 in practice; 30s is a safe upper bound.
 */
export declare const DEFAULT_TIMEOUT_MS = 30000;

/**
 * Retry backoff schedule (ms) for 429/503 responses. 3 attempts total
 * (initial + 2 retries) — aligns with issue spec "1s/4s/16s".
 *
 * NOTE(review): a three-delay schedule (1s/4s/16s) would normally imply up to
 * four attempts (initial + 3 retries), not three — confirm the attempt count
 * against the implementation and correct whichever statement is stale.
 */
export declare const RETRY_BACKOFFS_MS: readonly number[];

/**
 * Circuit breaker — flips OPEN when error rate exceeds threshold within
 * window. While OPEN, every call fails fast so the caller can fall back.
 */
export declare const CIRCUIT_WINDOW_MS = 60000;
export declare const CIRCUIT_OPEN_DURATION_MS: number;
export declare const CIRCUIT_FAILURE_RATE_THRESHOLD = 0.5;
export declare const CIRCUIT_MIN_SAMPLES = 4;

/** Construction options for {@link OpenAIEmbeddingsProvider}. */
export type OpenAIProviderConfig = {
    /** Endpoint base URL — e.g. "https://ai.mm.mk" (no trailing slash) */
    endpoint: string;
    /** Optional bearer token sent as `Authorization: Bearer ...` */
    apiKey?: string;
    /**
     * Stable model identifier to report up via `getModelId()`.
     * Defaults to "embeddinggemma" to match qmd's existing DB rows.
     */
    modelId?: string;
    /**
     * Upstream model name sent in the HTTP request body. Often differs from
     * `modelId` (e.g. modelId="embeddinggemma" but upstream model="embeddinggemma:300m").
     */
    upstreamModel?: string;
    /** Batch size cap (default DEFAULT_BATCH_SIZE = 64) */
    batchSize?: number;
    /**
     * Max in-flight HTTP requests during a single `embedBatch` call. Default
     * `DEFAULT_CONCURRENCY=4` matches the worker semaphore. Set to 1 to force
     * legacy sequential dispatch (useful for benchmarks / regression bisect).
     */
    concurrency?: number;
    /** Per-request timeout in ms (default DEFAULT_TIMEOUT_MS = 30_000) */
    timeoutMs?: number;
    /** Custom fetch (for testing). Defaults to global `fetch`. */
    fetchImpl?: typeof fetch;
    /** Custom retry schedule (for testing). Defaults to RETRY_BACKOFFS_MS. */
    retryBackoffsMs?: readonly number[];
    /** Custom sleep impl (for testing). Defaults to setTimeout. */
    sleep?: (ms: number) => Promise<void>;
    /** Custom clock (for testing). Defaults to Date.now. */
    now?: () => number;
};

/** Response body shape expected from `POST /v1/embeddings`. */
export type OpenAIEmbeddingsResponse = {
    object?: string;
    model?: string;
    data: Array<{
        object?: string;
        index: number;
        embedding: number[];
    }>;
    usage?: {
        prompt_tokens?: number;
        total_tokens?: number;
    };
};

/**
 * Circuit breaker state — exported for tests
 */
export type CircuitState = "closed" | "open" | "half-open";

/**
 * Determine whether an HTTP status is retryable. 429 (Too Many Requests)
 * and 503 (Service Unavailable) are retried; 4xx (other than 429) are not.
 */
export declare function isRetryableStatus(status: number): boolean;

/**
 * Chunk an array into pieces of ≤ size each. `size` MUST be ≥ 1.
 *
 * @typeParam T - Element type, carried through unchanged to every chunk.
 */
export declare function chunkArray<T>(items: T[], size: number): T[][];

/**
 * Sliding-window circuit breaker. Tracks the last N samples (min 4) over a
 * 60-second window; flips OPEN when failure rate exceeds 50%, then auto-
 * resets to HALF-OPEN after 5 minutes — at which point the next probe
 * decides whether to close (success) or re-open (failure).
 */
export declare class CircuitBreaker {
    private samples;
    private state;
    private openedAt;
    private readonly windowMs;
    private readonly openDurationMs;
    private readonly threshold;
    private readonly minSamples;
    private readonly now;
    /**
     * @param opts - Optional overrides for the window, open duration, failure
     *   threshold, minimum sample count and clock. Presumably each falls back
     *   to the matching `CIRCUIT_*` constant / `Date.now` — confirm against
     *   the implementation.
     */
    constructor(opts?: {
        windowMs?: number;
        openDurationMs?: number;
        threshold?: number;
        minSamples?: number;
        now?: () => number;
    });
    /** Current breaker state. */
    getState(): CircuitState;
    /**
     * Returns true when calls should be short-circuited (skip HTTP, fall back).
     * Side-effects: may transition OPEN → HALF-OPEN if the open window expired.
     */
    shouldFailFast(): boolean;
    /** Record a successful call. */
    recordSuccess(): void;
    /** Record a failed call. May trigger OPEN. */
    recordFailure(): void;
    /** Force-reset the breaker (used by tests / admin) */
    reset(): void;
    private pushSample;
    private evaluate;
    private tickAutoReset;
}

/**
 * Raised when the circuit breaker is OPEN and a call is short-circuited.
 * Callers (e.g. fallback wrapper) can catch this to switch to local provider.
 */
export declare class CircuitOpenError extends Error {
    constructor(message?: string);
}

/**
 * Persistent (non-retryable) HTTP error from upstream. Includes status code.
 */
export declare class HttpError extends Error {
    readonly status: number;
    readonly bodyPreview: string;
    constructor(status: number, bodyPreview: string);
}

/**
 * {@link EmbeddingProvider} backed by an OpenAI-compatible
 * `POST /v1/embeddings` endpoint, with batching, retry and a circuit breaker
 * as described in the file header.
 */
export declare class OpenAIEmbeddingsProvider implements EmbeddingProvider {
    readonly kind: ProviderKind;
    private readonly endpoint;
    private readonly apiKey?;
    private readonly modelId;
    private readonly upstreamModel;
    private readonly batchSize;
    private readonly concurrency;
    private readonly timeoutMs;
    private readonly fetchImpl;
    private readonly retryBackoffsMs;
    private readonly sleep;
    private readonly now;
    private dimensions;
    private lastError;
    /** Circuit breaker instance; public so callers/tests can inspect or reset it. */
    readonly breaker: CircuitBreaker;
    constructor(config: OpenAIProviderConfig);
    /** Stable model identifier (see `OpenAIProviderConfig.modelId`). */
    getModelId(): string;
    /**
     * Embedding dimensionality, or `undefined` when not known.
     * NOTE(review): presumably populated after the first successful embed —
     * confirm against the implementation.
     */
    getDimensions(): number | undefined;
    /**
     * Most recent per-chunk failure message (HTTP status + body preview, malformed
     * JSON, timeout, abort reason). Returns `undefined` after a successful call
     * or before the first call. See `EmbeddingProvider.getLastError`.
     */
    getLastError(): string | undefined;
    /**
     * Endpoint URL configured at construction time — used by callers when
     * building error messages for failed first-chunk probes.
     */
    getEndpoint(): string;
    /**
     * Health probe: `GET /health` if available, else a probe embed call
     * (per the file header).
     *
     * @param signal - Optional abort signal for the probe.
     */
    healthcheck(signal?: AbortSignal): Promise<ProviderHealth>;
    /**
     * Embed a single text.
     * NOTE(review): the `| null` result mirrors the element type of
     * `embedBatch`; confirm it matches `EmbeddingProvider.embed`.
     */
    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
    /**
     * Embed many texts; the result array is positionally aligned with `texts`.
     * NOTE(review): presumably a `null` entry marks a text whose chunk failed —
     * confirm against the implementation.
     */
    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
    /** Release any resources held by the provider. */
    dispose(): Promise<void>;
    /**
     * Format a request-failure context string for `lastError`. Includes endpoint
     * + HTTP status + body preview when the error was an `HttpError`, otherwise
     * falls back to the message of the underlying error (or the value itself
     * when not an Error). Kept short — body preview is already capped at 1024
     * chars by `HttpError`, but we trim further here for the dimension-probe
     * thrown error which surfaces directly to users.
     */
    private formatErrorContext;
    private buildHeaders;
    /**
     * Single HTTP request with retry on 429/503. Returns embeddings indexed
     * the same as `texts`. Throws on non-retryable failure or all attempts
     * exhausted.
     */
    private requestWithRetry;
    /**
     * Issue one HTTP attempt to `POST /v1/embeddings`. Does NOT retry.
     */
    private requestOnce;
}