// openai.d.ts (8.6 KB)

/**
 * openai.ts - OpenAI-compatible HTTP embedding provider
 *
 * Talks to any endpoint that implements `POST /v1/embeddings` with the OpenAI
 * shape: request `{model, input: string|string[]}`, response
 * `{data: [{embedding: number[], index: number}, ...]}`.
 *
 * Used by qmd to delegate embeddings to a GPU worker (e.g. ai.mm.mk →
 * qmd-embed-worker on `models` LXC, RTX 4090) instead of running
 * node-llama-cpp locally.
 *
 * Features:
 * - Batches input in groups of ≤64 (configurable via QMD_EMBED_BATCH_SIZE)
 * - Retries 429 / 503 with exponential backoff (1s, 4s, 16s)
 * - 4xx (non-429) → no retry, count as failure
 * - Circuit breaker: >50% failures in a 60s window → OPEN for 5 min,
 *   callers can use this to fall back to a local provider
 * - Per-call timeout via AbortSignal (default QMD_EMBED_TIMEOUT_MS=30000)
 * - Healthcheck via `GET /health` if available, else a probe embed call
 */
import type { EmbeddingProvider, ProviderEmbedOptions, ProviderEmbedding, ProviderHealth, ProviderKind } from "./provider.js";
  22. /**
  23. * Default batch size — most OpenAI-compatible embedding endpoints accept up to
  24. * 2048 inputs per call but for memory and latency we cap at 64.
  25. */
  26. export declare const DEFAULT_BATCH_SIZE = 64;
  27. /**
  28. * Default in-flight concurrency cap for `embedBatch`. The qmd-embed-worker
  29. * exposes a 4-way semaphore (`MAX_CONCURRENT_REQUESTS=4`) and idles at
  30. * queue-depth 1.0 under sequential clients (i-fkpnar9i baseline). Defaulting
  31. * to 4 matches the worker's advertised concurrency without overshooting the
  32. * GPU. Override per-deploy via `QMD_EMBED_CONCURRENCY`. Setting to 1 reverts
  33. * to the legacy sequential dispatch.
  34. */
  35. export declare const DEFAULT_CONCURRENCY = 4;
  36. /**
  37. * Default per-request timeout (30 s). embeddinggemma-300M on RTX 4090 takes
  38. * <500ms per batch of 64 in practice; 30s is a safe upper bound.
  39. */
  40. export declare const DEFAULT_TIMEOUT_MS = 30000;
  41. /**
  42. * Retry backoff schedule (ms) for 429/503 responses. 3 attempts total
  43. * (initial + 2 retries) — aligns with issue spec "1s/4s/16s".
  44. */
  45. export declare const RETRY_BACKOFFS_MS: readonly number[];
  46. /**
  47. * Circuit breaker — flips OPEN when error rate exceeds threshold within
  48. * window. While OPEN, every call fails fast so the caller can fall back.
  49. */
  50. export declare const CIRCUIT_WINDOW_MS = 60000;
  51. export declare const CIRCUIT_OPEN_DURATION_MS: number;
  52. export declare const CIRCUIT_FAILURE_RATE_THRESHOLD = 0.5;
  53. export declare const CIRCUIT_MIN_SAMPLES = 4;
  54. export type OpenAIProviderConfig = {
  55. /** Endpoint base URL — e.g. "https://ai.mm.mk" (no trailing slash) */
  56. endpoint: string;
  57. /** Optional bearer token sent as `Authorization: Bearer ...` */
  58. apiKey?: string;
  59. /**
  60. * Stable model identifier to report up via `getModelId()`.
  61. * Defaults to "embeddinggemma" to match qmd's existing DB rows.
  62. */
  63. modelId?: string;
  64. /**
  65. * Upstream model name sent in the HTTP request body. Often differs from
  66. * `modelId` (e.g. modelId="embeddinggemma" but upstream model="embeddinggemma:300m").
  67. */
  68. upstreamModel?: string;
  69. /** Batch size cap (default DEFAULT_BATCH_SIZE = 64) */
  70. batchSize?: number;
  71. /**
  72. * Max in-flight HTTP requests during a single `embedBatch` call. Default
  73. * `DEFAULT_CONCURRENCY=4` matches the worker semaphore. Set to 1 to force
  74. * legacy sequential dispatch (useful for benchmarks / regression bisect).
  75. */
  76. concurrency?: number;
  77. /** Per-request timeout in ms (default DEFAULT_TIMEOUT_MS = 30_000) */
  78. timeoutMs?: number;
  79. /** Custom fetch (for testing). Defaults to global `fetch`. */
  80. fetchImpl?: typeof fetch;
  81. /** Custom retry schedule (for testing). Defaults to RETRY_BACKOFFS_MS. */
  82. retryBackoffsMs?: readonly number[];
  83. /** Custom sleep impl (for testing). Defaults to setTimeout. */
  84. sleep?: (ms: number) => Promise<void>;
  85. /** Custom clock (for testing). Defaults to Date.now. */
  86. now?: () => number;
  87. };
  88. export type OpenAIEmbeddingsResponse = {
  89. object?: string;
  90. model?: string;
  91. data: Array<{
  92. object?: string;
  93. index: number;
  94. embedding: number[];
  95. }>;
  96. usage?: {
  97. prompt_tokens?: number;
  98. total_tokens?: number;
  99. };
  100. };
  101. /**
  102. * Circuit breaker state — exported for tests
  103. */
  104. export type CircuitState = "closed" | "open" | "half-open";
  105. /**
  106. * Determine whether an HTTP status is retryable. 429 (Too Many Requests)
  107. * and 503 (Service Unavailable) are retried; 4xx (other than 429) are not.
  108. */
  109. export declare function isRetryableStatus(status: number): boolean;
  110. /**
  111. * Chunk an array into pieces of ≤ size each. `size` MUST be ≥ 1.
  112. */
  113. export declare function chunkArray<T>(items: T[], size: number): T[][];
  114. /**
  115. * Sliding-window circuit breaker. Tracks the last N samples (min 4) over a
  116. * 60-second window; flips OPEN when failure rate exceeds 50%, then auto-
  117. * resets to HALF-OPEN after 5 minutes — at which point the next probe
  118. * decides whether to close (success) or re-open (failure).
  119. */
  120. export declare class CircuitBreaker {
  121. private samples;
  122. private state;
  123. private openedAt;
  124. private readonly windowMs;
  125. private readonly openDurationMs;
  126. private readonly threshold;
  127. private readonly minSamples;
  128. private readonly now;
  129. constructor(opts?: {
  130. windowMs?: number;
  131. openDurationMs?: number;
  132. threshold?: number;
  133. minSamples?: number;
  134. now?: () => number;
  135. });
  136. getState(): CircuitState;
  137. /**
  138. * Returns true when calls should be short-circuited (skip HTTP, fall back).
  139. * Side-effects: may transition OPEN → HALF-OPEN if the open window expired.
  140. */
  141. shouldFailFast(): boolean;
  142. /** Record a successful call. */
  143. recordSuccess(): void;
  144. /** Record a failed call. May trigger OPEN. */
  145. recordFailure(): void;
  146. /** Force-reset the breaker (used by tests / admin) */
  147. reset(): void;
  148. private pushSample;
  149. private evaluate;
  150. private tickAutoReset;
  151. }
  152. /**
  153. * Raised when the circuit breaker is OPEN and a call is short-circuited.
  154. * Callers (e.g. fallback wrapper) can catch this to switch to local provider.
  155. */
  156. export declare class CircuitOpenError extends Error {
  157. constructor(message?: string);
  158. }
  159. /**
  160. * Persistent (non-retryable) HTTP error from upstream. Includes status code.
  161. */
  162. export declare class HttpError extends Error {
  163. readonly status: number;
  164. readonly bodyPreview: string;
  165. constructor(status: number, bodyPreview: string);
  166. }
  167. export declare class OpenAIEmbeddingsProvider implements EmbeddingProvider {
  168. readonly kind: ProviderKind;
  169. private readonly endpoint;
  170. private readonly apiKey?;
  171. private readonly modelId;
  172. private readonly upstreamModel;
  173. private readonly batchSize;
  174. private readonly concurrency;
  175. private readonly timeoutMs;
  176. private readonly fetchImpl;
  177. private readonly retryBackoffsMs;
  178. private readonly sleep;
  179. private readonly now;
  180. private dimensions;
  181. private lastError;
  182. readonly breaker: CircuitBreaker;
  183. constructor(config: OpenAIProviderConfig);
  184. getModelId(): string;
  185. getDimensions(): number | undefined;
  186. /**
  187. * Most recent per-chunk failure message (HTTP status + body preview, malformed
  188. * JSON, timeout, abort reason). Returns `undefined` after a successful call
  189. * or before the first call. See `EmbeddingProvider.getLastError`.
  190. */
  191. getLastError(): string | undefined;
  192. /** Endpoint URL configured at construction time — used by callers when
  193. * building error messages for failed first-chunk probes. */
  194. getEndpoint(): string;
  195. healthcheck(signal?: AbortSignal): Promise<ProviderHealth>;
  196. embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
  197. embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
  198. dispose(): Promise<void>;
  199. /**
  200. * Format a request-failure context string for `lastError`. Includes endpoint
  201. * + HTTP status + body preview when the error was an `HttpError`, otherwise
  202. * falls back to the message of the underlying error (or the value itself
  203. * when not an Error). Kept short — body preview is already capped at 1024
  204. * chars by `HttpError`, but we trim further here for the dimension-probe
  205. * thrown error which surfaces directly to users.
  206. */
  207. private formatErrorContext;
  208. private buildHeaders;
  209. /**
  210. * Single HTTP request with retry on 429/503. Returns embeddings indexed
  211. * the same as `texts`. Throws on non-retryable failure or all attempts
  212. * exhausted.
  213. */
  214. private requestWithRetry;
  215. /**
  216. * Issue one HTTP attempt to `POST /v1/embeddings`. Does NOT retry.
  217. */
  218. private requestOnce;
  219. }