// suby/qmd
							/**
 * openai.ts - OpenAI-compatible HTTP embedding provider
 *
 * Talks to any endpoint that implements `POST /v1/embeddings` with the OpenAI
 * shape: request `{model, input: string|string[]}`, response
 * `{data: [{embedding: number[], index: number}, ...]}`.
 *
 * Used by qmd to delegate embeddings to a GPU worker (e.g. ai.mm.mk →
 * qmd-embed-worker on `models` LXC, RTX 4090) instead of running
 * node-llama-cpp locally.
 *
 * Features:
 *   - Batches input in groups of ≤64 (configurable via QMD_EMBED_BATCH_SIZE)
 *   - Retries 429 / 503 with exponential backoff (1s, 4s, 16s)
 *   - 4xx (non-429) → no retry, count as failure
 *   - Circuit breaker: >50% failures in a 60s window → OPEN for 5 min,
 *     callers can use this to fall back to a local provider
 *   - Per-call timeout via AbortSignal (default QMD_EMBED_TIMEOUT_MS=30000)
 *   - Healthcheck via `GET /health` if available, else a probe embed call
 */
import type { EmbeddingProvider, ProviderEmbedOptions, ProviderEmbedding, ProviderHealth, ProviderKind } from "./provider.js";
/**
 * Default number of inputs sent per embeddings request.
 *
 * Most OpenAI-compatible embedding endpoints accept up to 2048 inputs per
 * call, but the cap is kept at 64 to bound memory use and latency. The file
 * header lists `QMD_EMBED_BATCH_SIZE` as the configuration knob for this
 * value, and `OpenAIProviderConfig.batchSize` documents this constant as its
 * default.
 */
export declare const DEFAULT_BATCH_SIZE = 64;
/**
 * Default cap on in-flight HTTP requests during a single `embedBatch` call.
 *
 * The qmd-embed-worker exposes a 4-way semaphore (`MAX_CONCURRENT_REQUESTS=4`)
 * and idles at queue-depth 1.0 under sequential clients (i-fkpnar9i baseline),
 * so a default of 4 matches the worker's advertised concurrency without
 * overshooting the GPU. Override per-deploy via `QMD_EMBED_CONCURRENCY`; a
 * value of 1 reverts to the legacy sequential dispatch.
 */
export declare const DEFAULT_CONCURRENCY = 4;
/**
 * Default per-request timeout, in milliseconds (30 s).
 *
 * embeddinggemma-300M on an RTX 4090 takes <500 ms per batch of 64 in
 * practice, so 30 s is a safe upper bound. The file header names
 * `QMD_EMBED_TIMEOUT_MS` as the corresponding setting.
 */
export declare const DEFAULT_TIMEOUT_MS = 30000;
/**
 * Retry backoff schedule (ms) for 429/503 responses. Documented as 3 attempts
 * total (initial + 2 retries), aligned with the issue spec "1s/4s/16s".
 *
 * NOTE(review): "1s/4s/16s" lists three delays, whereas "initial + 2 retries"
 * only needs two. The array contents are not visible in this declaration, so
 * confirm the actual schedule and attempt count against the implementation.
 */
export declare const RETRY_BACKOFFS_MS: readonly number[];
/**
 * Circuit breaker defaults. The breaker flips OPEN when the error rate
 * exceeds the threshold within the window; while OPEN, every call fails fast
 * so the caller can fall back.
 *
 * Width of the sliding window, in ms (60 s).
 */
export declare const CIRCUIT_WINDOW_MS = 60000;
/**
 * How long the breaker stays OPEN, in ms. The value is not visible in this
 * declaration; the file header and the `CircuitBreaker` docs describe it as
 * 5 minutes.
 */
export declare const CIRCUIT_OPEN_DURATION_MS: number;
/** Failure-rate threshold (0.5 = 50%); the breaker opens when the rate exceeds it. */
export declare const CIRCUIT_FAILURE_RATE_THRESHOLD = 0.5;
/**
 * Minimum sample count used by the breaker. Presumably the breaker cannot
 * trip until at least this many samples are in the window — TODO confirm.
 */
export declare const CIRCUIT_MIN_SAMPLES = 4;
/**
 * Construction options accepted by `OpenAIEmbeddingsProvider`.
 *
 * Only `endpoint` is required. The remaining fields are optional tuning knobs
 * (batching, concurrency, timeout) or injection points intended for tests
 * (`fetchImpl`, `retryBackoffsMs`, `sleep`, `now`).
 */
export type OpenAIProviderConfig = {
    /** Endpoint base URL — e.g. "https://ai.mm.mk" (no trailing slash) */
    endpoint: string;
    /** Optional bearer token sent as `Authorization: Bearer ...` */
    apiKey?: string;
    /**
     * Stable model identifier to report up via `getModelId()`.
     * Defaults to "embeddinggemma" to match qmd's existing DB rows.
     */
    modelId?: string;
    /**
     * Upstream model name sent in the HTTP request body. Often differs from
     * `modelId` (e.g. modelId="embeddinggemma" but upstream model="embeddinggemma:300m").
     */
    upstreamModel?: string;
    /** Batch size cap (default DEFAULT_BATCH_SIZE = 64) */
    batchSize?: number;
    /**
     * Max in-flight HTTP requests during a single `embedBatch` call. Default
     * `DEFAULT_CONCURRENCY=4` matches the worker semaphore. Set to 1 to force
     * legacy sequential dispatch (useful for benchmarks / regression bisect).
     */
    concurrency?: number;
    /** Per-request timeout in ms (default DEFAULT_TIMEOUT_MS = 30_000) */
    timeoutMs?: number;
    /** Custom fetch (for testing). Defaults to global `fetch`. */
    fetchImpl?: typeof fetch;
    /** Custom retry schedule (for testing). Defaults to RETRY_BACKOFFS_MS. */
    retryBackoffsMs?: readonly number[];
    /** Custom sleep impl (for testing). Defaults to setTimeout. */
    sleep?: (ms: number) => Promise<void>;
    /** Custom clock (for testing). Defaults to Date.now. */
    now?: () => number;
};
/**
 * JSON body shape of an OpenAI-compatible embeddings response, matching the
 * `{data: [{embedding: number[], index: number}, ...]}` shape described in the
 * file header. Only `data` is required; every other field is optional.
 */
export type OpenAIEmbeddingsResponse = {
    object?: string;
    model?: string;
    /** One entry per embedded input. */
    data: Array<{
        object?: string;
        /** Presumably the position of the corresponding input in the request — TODO confirm. */
        index: number;
        /** The embedding vector. */
        embedding: number[];
    }>;
    /** Optional token accounting reported by the upstream. */
    usage?: {
        prompt_tokens?: number;
        total_tokens?: number;
    };
};
/**
 * Circuit breaker state — exported for tests.
 *
 *   - "open":      calls are short-circuited so the caller can fall back.
 *   - "half-open": entered once the open window has expired; per the
 *                  `CircuitBreaker` docs the next probe decides whether to
 *                  close (success) or re-open (failure).
 *   - "closed":    the remaining state — presumably normal operation with
 *                  calls going through (confirm against the implementation).
 */
export type CircuitState = "closed" | "open" | "half-open";
/**
 * Determine whether an HTTP status is retryable. 429 (Too Many Requests)
 * and 503 (Service Unavailable) are retried; 4xx (other than 429) are not.
 *
 * NOTE(review): the treatment of 5xx codes other than 503 (e.g. 500, 502,
 * 504) is not stated here — check the implementation before relying on it.
 *
 * @param status HTTP response status code.
 * @returns `true` when the request should be retried.
 */
export declare function isRetryableStatus(status: number): boolean;
/**
 * Chunk an array into pieces of ≤ size each. `size` MUST be ≥ 1.
 *
 * NOTE(review): behavior for `size < 1` is not visible in this declaration
 * (it may throw or misbehave) — callers should validate beforehand.
 *
 * @param items Array to split.
 * @param size Maximum number of elements per chunk; must be ≥ 1.
 * @returns The chunks, as an array of arrays.
 */
export declare function chunkArray<T>(items: T[], size: number): T[][];
/**
 * Sliding-window circuit breaker. Tracks the last N samples (min 4) over a
 * 60-second window; flips OPEN when failure rate exceeds 50%, then auto-
 * resets to HALF-OPEN after 5 minutes — at which point the next probe
 * decides whether to close (success) or re-open (failure).
 *
 * The figures above describe the defaults; each can be overridden through
 * the constructor options.
 */
export declare class CircuitBreaker {
    // Internal state. Field types are erased in declaration output; the names
    // suggest recorded call outcomes, the current state, and the time the
    // breaker last opened — confirm against the implementation.
    private samples;
    private state;
    private openedAt;
    // Configuration, presumably resolved from the constructor options.
    private readonly windowMs;
    private readonly openDurationMs;
    private readonly threshold;
    private readonly minSamples;
    private readonly now;
    /**
     * @param opts Optional overrides. `windowMs`, `openDurationMs`, `threshold`
     *   and `minSamples` presumably default to the module-level `CIRCUIT_*`
     *   constants (TODO confirm); `now` is an injectable clock.
     */
    constructor(opts?: {
        windowMs?: number;
        openDurationMs?: number;
        threshold?: number;
        minSamples?: number;
        now?: () => number;
    });
    /** Current breaker state. */
    getState(): CircuitState;
    /**
     * Returns true when calls should be short-circuited (skip HTTP, fall back).
     * Side-effects: may transition OPEN → HALF-OPEN if the open window expired.
     */
    shouldFailFast(): boolean;
    /** Record a successful call. */
    recordSuccess(): void;
    /** Record a failed call. May trigger OPEN. */
    recordFailure(): void;
    /** Force-reset the breaker (used by tests / admin) */
    reset(): void;
    // Private helpers; their signatures are erased in declaration output.
    private pushSample;
    private evaluate;
    private tickAutoReset;
}
/**
 * Raised when the circuit breaker is OPEN and a call is short-circuited.
 * Callers (e.g. fallback wrapper) can catch this to switch to local provider.
 */
export declare class CircuitOpenError extends Error {
    /** @param message Optional error message. */
    constructor(message?: string);
}
/**
 * Persistent (non-retryable) HTTP error from upstream. Includes status code.
 */
export declare class HttpError extends Error {
    /** HTTP status code of the failed response. */
    readonly status: number;
    /**
     * Preview of the response body. The provider's `formatErrorContext` notes
     * describe it as capped at 1024 chars by this class.
     */
    readonly bodyPreview: string;
    /**
     * @param status HTTP status code of the failed response.
     * @param bodyPreview Response body text to attach as a preview.
     */
    constructor(status: number, bodyPreview: string);
}
/**
 * `EmbeddingProvider` implementation that delegates to an OpenAI-compatible
 * HTTP endpoint (`POST /v1/embeddings`). Per the file header it batches
 * input, retries 429/503 with backoff, applies a per-call timeout, and guards
 * upstream calls with a circuit breaker.
 */
export declare class OpenAIEmbeddingsProvider implements EmbeddingProvider {
    /** Provider kind tag, typed by `ProviderKind` from `./provider.js`. */
    readonly kind: ProviderKind;
    // Configuration, presumably resolved from the constructor's
    // `OpenAIProviderConfig` (see that type for meaning and defaults).
    private readonly endpoint;
    private readonly apiKey?;
    private readonly modelId;
    private readonly upstreamModel;
    private readonly batchSize;
    private readonly concurrency;
    private readonly timeoutMs;
    private readonly fetchImpl;
    private readonly retryBackoffsMs;
    private readonly sleep;
    private readonly now;
    // Mutable state, presumably backing `getDimensions()` and `getLastError()`.
    private dimensions;
    private lastError;
    /**
     * Circuit breaker for this provider. Public so callers can inspect it when
     * deciding whether to fall back to a local provider.
     */
    readonly breaker: CircuitBreaker;
    /** @param config Provider options; only `endpoint` is required. */
    constructor(config: OpenAIProviderConfig);
    /**
     * Stable model identifier (`OpenAIProviderConfig.modelId`, documented as
     * defaulting to "embeddinggemma").
     */
    getModelId(): string;
    /**
     * Embedding dimensionality, or `undefined` when not known — presumably
     * until the first successful embedding has been observed (TODO confirm).
     */
    getDimensions(): number | undefined;
    /**
     * Most recent per-chunk failure message (HTTP status + body preview, malformed
     * JSON, timeout, abort reason). Returns `undefined` after a successful call
     * or before the first call. See `EmbeddingProvider.getLastError`.
     */
    getLastError(): string | undefined;
    /** Endpoint URL configured at construction time — used by callers when
     *  building error messages for failed first-chunk probes. */
    getEndpoint(): string;
    /**
     * Check upstream health. Per the file header this uses `GET /health` if
     * available, else a probe embed call.
     *
     * @param signal Optional abort signal for the check.
     */
    healthcheck(signal?: AbortSignal): Promise<ProviderHealth>;
    /**
     * Embed a single text.
     *
     * @returns The embedding, or `null`. NOTE(review): presumably `null`
     *   signals a failed request, with details available via `getLastError()`
     *   — confirm against the `EmbeddingProvider` contract.
     */
    embed(text: string, options?: ProviderEmbedOptions): Promise<ProviderEmbedding | null>;
    /**
     * Embed many texts. Per the file header and config docs, input is sent in
     * batches of at most `batchSize` with up to `concurrency` requests in flight.
     *
     * @returns One entry per result, each an embedding or `null`. Presumably
     *   aligned positionally with `texts`, with `null` marking inputs whose
     *   chunk failed — TODO confirm.
     */
    embedBatch(texts: string[], options?: ProviderEmbedOptions): Promise<(ProviderEmbedding | null)[]>;
    /** Dispose of the provider; what is released is not visible in this declaration. */
    dispose(): Promise<void>;
    /**
     * Format a request-failure context string for `lastError`. Includes endpoint
     * + HTTP status + body preview when the error was an `HttpError`, otherwise
     * falls back to the message of the underlying error (or the value itself
     * when not an Error). Kept short — body preview is already capped at 1024
     * chars by `HttpError`, but we trim further here for the dimension-probe
     * thrown error which surfaces directly to users.
     */
    private formatErrorContext;
    // Presumably builds the request headers, including `Authorization: Bearer ...`
    // when an API key is configured — confirm against the implementation.
    private buildHeaders;
    /**
     * Single HTTP request with retry on 429/503. Returns embeddings indexed
     * the same as `texts`. Throws on non-retryable failure or all attempts
     * exhausted.
     */
    private requestWithRetry;
    /**
     * Issue one HTTP attempt to `POST /v1/embeddings`. Does NOT retry.
     */
    private requestOnce;
}