// suby/qmd
/**
 * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
 *
 * Provides embeddings, text generation, and reranking using local GGUF models.
 */
import { type Token as LlamaToken } from "node-llama-cpp";
/**
 * Report whether the local LLM has been opted out of via the environment.
 *
 * `QMD_DISABLE_LOCAL_LLM=1` opt-out: when set, `LlamaCpp.ensureLlama()`
 * throws on first invocation. Use for remote-only deployments where any
 * `getLlama()` call indicates an unintended fallback (e.g. cron host
 * without libvulkan-dev/glslc — issue i-c28wngnd).
 *
 * @param env - Environment variables to inspect. Optional; the fallback used
 *   when omitted is not visible in this declaration (presumably `process.env`
 *   — confirm against the implementation).
 * @returns `true` when the opt-out is set.
 */
export declare function isLocalLlmDisabled(env?: NodeJS.ProcessEnv): boolean;
/**
 * Resolve the GPU mode for `getLlama()`. The rules below are listed in
 * priority order; an explicit `QMD_LLAMA_GPU` value wins over auto-detection:
 *   1. Explicit `QMD_LLAMA_GPU=off|none|0|...`     → "cpu"
 *   2. Explicit `QMD_LLAMA_GPU=auto`               → "auto"
 *   3. Auto-detect: `QMD_EMBED_ENDPOINT` set        → "cpu"
 *      (remote embed provider — embed never touches local LLM. Rerank/expand
 *       still use prebuilt CPU binary; no Vulkan probe / cmake build.)
 *   4. Otherwise (legacy local-only setup)          → "auto"
 *
 * @param env - Environment variables to inspect. Optional; the fallback used
 *   when omitted is not visible in this declaration (presumably `process.env`
 *   — confirm against the implementation).
 * @returns `"cpu"` to force CPU-only operation, or `"auto"` to let the GPU be
 *   probed.
 */
export declare function resolveLlamaGpuMode(env?: NodeJS.ProcessEnv): "cpu" | "auto";
/**
 * Detect if a model URI uses the Qwen3-Embedding format.
 * Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma,
 * so callers need this to pick the right query/document formatting.
 *
 * @param modelUri - Model URI to classify. The exact matching rule is not
 *   visible in this declaration.
 * @returns `true` when the URI is recognised as a Qwen3-Embedding model.
 */
export declare function isQwen3EmbeddingModel(modelUri: string): boolean;
/**
 * Format a search query as the prompt string handed to the embedding model.
 * Uses nomic-style task prefix format for embeddinggemma (default).
 * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
 *
 * @param query - Raw query text.
 * @param modelUri - Embedding model the prompt is intended for. Optional;
 *   which model is assumed when omitted is not visible in this declaration.
 * @returns The formatted query string.
 */
export declare function formatQueryForEmbedding(query: string, modelUri?: string): string;
/**
 * Format a document as the prompt string handed to the embedding model.
 * Uses nomic-style format with title and text fields (default).
 * Qwen3-Embedding encodes documents as raw text without special prefixes.
 *
 * @param text - Document body.
 * @param title - Optional document title, used by the nomic-style format.
 * @param modelUri - Embedding model the prompt is intended for. Optional;
 *   which model is assumed when omitted is not visible in this declaration.
 * @returns The formatted document string.
 */
export declare function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string;
/**
 * Token with log probability. Appears in `GenerateResult.logprobs`.
 */
export type TokenLogProb = {
    /** Token text. */
    token: string;
    /** Log probability of the token (logarithm base is not specified here). */
    logprob: number;
};
/**
 * Embedding result, as returned by `embed` / `embedBatch`.
 */
export type EmbeddingResult = {
    /** Embedding vector; its length depends on the model. */
    embedding: number[];
    /** Model identifier reported with the result (presumably the model URI — confirm). */
    model: string;
};
/**
 * Generation result with optional logprobs, as returned by `LLM.generate`.
 */
export type GenerateResult = {
    /** Generated text. */
    text: string;
    /** Model identifier reported with the result (presumably the model URI — confirm). */
    model: string;
    /** Per-token log probabilities; absent when none were collected. */
    logprobs?: TokenLogProb[];
    /**
     * Completion flag.
     * NOTE(review): exact semantics (e.g. finished vs. truncated) are not
     * visible in this declaration — confirm against the implementation.
     */
    done: boolean;
};
/**
 * Rerank result for a single document.
 */
export type RerankDocumentResult = {
    /** File identifier of the scored document (mirrors `RerankDocument.file`). */
    file: string;
    /** Relevance score; per the `LLM.rerank` contract, higher = more relevant. */
    score: number;
    /** Presumably the document's position in the input array — confirm. */
    index: number;
};
/**
 * Batch rerank result, as returned by `rerank`.
 */
export type RerankResult = {
    /** One entry per scored document. Ordering is not specified in this declaration. */
    results: RerankDocumentResult[];
    /** Model identifier reported with the result (presumably the rerank model URI — confirm). */
    model: string;
};
/**
 * Model info, as returned by `LLM.modelExists`.
 */
export type ModelInfo = {
    /** Model name or URI that was queried. */
    name: string;
    /** Whether the model exists / is available. */
    exists: boolean;
    /** Path of the model, when one is known (presumably the local file path — confirm). */
    path?: string;
};
/**
 * Options for embedding. All fields are optional.
 */
export type EmbedOptions = {
    /** Embedding model override. */
    model?: string;
    /**
     * Marks the text as a search query rather than a document.
     * Presumably selects between `formatQueryForEmbedding` and
     * `formatDocForEmbedding` — confirm against the implementation.
     */
    isQuery?: boolean;
    /** Document title (presumably forwarded to `formatDocForEmbedding` — confirm). */
    title?: string;
};
/**
 * Options for text generation. All fields are optional; their defaults are
 * not visible in this declaration.
 */
export type GenerateOptions = {
    /** Generation model override. */
    model?: string;
    /** Upper bound on the number of generated tokens. */
    maxTokens?: number;
    /** Sampling temperature. */
    temperature?: number;
};
/**
 * Options for reranking.
 */
export type RerankOptions = {
    /** Rerank model override. */
    model?: string;
};
/**
 * Options for LLM sessions, accepted by `withLLMSession` and
 * `withLLMSessionForLlm`. All fields are optional.
 */
export type LLMSessionOptions = {
    /** Max session duration in ms (default: 10 minutes) */
    maxDuration?: number;
    /** External abort signal */
    signal?: AbortSignal;
    /** Debug name for logging */
    name?: string;
};
/**
 * Session interface for scoped LLM access with lifecycle guarantees.
 *
 * A session is what `withLLMSession` / `withLLMSessionForLlm` pass to their
 * callback. It exposes a subset of the `LLM` operations (no `generate`,
 * `modelExists` or `dispose`) plus validity/abort state.
 */
export interface ILLMSession {
    /** Embed a single text. May resolve to `null`; the conditions are not visible in this declaration. */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /** Embed several texts; the result array may contain `null` entries. */
    embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]>;
    /**
     * Expand a search query into multiple `Queryable` variations.
     *
     * NOTE(review): `LlamaCpp.expandQuery` additionally accepts an
     * `intent?: string` option that is not declared here — confirm whether
     * the session should expose it too.
     */
    expandQuery(query: string, options?: {
        context?: string;
        includeLexical?: boolean;
    }): Promise<Queryable[]>;
    /** Rerank documents by relevance to a query. */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /** Whether this session is still valid (not released or aborted) */
    readonly isValid: boolean;
    /** Abort signal for this session (aborts on release or maxDuration) */
    readonly signal: AbortSignal;
}
/**
 * Supported query types for different search backends.
 *
 * NOTE(review): the tags presumably stand for lexical (`lex`), vector
 * (`vec`) and hypothetical-document (`hyde`) queries — confirm against the
 * search code that consumes them.
 */
export type QueryType = 'lex' | 'vec' | 'hyde';
/**
 * A single query and its target backend type. `expandQuery` returns a list
 * of these.
 */
export type Queryable = {
    /** Backend the query text is intended for. */
    type: QueryType;
    /** Query text. */
    text: string;
};
/**
 * Document to rerank; input element of `rerank`.
 */
export type RerankDocument = {
    /** File identifier; echoed back in `RerankDocumentResult.file`. */
    file: string;
    /** Document text to score against the query. */
    text: string;
    /** Optional document title. */
    title?: string;
};
// Model URIs. All use the `hf:` scheme and point at GGUF files.

/** LFM2 1.2B base model (Q4_K_M quantisation). */
export declare const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
/** LFM2.5 1.2B instruct model (Q4_K_M quantisation). */
export declare const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
/** Default embedding model: embeddinggemma 300M (Q8_0). */
export declare const DEFAULT_EMBED_MODEL_URI = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
/** Default rerank model: Qwen3-Reranker 0.6B (Q8_0). */
export declare const DEFAULT_RERANK_MODEL_URI = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
/** Default generation model: the QMD query-expansion 1.7B model (q4_k_m). */
export declare const DEFAULT_GENERATE_MODEL_URI = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
/** Default directory for cached model files; the concrete path is computed at runtime and not visible here. */
export declare const DEFAULT_MODEL_CACHE_DIR: string;
/**
 * Outcome of pulling a single model; `pullModels` returns an array of these.
 */
export type PullResult = {
    /** Model identifier that was pulled (presumably the URI passed to `pullModels` — confirm). */
    model: string;
    /** Path of the model file. */
    path: string;
    /** Size of the model file in bytes. */
    sizeBytes: number;
    /** Presumably `true` when the model was re-fetched rather than reused from cache — confirm. */
    refreshed: boolean;
};
/**
 * Pull the given models and report one `PullResult` per model.
 *
 * @param models - Model identifiers to pull (presumably `hf:` URIs like the
 *   `DEFAULT_*_MODEL_URI` constants — confirm).
 * @param options - Optional settings:
 *   - `refresh`: presumably forces a re-download of already cached models — confirm.
 *   - `cacheDir`: directory for the model files; the fallback when omitted is
 *     not visible here (presumably `DEFAULT_MODEL_CACHE_DIR`).
 * @returns A promise resolving to the pull results.
 */
export declare function pullModels(models: string[], options?: {
    refresh?: boolean;
    cacheDir?: string;
}): Promise<PullResult[]>;
/**
 * Abstract LLM interface - implement this for different backends.
 * `LlamaCpp` is the implementation declared in this file.
 */
export interface LLM {
    /**
     * Get embeddings for text.
     * May resolve to `null`; the conditions are not visible in this declaration.
     */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /**
     * Generate text completion.
     * May resolve to `null`; the conditions are not visible in this declaration.
     */
    generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
    /**
     * Check if a model exists/is available
     */
    modelExists(model: string): Promise<ModelInfo>;
    /**
     * Expand a search query into multiple variations for different backends.
     * Returns a list of Queryable objects.
     *
     * NOTE(review): `LlamaCpp.expandQuery` additionally accepts an
     * `intent?: string` option that is not part of this interface — confirm
     * whether it should be declared here as well.
     */
    expandQuery(query: string, options?: {
        context?: string;
        includeLexical?: boolean;
    }): Promise<Queryable[]>;
    /**
     * Rerank documents by relevance to a query
     * Returns list of documents with relevance scores (higher = more relevant)
     */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /**
     * Dispose of resources
     */
    dispose(): Promise<void>;
}
/**
 * Constructor options for `LlamaCpp`. Every field is optional.
 */
export type LlamaCppConfig = {
    /** Embedding model URI (presumably defaults to `DEFAULT_EMBED_MODEL_URI` — confirm). */
    embedModel?: string;
    /** Generation model URI (presumably defaults to `DEFAULT_GENERATE_MODEL_URI` — confirm). */
    generateModel?: string;
    /** Rerank model URI (presumably defaults to `DEFAULT_RERANK_MODEL_URI` — confirm). */
    rerankModel?: string;
    /** Directory for cached model files (presumably defaults to `DEFAULT_MODEL_CACHE_DIR` — confirm). */
    modelCacheDir?: string;
    /**
     * Context size used for query expansion generation contexts.
     * Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
     */
    expandContextSize?: number;
    /**
     * Inactivity timeout in ms before unloading contexts (default: 2 minutes, 0 to disable).
     *
     * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
     * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
     * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
     */
    inactivityTimeoutMs?: number;
    /**
     * Whether to dispose models on inactivity (default: false).
     *
     * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
     * memory reclaim.
     */
    disposeModelsOnInactivity?: boolean;
};
/**
 * `LLM` implementation backed by node-llama-cpp and local GGUF models.
 *
 * The llama runtime, the models and their contexts are created lazily by the
 * private `ensure*` helpers. Idle resources are released by
 * `unloadIdleResources()`; `dispose()` releases everything.
 */
export declare class LlamaCpp implements LLM {
    // NOTE(review): the purpose of `_ciMode` is not visible in this declaration.
    private readonly _ciMode;
    // Lazily created runtime objects (see the ensure* helpers below).
    private llama;
    private embedModel;
    private embedContexts;
    private generateModel;
    private rerankModel;
    private rerankContexts;
    // Resolved configuration (see LlamaCppConfig).
    private embedModelUri;
    private generateModelUri;
    private rerankModelUri;
    private modelCacheDir;
    private expandContextSize;
    // In-flight model loads (presumably shared so concurrent callers do not load a model twice — confirm).
    private embedModelLoadPromise;
    private generateModelLoadPromise;
    private rerankModelLoadPromise;
    // Idle-unload state (see touchActivity / unloadIdleResources).
    private inactivityTimer;
    private inactivityTimeoutMs;
    private disposeModelsOnInactivity;
    private disposed;
    /**
     * @param config - Optional overrides for models, cache directory, context
     *   size and idle-unload behaviour; see `LlamaCppConfig`.
     */
    constructor(config?: LlamaCppConfig);
    /** Name of the embedding model in use (presumably derived from the embed model URI — confirm). */
    get embedModelName(): string;
    /**
     * Reset the inactivity timer. Called after each model operation.
     * When the timer fires, idle resources are unloaded (if no active sessions):
     * contexts by default, and models as well only when
     * `disposeModelsOnInactivity` is set — see `unloadIdleResources()`.
     */
    private touchActivity;
    /**
     * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
     */
    private hasLoadedContexts;
    /**
     * Unload idle resources but keep the instance alive for future use.
     *
     * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
     * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
     */
    unloadIdleResources(): Promise<void>;
    /**
     * Ensure model cache directory exists
     */
    private ensureModelCacheDir;
    /**
     * Initialize the llama instance (lazy)
     *
     * Env-var controls (i-c28wngnd):
     *   - QMD_DISABLE_LOCAL_LLM=1    : hard-disable; throws on first ensureLlama()
     *                                  call. Use when the deployment must NEVER
     *                                  load node-llama-cpp (e.g. headless cron
     *                                  on a host without libvulkan-dev/glslc).
     *   - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
     *   - QMD_LLAMA_GPU=auto         : explicit opt-in to GPU probe even when
     *                                  QMD_EMBED_ENDPOINT is set (rare; useful
     *                                  for hybrid local-rerank + remote-embed).
     *
     * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
     * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
     * the embed path runs over HTTP and the only remaining local LLM consumers
     * are rerank/query-expansion, which work fine on the prebuilt CPU binary
     * and never need to invoke cmake-js-llama. This silences ~30s/run of
     * Vulkan probe + cmake noise on headless LXCs.
     */
    private ensureLlama;
    /**
     * Resolve a model URI to a local path, downloading if needed
     */
    private resolveModel;
    /**
     * Load embedding model (lazy)
     */
    private ensureEmbedModel;
    /**
     * Compute how many parallel contexts to create.
     *
     * GPU: constrained by VRAM (25% of free, capped at 8).
     * CPU: constrained by cores. Splitting threads across contexts enables
     *      true parallelism (each context runs on its own cores). Use at most
     *      half the math cores, with at least 4 threads per context.
     */
    private computeParallelism;
    /**
     * Get the number of threads each context should use, given N parallel contexts.
     * Splits available math cores evenly across contexts.
     */
    private threadsPerContext;
    /**
     * Promise guard for `ensureEmbedContexts`: prevents concurrent callers
     * from racing to create the embed contexts.
     */
    private embedContextsCreatePromise;
    /**
     * Load embedding contexts (lazy). Creates multiple for parallel embedding.
     * Uses promise guard to prevent concurrent context creation race condition.
     */
    private ensureEmbedContexts;
    /**
     * Get a single embed context (for single-embed calls). Uses first from pool.
     */
    private ensureEmbedContext;
    /**
     * Load generation model (lazy) - context is created fresh per call
     */
    private ensureGenerateModel;
    /**
     * Load rerank model (lazy)
     */
    private ensureRerankModel;
    /**
     * Context size for rerank contexts. The value is not visible in this
     * declaration; the tuning notes on `ensureRerankContexts` mention 1024.
     */
    private static readonly RERANK_CONTEXT_SIZE;
    /** Context size for embed contexts. The value is not visible in this declaration. */
    private static readonly EMBED_CONTEXT_SIZE;
    /**
     * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
     * Each context has its own sequence, so they can evaluate independently.
     *
     * Tuning choices:
     * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
     * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
     * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
     */
    private ensureRerankContexts;
    /**
     * Tokenize text using the embedding model's tokenizer
     * Returns tokenizer tokens (opaque type from node-llama-cpp)
     */
    tokenize(text: string): Promise<readonly LlamaToken[]>;
    /**
     * Count tokens in text using the embedding model's tokenizer
     */
    countTokens(text: string): Promise<number>;
    /**
     * Detokenize token IDs back to text
     */
    detokenize(tokens: readonly LlamaToken[]): Promise<string>;
    /**
     * Truncate text to fit within the embedding model's context window.
     * Uses the model's own tokenizer for accurate token counting, then
     * detokenizes back to text if truncation is needed.
     * Returns the (possibly truncated) text and whether truncation occurred.
     */
    private truncateToContextSize;
    /**
     * Get embeddings for text (see `LLM.embed`).
     * May resolve to `null`; the conditions are not visible in this declaration.
     */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /**
     * Batch embed multiple texts efficiently
     * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
     */
    embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]>;
    /**
     * Generate text completion (see `LLM.generate`).
     * May resolve to `null`; the conditions are not visible in this declaration.
     */
    generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
    /** Check if a model exists/is available (see `LLM.modelExists`). */
    modelExists(modelUri: string): Promise<ModelInfo>;
    /**
     * Expand a search query into multiple variations for different backends
     * (see `LLM.expandQuery`).
     *
     * Accepts an extra `intent` option beyond what the `LLM` interface declares.
     * NOTE(review): how `intent` influences the expansion is not visible here.
     */
    expandQuery(query: string, options?: {
        context?: string;
        includeLexical?: boolean;
        intent?: string;
    }): Promise<Queryable[]>;
    // Rerank tuning constants; their values and exact use are not visible in this declaration.
    private static readonly RERANK_TEMPLATE_OVERHEAD;
    private static readonly RERANK_TARGET_DOCS_PER_CONTEXT;
    /**
     * Rerank documents by relevance to a query (see `LLM.rerank`);
     * higher scores mean more relevant.
     */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /**
     * Get device/GPU info for status display.
     * Initializes llama if not already done.
     *
     * In the result, `gpu` is `false` when no GPU backend is in use and `vram`
     * may be absent. NOTE(review): the `vram` figures are presumably in bytes
     * — confirm.
     */
    getDeviceInfo(): Promise<{
        gpu: string | false;
        gpuOffloading: boolean;
        gpuDevices: string[];
        vram?: {
            total: number;
            used: number;
            free: number;
        };
        cpuCores: number;
    }>;
    /** Dispose of resources (see `LLM.dispose`). */
    dispose(): Promise<void>;
}
/**
 * Error thrown when an operation is attempted on a released or aborted session.
 * Catch it with `instanceof SessionReleasedError`.
 */
export declare class SessionReleasedError extends Error {
    /**
     * @param message - Optional error message; the default used when omitted
     *   is not visible in this declaration.
     */
    constructor(message?: string);
}
/**
 * Execute a function with a scoped LLM session.
 * The session provides lifecycle guarantees - resources won't be disposed mid-operation.
 * The session is backed by the global default `LlamaCpp` instance; use
 * `withLLMSessionForLlm` to target a specific instance.
 *
 * @param fn - Callback that receives the session.
 * @param options - Optional session settings (max duration, external abort
 *   signal, debug name).
 * @returns A promise for the value produced by `fn`.
 *
 * @example
 * ```typescript
 * await withLLMSession(async (session) => {
 *   const expanded = await session.expandQuery(query);
 *   const embeddings = await session.embedBatch(texts);
 *   const reranked = await session.rerank(query, docs);
 *   return reranked;
 * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
 * ```
 */
export declare function withLLMSession<T>(fn: (session: ILLMSession) => Promise<T>, options?: LLMSessionOptions): Promise<T>;
/**
 * Execute a function with a scoped LLM session using a specific LlamaCpp instance.
 * Unlike withLLMSession, this does not use the global singleton.
 *
 * @param llm - The `LlamaCpp` instance the session operates on.
 * @param fn - Callback that receives the session.
 * @param options - Optional session settings (max duration, external abort
 *   signal, debug name).
 * @returns A promise for the value produced by `fn`.
 */
export declare function withLLMSessionForLlm<T>(llm: LlamaCpp, fn: (session: ILLMSession) => Promise<T>, options?: LLMSessionOptions): Promise<T>;
/**
 * Check if idle unload is safe (no active sessions or operations).
 * Used internally by LlamaCpp idle timer.
 *
 * @returns `true` when idle resources may be unloaded.
 */
export declare function canUnloadLLM(): boolean;
/**
 * Get the default LlamaCpp instance (creates one if needed).
 *
 * @returns The shared default instance.
 */
export declare function getDefaultLlamaCpp(): LlamaCpp;
/**
 * Set a custom default LlamaCpp instance (useful for testing).
 *
 * @param llm - Instance to install as the default, or `null` (presumably to
 *   clear it so the next `getDefaultLlamaCpp()` creates a fresh one — confirm).
 *   Whether a previously installed instance is disposed is not visible in
 *   this declaration.
 */
export declare function setDefaultLlamaCpp(llm: LlamaCpp | null): void;
/**
 * Dispose the default LlamaCpp instance if it exists.
 * Call this before process exit to prevent NAPI crashes.
 *
 * @returns A promise that settles once disposal has finished.
 */
export declare function disposeDefaultLlamaCpp(): Promise<void>;