suby
/
qmd


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287
							/**
 * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
 *
 * Provides embeddings, text generation, and reranking using local GGUF models.
 */
import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
import { homedir } from "os";
import { join } from "path";
import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
// =============================================================================
// Local-LLM env-var policy (i-c28wngnd)
// =============================================================================
/**
 * Truthy values for boolean-style env vars. Mirrors the convention used by
 * `QMD_LLAMA_GPU` (false-style) — kept narrow so unrelated values don't flip
 * the disable.
 */
const TRUTHY_ENV_VALUES = new Set(["1", "true", "yes", "on"]);
/**
 * Falsy / off-style values accepted by `QMD_LLAMA_GPU`.
 */
const QMD_LLAMA_GPU_OFF_VALUES = new Set([
    "false", "off", "none", "disable", "disabled", "0",
]);
/**
 * `QMD_DISABLE_LOCAL_LLM=1` opt-out: when set, `LlamaCpp.ensureLlama()`
 * throws on first invocation. Use for remote-only deployments where any
 * `getLlama()` call indicates an unintended fallback (e.g. cron host
 * without libvulkan-dev/glslc — issue i-c28wngnd).
 */
export function isLocalLlmDisabled(env = process.env) {
    const raw = env.QMD_DISABLE_LOCAL_LLM?.trim().toLowerCase();
    return raw !== undefined && TRUTHY_ENV_VALUES.has(raw);
}
/**
 * Resolve the GPU mode for `getLlama()`:
 *   1. Explicit `QMD_LLAMA_GPU=off|none|0|...`     → "cpu"
 *   2. Explicit `QMD_LLAMA_GPU=auto`               → "auto"
 *   3. Auto-detect: `QMD_EMBED_ENDPOINT` set        → "cpu"
 *      (remote embed provider — embed never touches local LLM. Rerank/expand
 *       still use prebuilt CPU binary; no Vulkan probe / cmake build.)
 *   4. Otherwise (legacy local-only setup)          → "auto"
 */
export function resolveLlamaGpuMode(env = process.env) {
    const explicit = env.QMD_LLAMA_GPU?.trim().toLowerCase();
    if (explicit !== undefined && explicit !== "") {
        if (QMD_LLAMA_GPU_OFF_VALUES.has(explicit))
            return "cpu";
        if (explicit === "auto" || explicit === "true" || explicit === "on") {
            return "auto";
        }
        // Unknown value — preserve legacy behavior (probe).
        return "auto";
    }
    // Auto-detect remote-only deployment. When QMD_EMBED_ENDPOINT is set the
    // embed path runs over HTTP (factory.ts resolveProviderKind), so any
    // local LLM access is for rerank/expand only — the prebuilt CPU binary
    // is sufficient and skipping the Vulkan probe avoids the ~30s cmake
    // attempt on hosts without libvulkan-dev/glslc.
    const remoteEmbed = env.QMD_EMBED_ENDPOINT?.trim();
    if (remoteEmbed && remoteEmbed !== "")
        return "cpu";
    return "auto";
}
// =============================================================================
// Embedding Formatting Functions
// =============================================================================
/**
 * Detect if a model URI uses the Qwen3-Embedding format.
 * Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma.
 */
export function isQwen3EmbeddingModel(modelUri) {
    return /qwen.*embed/i.test(modelUri) || /embed.*qwen/i.test(modelUri);
}
/**
 * Format a query for embedding.
 * Uses nomic-style task prefix format for embeddinggemma (default).
 * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
 */
export function formatQueryForEmbedding(query, modelUri) {
    const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    if (isQwen3EmbeddingModel(uri)) {
        return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
    }
    return `task: search result | query: ${query}`;
}
/**
 * Format a document for embedding.
 * Uses nomic-style format with title and text fields (default).
 * Qwen3-Embedding encodes documents as raw text without special prefixes.
 */
export function formatDocForEmbedding(text, title, modelUri) {
    const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    if (isQwen3EmbeddingModel(uri)) {
        // Qwen3-Embedding: documents are raw text, no task prefix
        return title ? `${title}\n${text}` : text;
    }
    return `title: ${title || "none"} | text: ${text}`;
}
// =============================================================================
// Model Configuration
// =============================================================================
// HuggingFace model URIs for node-llama-cpp
// Format: hf:<user>/<repo>/<file>
// Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf)
const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
// Alternative generation models for query expansion:
// LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
// Use these as base for fine-tuning with configs/sft_lfm2.yaml
export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
// Local model cache directory
const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
    ? join(process.env.XDG_CACHE_HOME, "qmd", "models")
    : join(homedir(), ".cache", "qmd", "models");
export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
function parseHfUri(model) {
    if (!model.startsWith("hf:"))
        return null;
    const without = model.slice(3);
    const parts = without.split("/");
    if (parts.length < 3)
        return null;
    const repo = parts.slice(0, 2).join("/");
    const file = parts.slice(2).join("/");
    return { repo, file };
}
async function getRemoteEtag(ref) {
    const url = `https://huggingface.co/${ref.repo}/resolve/main/${ref.file}`;
    try {
        const resp = await fetch(url, { method: "HEAD" });
        if (!resp.ok)
            return null;
        const etag = resp.headers.get("etag");
        return etag || null;
    }
    catch {
        return null;
    }
}
export async function pullModels(models, options = {}) {
    const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
    if (!existsSync(cacheDir)) {
        mkdirSync(cacheDir, { recursive: true });
    }
    const results = [];
    for (const model of models) {
        let refreshed = false;
        const hfRef = parseHfUri(model);
        const filename = model.split("/").pop();
        const entries = readdirSync(cacheDir, { withFileTypes: true });
        const cached = filename
            ? entries
                .filter((entry) => entry.isFile() && entry.name.includes(filename))
                .map((entry) => join(cacheDir, entry.name))
            : [];
        if (hfRef && filename) {
            const etagPath = join(cacheDir, `${filename}.etag`);
            const remoteEtag = await getRemoteEtag(hfRef);
            const localEtag = existsSync(etagPath)
                ? readFileSync(etagPath, "utf-8").trim()
                : null;
            const shouldRefresh = options.refresh || !remoteEtag || remoteEtag !== localEtag || cached.length === 0;
            if (shouldRefresh) {
                for (const candidate of cached) {
                    if (existsSync(candidate))
                        unlinkSync(candidate);
                }
                if (existsSync(etagPath))
                    unlinkSync(etagPath);
                refreshed = cached.length > 0;
            }
        }
        else if (options.refresh && filename) {
            for (const candidate of cached) {
                if (existsSync(candidate))
                    unlinkSync(candidate);
                refreshed = true;
            }
        }
        const path = await resolveModelFile(model, cacheDir);
        const sizeBytes = existsSync(path) ? statSync(path).size : 0;
        if (hfRef && filename) {
            const remoteEtag = await getRemoteEtag(hfRef);
            if (remoteEtag) {
                const etagPath = join(cacheDir, `${filename}.etag`);
                writeFileSync(etagPath, remoteEtag + "\n", "utf-8");
            }
        }
        results.push({ model, path, sizeBytes, refreshed });
    }
    return results;
}
/**
 * LLM implementation using node-llama-cpp
 */
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
function resolveExpandContextSize(configValue) {
    if (configValue !== undefined) {
        if (!Number.isInteger(configValue) || configValue <= 0) {
            throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
        }
        return configValue;
    }
    const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
    if (!envValue)
        return DEFAULT_EXPAND_CONTEXT_SIZE;
    const parsed = Number.parseInt(envValue, 10);
    if (!Number.isInteger(parsed) || parsed <= 0) {
        process.stderr.write(`QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`);
        return DEFAULT_EXPAND_CONTEXT_SIZE;
    }
    return parsed;
}
export class LlamaCpp {
    _ciMode = !!process.env.CI;
    llama = null;
    embedModel = null;
    embedContexts = [];
    generateModel = null;
    rerankModel = null;
    rerankContexts = [];
    embedModelUri;
    generateModelUri;
    rerankModelUri;
    modelCacheDir;
    expandContextSize;
    // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
    embedModelLoadPromise = null;
    generateModelLoadPromise = null;
    rerankModelLoadPromise = null;
    // Inactivity timer for auto-unloading models
    inactivityTimer = null;
    inactivityTimeoutMs;
    disposeModelsOnInactivity;
    // Track disposal state to prevent double-dispose
    disposed = false;
    constructor(config = {}) {
        this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
        this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
        this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
        this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
        this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
        this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
        this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
    }
    get embedModelName() {
        return this.embedModelUri;
    }
    /**
     * Reset the inactivity timer. Called after each model operation.
     * When timer fires, models are unloaded to free memory (if no active sessions).
     */
    touchActivity() {
        // Clear existing timer
        if (this.inactivityTimer) {
            clearTimeout(this.inactivityTimer);
            this.inactivityTimer = null;
        }
        // Only set timer if we have disposable contexts and timeout is enabled
        if (this.inactivityTimeoutMs > 0 && this.hasLoadedContexts()) {
            this.inactivityTimer = setTimeout(() => {
                // Check if session manager allows unloading
                // canUnloadLLM is defined later in this file - it checks the session manager
                // We use dynamic import pattern to avoid circular dependency issues
                if (typeof canUnloadLLM === 'function' && !canUnloadLLM()) {
                    // Active sessions/operations - reschedule timer
                    this.touchActivity();
                    return;
                }
                this.unloadIdleResources().catch(err => {
                    console.error("Error unloading idle resources:", err);
                });
            }, this.inactivityTimeoutMs);
            // Don't keep process alive just for this timer
            this.inactivityTimer.unref();
        }
    }
    /**
     * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
     */
    hasLoadedContexts() {
        return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
    }
    /**
     * Unload idle resources but keep the instance alive for future use.
     *
     * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
     * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
     */
    async unloadIdleResources() {
        // Don't unload if already disposed
        if (this.disposed) {
            return;
        }
        // Clear timer
        if (this.inactivityTimer) {
            clearTimeout(this.inactivityTimer);
            this.inactivityTimer = null;
        }
        // Dispose contexts first
        for (const ctx of this.embedContexts) {
            await ctx.dispose();
        }
        this.embedContexts = [];
        for (const ctx of this.rerankContexts) {
            await ctx.dispose();
        }
        this.rerankContexts = [];
        // Optionally dispose models too (opt-in)
        if (this.disposeModelsOnInactivity) {
            if (this.embedModel) {
                await this.embedModel.dispose();
                this.embedModel = null;
            }
            if (this.generateModel) {
                await this.generateModel.dispose();
                this.generateModel = null;
            }
            if (this.rerankModel) {
                await this.rerankModel.dispose();
                this.rerankModel = null;
            }
            // Reset load promises so models can be reloaded later
            this.embedModelLoadPromise = null;
            this.generateModelLoadPromise = null;
            this.rerankModelLoadPromise = null;
        }
        // Note: We keep llama instance alive - it's lightweight
    }
    /**
     * Ensure model cache directory exists
     */
    ensureModelCacheDir() {
        if (!existsSync(this.modelCacheDir)) {
            mkdirSync(this.modelCacheDir, { recursive: true });
        }
    }
    /**
     * Initialize the llama instance (lazy)
     *
     * Env-var controls (i-c28wngnd):
     *   - QMD_DISABLE_LOCAL_LLM=1    : hard-disable; throws on first ensureLlama()
     *                                  call. Use when the deployment must NEVER
     *                                  load node-llama-cpp (e.g. headless cron
     *                                  on a host without libvulkan-dev/glslc).
     *   - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
     *   - QMD_LLAMA_GPU=auto         : explicit opt-in to GPU probe even when
     *                                  QMD_EMBED_ENDPOINT is set (rare; useful
     *                                  for hybrid local-rerank + remote-embed).
     *
     * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
     * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
     * the embed path runs over HTTP and the only remaining local LLM consumers
     * are rerank/query-expansion, which work fine on the prebuilt CPU binary
     * and never need to invoke cmake-js-llama. This silences ~30s/run of
     * Vulkan probe + cmake noise on headless LXCs.
     */
    async ensureLlama() {
        if (!this.llama) {
            // Hard-disable opt-out — fails fast so the caller knows. Throw early
            // so any path that ignores the documented `EmbeddingProvider` route
            // and reaches for the local LLM gets a loud, actionable error rather
            // than a silent 30s Vulkan compile attempt.
            if (isLocalLlmDisabled(process.env)) {
                throw new Error("QMD_DISABLE_LOCAL_LLM=1 — local node-llama-cpp is disabled. " +
                    "This deployment is configured for remote embeddings only; the " +
                    "code path that reached `ensureLlama()` should route through an " +
                    "EmbeddingProvider (set QMD_EMBED_ENDPOINT) instead. Unset " +
                    "QMD_DISABLE_LOCAL_LLM to re-enable local rerank/expand.");
            }
            // Resolve GPU mode: explicit QMD_LLAMA_GPU wins, else auto-detect
            // remote-only deployment (CPU when QMD_EMBED_ENDPOINT is set), else
            // probe GPU normally for legacy local-only setups.
            const gpuMode = resolveLlamaGpuMode(process.env);
            const loadLlama = async (gpu) => await getLlama({
                build: "autoAttempt",
                logLevel: LlamaLogLevel.error,
                gpu,
            });
            let llama;
            if (gpuMode === "cpu") {
                llama = await loadLlama(false);
            }
            else {
                try {
                    llama = await loadLlama("auto");
                }
                catch (err) {
                    // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
                    // Fall back to CPU so qmd still works.
                    process.stderr.write(`QMD Warning: GPU init failed (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
                    llama = await loadLlama(false);
                }
            }
            // Suppress the "running on CPU (slow)" warning when CPU was requested
            // explicitly or auto-selected for a remote-only deployment — there's
            // nothing the operator can do about it and the hint isn't relevant
            // (embed runs via HTTP; only rerank/expand use the local CPU path).
            if (llama.gpu === false && gpuMode === "auto") {
                process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
            }
            this.llama = llama;
        }
        return this.llama;
    }
    /**
     * Resolve a model URI to a local path, downloading if needed
     */
    async resolveModel(modelUri) {
        this.ensureModelCacheDir();
        // resolveModelFile handles HF URIs and downloads to the cache dir
        return await resolveModelFile(modelUri, this.modelCacheDir);
    }
    /**
     * Load embedding model (lazy)
     */
    async ensureEmbedModel() {
        if (this.embedModel) {
            return this.embedModel;
        }
        if (this.embedModelLoadPromise) {
            return await this.embedModelLoadPromise;
        }
        this.embedModelLoadPromise = (async () => {
            const llama = await this.ensureLlama();
            const modelPath = await this.resolveModel(this.embedModelUri);
            const model = await llama.loadModel({ modelPath });
            this.embedModel = model;
            // Model loading counts as activity - ping to keep alive
            this.touchActivity();
            return model;
        })();
        try {
            return await this.embedModelLoadPromise;
        }
        finally {
            // Keep the resolved model cached; clear only the in-flight promise.
            this.embedModelLoadPromise = null;
        }
    }
    /**
     * Compute how many parallel contexts to create.
     *
     * GPU: constrained by VRAM (25% of free, capped at 8).
     * CPU: constrained by cores. Splitting threads across contexts enables
     *      true parallelism (each context runs on its own cores). Use at most
     *      half the math cores, with at least 4 threads per context.
     */
    async computeParallelism(perContextMB) {
        const llama = await this.ensureLlama();
        if (llama.gpu) {
            try {
                const vram = await llama.getVramState();
                const freeMB = vram.free / (1024 * 1024);
                const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
                return Math.max(1, Math.min(8, maxByVram));
            }
            catch {
                return 2;
            }
        }
        // CPU: split cores across contexts. At least 4 threads per context.
        const cores = llama.cpuMathCores || 4;
        const maxContexts = Math.floor(cores / 4);
        return Math.max(1, Math.min(4, maxContexts));
    }
    /**
     * Get the number of threads each context should use, given N parallel contexts.
     * Splits available math cores evenly across contexts.
     */
    async threadsPerContext(parallelism) {
        const llama = await this.ensureLlama();
        if (llama.gpu)
            return 0; // GPU: let the library decide
        const cores = llama.cpuMathCores || 4;
        return Math.max(1, Math.floor(cores / parallelism));
    }
    /**
     * Load embedding contexts (lazy). Creates multiple for parallel embedding.
     * Uses promise guard to prevent concurrent context creation race condition.
     */
    embedContextsCreatePromise = null;
    async ensureEmbedContexts() {
        if (this.embedContexts.length > 0) {
            this.touchActivity();
            return this.embedContexts;
        }
        if (this.embedContextsCreatePromise) {
            return await this.embedContextsCreatePromise;
        }
        this.embedContextsCreatePromise = (async () => {
            const model = await this.ensureEmbedModel();
            // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
            const n = await this.computeParallelism(150);
            const threads = await this.threadsPerContext(n);
            for (let i = 0; i < n; i++) {
                try {
                    this.embedContexts.push(await model.createEmbeddingContext({
                        contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
                        ...(threads > 0 ? { threads } : {}),
                    }));
                }
                catch {
                    if (this.embedContexts.length === 0)
                        throw new Error("Failed to create any embedding context");
                    break;
                }
            }
            this.touchActivity();
            return this.embedContexts;
        })();
        try {
            return await this.embedContextsCreatePromise;
        }
        finally {
            this.embedContextsCreatePromise = null;
        }
    }
    /**
     * Get a single embed context (for single-embed calls). Uses first from pool.
     */
    async ensureEmbedContext() {
        const contexts = await this.ensureEmbedContexts();
        return contexts[0];
    }
    /**
     * Load generation model (lazy) - context is created fresh per call
     */
    async ensureGenerateModel() {
        if (!this.generateModel) {
            if (this.generateModelLoadPromise) {
                return await this.generateModelLoadPromise;
            }
            this.generateModelLoadPromise = (async () => {
                const llama = await this.ensureLlama();
                const modelPath = await this.resolveModel(this.generateModelUri);
                const model = await llama.loadModel({ modelPath });
                this.generateModel = model;
                return model;
            })();
            try {
                await this.generateModelLoadPromise;
            }
            finally {
                this.generateModelLoadPromise = null;
            }
        }
        this.touchActivity();
        if (!this.generateModel) {
            throw new Error("Generate model not loaded");
        }
        return this.generateModel;
    }
    /**
     * Load rerank model (lazy)
     */
    async ensureRerankModel() {
        if (this.rerankModel) {
            return this.rerankModel;
        }
        if (this.rerankModelLoadPromise) {
            return await this.rerankModelLoadPromise;
        }
        this.rerankModelLoadPromise = (async () => {
            const llama = await this.ensureLlama();
            const modelPath = await this.resolveModel(this.rerankModelUri);
            const model = await llama.loadModel({ modelPath });
            this.rerankModel = model;
            // Model loading counts as activity - ping to keep alive
            this.touchActivity();
            return model;
        })();
        try {
            return await this.rerankModelLoadPromise;
        }
        finally {
            this.rerankModelLoadPromise = null;
        }
    }
    /**
     * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
     * Each context has its own sequence, so they can evaluate independently.
     *
     * Tuning choices:
     * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
     * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
     * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
     */
    // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
    // Default 2048 was too small for longer documents (e.g. session transcripts,
    // CJK text, or large markdown files) — callers hit "input lengths exceed
    // context size" errors even after truncation because the overhead estimate
    // was insufficient.  4096 comfortably fits the largest real-world chunks
    // while staying well below the 40 960-token auto size.
    // Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
    static RERANK_CONTEXT_SIZE = (() => {
        const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
        return Number.isFinite(v) && v > 0 ? v : 4096;
    })();
    static EMBED_CONTEXT_SIZE = (() => {
        const v = parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10);
        return Number.isFinite(v) && v > 0 ? v : 2048;
    })();
    async ensureRerankContexts() {
        if (this.rerankContexts.length === 0) {
            const model = await this.ensureRerankModel();
            // ~960 MB per context with flash attention at contextSize 2048
            const n = Math.min(await this.computeParallelism(1000), 4);
            const threads = await this.threadsPerContext(n);
            for (let i = 0; i < n; i++) {
                try {
                    this.rerankContexts.push(await model.createRankingContext({
                        contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
                        flashAttention: true,
                        ...(threads > 0 ? { threads } : {}),
                    }));
                }
                catch {
                    if (this.rerankContexts.length === 0) {
                        // Flash attention might not be supported — retry without it
                        try {
                            this.rerankContexts.push(await model.createRankingContext({
                                contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
                                ...(threads > 0 ? { threads } : {}),
                            }));
                        }
                        catch {
                            throw new Error("Failed to create any rerank context");
                        }
                    }
                    break;
                }
            }
        }
        this.touchActivity();
        return this.rerankContexts;
    }
    // ==========================================================================
    // Tokenization
    // ==========================================================================
    /**
     * Tokenize text using the embedding model's tokenizer
     * Returns tokenizer tokens (opaque type from node-llama-cpp)
     */
    async tokenize(text) {
        await this.ensureEmbedContext(); // Ensure model is loaded
        if (!this.embedModel) {
            throw new Error("Embed model not loaded");
        }
        return this.embedModel.tokenize(text);
    }
    /**
     * Count tokens in text using the embedding model's tokenizer
     */
    async countTokens(text) {
        const tokens = await this.tokenize(text);
        return tokens.length;
    }
    /**
     * Detokenize token IDs back to text
     */
    async detokenize(tokens) {
        await this.ensureEmbedContext();
        if (!this.embedModel) {
            throw new Error("Embed model not loaded");
        }
        return this.embedModel.detokenize(tokens);
    }
    // ==========================================================================
    // Core API methods
    // ==========================================================================
    /**
     * Truncate text to fit within the embedding model's context window.
     * Uses the model's own tokenizer for accurate token counting, then
     * detokenizes back to text if truncation is needed.
     * Returns the (possibly truncated) text and whether truncation occurred.
     */
    async truncateToContextSize(text) {
        if (!this.embedModel)
            return { text, truncated: false };
        const maxTokens = this.embedModel.trainContextSize;
        if (maxTokens <= 0)
            return { text, truncated: false };
        const tokens = this.embedModel.tokenize(text);
        if (tokens.length <= maxTokens)
            return { text, truncated: false };
        // Leave a small margin (4 tokens) for BOS/EOS overhead
        const safeLimit = Math.max(1, maxTokens - 4);
        const truncatedTokens = tokens.slice(0, safeLimit);
        const truncatedText = this.embedModel.detokenize(truncatedTokens);
        return { text: truncatedText, truncated: true };
    }
    async embed(text, options = {}) {
        // Ping activity at start to keep models alive during this operation
        this.touchActivity();
        try {
            const context = await this.ensureEmbedContext();
            // Guard: truncate text that exceeds model context window to prevent GGML crash
            const { text: safeText, truncated } = await this.truncateToContextSize(text);
            if (truncated) {
                console.warn(`⚠ Text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
            }
            const embedding = await context.getEmbeddingFor(safeText);
            return {
                embedding: Array.from(embedding.vector),
                model: options.model ?? this.embedModelUri,
            };
        }
        catch (error) {
            console.error("Embedding error:", error);
            return null;
        }
    }
    /**
     * Batch embed multiple texts efficiently
     * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
     */
    async embedBatch(texts, options = {}) {
        if (this._ciMode)
            throw new Error("LLM operations are disabled in CI (set CI=true)");
        // Ping activity at start to keep models alive during this operation
        this.touchActivity();
        if (texts.length === 0)
            return [];
        try {
            const contexts = await this.ensureEmbedContexts();
            const n = contexts.length;
            if (n === 1) {
                // Single context: sequential (no point splitting)
                const context = contexts[0];
                const embeddings = [];
                for (const text of texts) {
                    try {
                        const { text: safeText, truncated } = await this.truncateToContextSize(text);
                        if (truncated) {
                            console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
                        }
                        const embedding = await context.getEmbeddingFor(safeText);
                        this.touchActivity();
                        embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
                    }
                    catch (err) {
                        console.error("Embedding error for text:", err);
                        embeddings.push(null);
                    }
                }
                return embeddings;
            }
            // Multiple contexts: split texts across contexts for parallel evaluation
            const chunkSize = Math.ceil(texts.length / n);
            const chunks = Array.from({ length: n }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize));
            const chunkResults = await Promise.all(chunks.map(async (chunk, i) => {
                const ctx = contexts[i];
                const results = [];
                for (const text of chunk) {
                    try {
                        const { text: safeText, truncated } = await this.truncateToContextSize(text);
                        if (truncated) {
                            console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
                        }
                        const embedding = await ctx.getEmbeddingFor(safeText);
                        this.touchActivity();
                        results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
                    }
                    catch (err) {
                        console.error("Embedding error for text:", err);
                        results.push(null);
                    }
                }
                return results;
            }));
            return chunkResults.flat();
        }
        catch (error) {
            console.error("Batch embedding error:", error);
            return texts.map(() => null);
        }
    }
    async generate(prompt, options = {}) {
        if (this._ciMode)
            throw new Error("LLM operations are disabled in CI (set CI=true)");
        // Ping activity at start to keep models alive during this operation
        this.touchActivity();
        // Ensure model is loaded
        await this.ensureGenerateModel();
        // Create fresh context -> sequence -> session for each call
        const context = await this.generateModel.createContext();
        const sequence = context.getSequence();
        const session = new LlamaChatSession({ contextSequence: sequence });
        const maxTokens = options.maxTokens ?? 150;
        // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
        // DO NOT use greedy decoding (temp=0) - causes repetition loops
        const temperature = options.temperature ?? 0.7;
        let result = "";
        try {
            await session.prompt(prompt, {
                maxTokens,
                temperature,
                topK: 20,
                topP: 0.8,
                onTextChunk: (text) => {
                    result += text;
                },
            });
            return {
                text: result,
                model: this.generateModelUri,
                done: true,
            };
        }
        finally {
            // Dispose context (which disposes dependent sequences/sessions per lifecycle rules)
            await context.dispose();
        }
    }
    async modelExists(modelUri) {
        // For HuggingFace URIs, we assume they exist
        // For local paths, check if file exists
        if (modelUri.startsWith("hf:")) {
            return { name: modelUri, exists: true };
        }
        const exists = existsSync(modelUri);
        return {
            name: modelUri,
            exists,
            path: exists ? modelUri : undefined,
        };
    }
    // ==========================================================================
    // High-level abstractions
    // ==========================================================================
    async expandQuery(query, options = {}) {
        if (this._ciMode)
            throw new Error("LLM operations are disabled in CI (set CI=true)");
        // Ping activity at start to keep models alive during this operation
        this.touchActivity();
        const llama = await this.ensureLlama();
        await this.ensureGenerateModel();
        const includeLexical = options.includeLexical ?? true;
        const context = options.context;
        const grammar = await llama.createGrammar({
            grammar: `
        root ::= line+
        line ::= type ": " content "\\n"
        type ::= "lex" | "vec" | "hyde"
        content ::= [^\\n]+
      `
        });
        const intent = options.intent;
        const prompt = intent
            ? `/no_think Expand this search query: ${query}\nQuery intent: ${intent}`
            : `/no_think Expand this search query: ${query}`;
        // Create a bounded context for expansion to prevent large default VRAM allocations.
        const genContext = await this.generateModel.createContext({
            contextSize: this.expandContextSize,
        });
        const sequence = genContext.getSequence();
        const session = new LlamaChatSession({ contextSequence: sequence });
        try {
            // Qwen3 recommended settings for non-thinking mode:
            // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
            // DO NOT use greedy decoding (temp=0) - causes infinite loops
            const result = await session.prompt(prompt, {
                grammar,
                maxTokens: 600,
                temperature: 0.7,
                topK: 20,
                topP: 0.8,
                repeatPenalty: {
                    lastTokens: 64,
                    presencePenalty: 0.5,
                },
            });
            const lines = result.trim().split("\n");
            const queryLower = query.toLowerCase();
            const queryTerms = queryLower.replace(/[^a-z0-9\s]/g, " ").split(/\s+/).filter(Boolean);
            const hasQueryTerm = (text) => {
                const lower = text.toLowerCase();
                if (queryTerms.length === 0)
                    return true;
                return queryTerms.some(term => lower.includes(term));
            };
            const queryables = lines.map(line => {
                const colonIdx = line.indexOf(":");
                if (colonIdx === -1)
                    return null;
                const type = line.slice(0, colonIdx).trim();
                if (type !== 'lex' && type !== 'vec' && type !== 'hyde')
                    return null;
                const text = line.slice(colonIdx + 1).trim();
                if (!hasQueryTerm(text))
                    return null;
                return { type: type, text };
            }).filter((q) => q !== null);
            // Filter out lex entries if not requested
            const filtered = includeLexical ? queryables : queryables.filter(q => q.type !== 'lex');
            if (filtered.length > 0)
                return filtered;
            const fallback = [
                { type: 'hyde', text: `Information about ${query}` },
                { type: 'lex', text: query },
                { type: 'vec', text: query },
            ];
            return includeLexical ? fallback : fallback.filter(q => q.type !== 'lex');
        }
        catch (error) {
            console.error("Structured query expansion failed:", error);
            // Fallback to original query
            const fallback = [{ type: 'vec', text: query }];
            if (includeLexical)
                fallback.unshift({ type: 'lex', text: query });
            return fallback;
        }
        finally {
            await genContext.dispose();
        }
    }
    // Qwen3 reranker chat template overhead (system prompt, tags, separators).
    // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
    // the truncation budget never lets a document slip past the context limit.
    static RERANK_TEMPLATE_OVERHEAD = 512;
    static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
    async rerank(query, documents, options = {}) {
        if (this._ciMode)
            throw new Error("LLM operations are disabled in CI (set CI=true)");
        // Ping activity at start to keep models alive during this operation
        this.touchActivity();
        const contexts = await this.ensureRerankContexts();
        const model = await this.ensureRerankModel();
        // Truncate documents that would exceed the rerank context size.
        // Budget = contextSize - template overhead - query tokens
        const queryTokens = model.tokenize(query).length;
        const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
        const truncationCache = new Map();
        const truncatedDocs = documents.map((doc) => {
            const cached = truncationCache.get(doc.text);
            if (cached !== undefined) {
                return cached === doc.text ? doc : { ...doc, text: cached };
            }
            const tokens = model.tokenize(doc.text);
            const truncatedText = tokens.length <= maxDocTokens
                ? doc.text
                : model.detokenize(tokens.slice(0, maxDocTokens));
            truncationCache.set(doc.text, truncatedText);
            if (truncatedText === doc.text)
                return doc;
            return { ...doc, text: truncatedText };
        });
        // Deduplicate identical effective texts before scoring.
        // This avoids redundant work for repeated chunks and fixes collisions where
        // multiple docs map to the same chunk text.
        const textToDocs = new Map();
        truncatedDocs.forEach((doc, index) => {
            const existing = textToDocs.get(doc.text);
            if (existing) {
                existing.push({ file: doc.file, index });
            }
            else {
                textToDocs.set(doc.text, [{ file: doc.file, index }]);
            }
        });
        // Extract just the text for ranking
        const texts = Array.from(textToDocs.keys());
        // Split documents across contexts for parallel evaluation.
        // Each context has its own sequence with a lock, so parallelism comes
        // from multiple contexts evaluating different chunks simultaneously.
        const activeContextCount = Math.max(1, Math.min(contexts.length, Math.ceil(texts.length / LlamaCpp.RERANK_TARGET_DOCS_PER_CONTEXT)));
        const activeContexts = contexts.slice(0, activeContextCount);
        const chunkSize = Math.ceil(texts.length / activeContexts.length);
        const chunks = Array.from({ length: activeContexts.length }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize)).filter(chunk => chunk.length > 0);
        const allScores = await Promise.all(chunks.map((chunk, i) => activeContexts[i].rankAll(query, chunk)));
        // Reassemble scores in original order and sort
        const flatScores = allScores.flat();
        const ranked = texts
            .map((text, i) => ({ document: text, score: flatScores[i] }))
            .sort((a, b) => b.score - a.score);
        // Map back to our result format.
        const results = [];
        for (const item of ranked) {
            const docInfos = textToDocs.get(item.document) ?? [];
            for (const docInfo of docInfos) {
                results.push({
                    file: docInfo.file,
                    score: item.score,
                    index: docInfo.index,
                });
            }
        }
        return {
            results,
            model: this.rerankModelUri,
        };
    }
    /**
     * Get device/GPU info for status display.
     * Initializes llama if not already done.
     */
    async getDeviceInfo() {
        const llama = await this.ensureLlama();
        const gpuDevices = await llama.getGpuDeviceNames();
        let vram;
        if (llama.gpu) {
            try {
                const state = await llama.getVramState();
                vram = { total: state.total, used: state.used, free: state.free };
            }
            catch { /* no vram info */ }
        }
        return {
            gpu: llama.gpu,
            gpuOffloading: llama.supportsGpuOffloading,
            gpuDevices,
            vram,
            cpuCores: llama.cpuMathCores,
        };
    }
    async dispose() {
        // Prevent double-dispose
        if (this.disposed) {
            return;
        }
        this.disposed = true;
        // Clear inactivity timer
        if (this.inactivityTimer) {
            clearTimeout(this.inactivityTimer);
            this.inactivityTimer = null;
        }
        // Disposing llama cascades to models and contexts automatically
        // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
        // Note: llama.dispose() can hang indefinitely, so we use a timeout
        if (this.llama) {
            const disposePromise = this.llama.dispose();
            const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 1000));
            await Promise.race([disposePromise, timeoutPromise]);
        }
        // Clear references
        this.embedContexts = [];
        this.rerankContexts = [];
        this.embedModel = null;
        this.generateModel = null;
        this.rerankModel = null;
        this.llama = null;
        // Clear any in-flight load/create promises
        this.embedModelLoadPromise = null;
        this.embedContextsCreatePromise = null;
        this.generateModelLoadPromise = null;
        this.rerankModelLoadPromise = null;
    }
}
// =============================================================================
// Session Management Layer
// =============================================================================
/**
 * Manages LLM session lifecycle with reference counting.
 * Coordinates with LlamaCpp idle timeout to prevent disposal during active sessions.
 */
class LLMSessionManager {
    llm;
    _activeSessionCount = 0;
    _inFlightOperations = 0;
    constructor(llm) {
        this.llm = llm;
    }
    get activeSessionCount() {
        return this._activeSessionCount;
    }
    get inFlightOperations() {
        return this._inFlightOperations;
    }
    /**
     * Returns true only when both session count and in-flight operations are 0.
     * Used by LlamaCpp to determine if idle unload is safe.
     */
    canUnload() {
        return this._activeSessionCount === 0 && this._inFlightOperations === 0;
    }
    acquire() {
        this._activeSessionCount++;
    }
    release() {
        this._activeSessionCount = Math.max(0, this._activeSessionCount - 1);
    }
    operationStart() {
        this._inFlightOperations++;
    }
    operationEnd() {
        this._inFlightOperations = Math.max(0, this._inFlightOperations - 1);
    }
    getLlamaCpp() {
        return this.llm;
    }
}
/**
 * Error thrown when an operation is attempted on a released or aborted session.
 */
export class SessionReleasedError extends Error {
    constructor(message = "LLM session has been released or aborted") {
        super(message);
        this.name = "SessionReleasedError";
    }
}
/**
 * Scoped LLM session with automatic lifecycle management.
 * Wraps LlamaCpp methods with operation tracking and abort handling.
 */
class LLMSession {
    manager;
    released = false;
    abortController;
    maxDurationTimer = null;
    name;
    constructor(manager, options = {}) {
        this.manager = manager;
        this.name = options.name || "unnamed";
        this.abortController = new AbortController();
        // Link external abort signal if provided
        if (options.signal) {
            if (options.signal.aborted) {
                this.abortController.abort(options.signal.reason);
            }
            else {
                options.signal.addEventListener("abort", () => {
                    this.abortController.abort(options.signal.reason);
                }, { once: true });
            }
        }
        // Set up max duration timer
        const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
        if (maxDuration > 0) {
            this.maxDurationTimer = setTimeout(() => {
                this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
            }, maxDuration);
            this.maxDurationTimer.unref(); // Don't keep process alive
        }
        // Acquire session lease
        this.manager.acquire();
    }
    get isValid() {
        return !this.released && !this.abortController.signal.aborted;
    }
    get signal() {
        return this.abortController.signal;
    }
    /**
     * Release the session and decrement ref count.
     * Called automatically by withLLMSession when the callback completes.
     */
    release() {
        if (this.released)
            return;
        this.released = true;
        if (this.maxDurationTimer) {
            clearTimeout(this.maxDurationTimer);
            this.maxDurationTimer = null;
        }
        this.abortController.abort(new Error("Session released"));
        this.manager.release();
    }
    /**
     * Wrap an operation with tracking and abort checking.
     */
    async withOperation(fn) {
        if (!this.isValid) {
            throw new SessionReleasedError();
        }
        this.manager.operationStart();
        try {
            // Check abort before starting
            if (this.abortController.signal.aborted) {
                throw new SessionReleasedError(this.abortController.signal.reason?.message || "Session aborted");
            }
            return await fn();
        }
        finally {
            this.manager.operationEnd();
        }
    }
    async embed(text, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
    }
    async embedBatch(texts, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
    }
    async expandQuery(query, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
    }
    async rerank(query, documents, options) {
        return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
    }
}
// Session manager for the default LlamaCpp instance
let defaultSessionManager = null;
/**
 * Get the session manager for the default LlamaCpp instance.
 */
function getSessionManager() {
    const llm = getDefaultLlamaCpp();
    if (!defaultSessionManager || defaultSessionManager.getLlamaCpp() !== llm) {
        defaultSessionManager = new LLMSessionManager(llm);
    }
    return defaultSessionManager;
}
/**
 * Execute a function with a scoped LLM session.
 * The session provides lifecycle guarantees - resources won't be disposed mid-operation.
 *
 * @example
 * ```typescript
 * await withLLMSession(async (session) => {
 *   const expanded = await session.expandQuery(query);
 *   const embeddings = await session.embedBatch(texts);
 *   const reranked = await session.rerank(query, docs);
 *   return reranked;
 * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
 * ```
 */
export async function withLLMSession(fn, options) {
    const manager = getSessionManager();
    const session = new LLMSession(manager, options);
    try {
        return await fn(session);
    }
    finally {
        session.release();
    }
}
/**
 * Execute a function with a scoped LLM session using a specific LlamaCpp instance.
 * Unlike withLLMSession, this does not use the global singleton.
 */
export async function withLLMSessionForLlm(llm, fn, options) {
    const manager = new LLMSessionManager(llm);
    const session = new LLMSession(manager, options);
    try {
        return await fn(session);
    }
    finally {
        session.release();
    }
}
/**
 * Check if idle unload is safe (no active sessions or operations).
 * Used internally by LlamaCpp idle timer.
 */
export function canUnloadLLM() {
    if (!defaultSessionManager)
        return true;
    return defaultSessionManager.canUnload();
}
// =============================================================================
// Singleton for default LlamaCpp instance
// =============================================================================
let defaultLlamaCpp = null;
/**
 * Get the default LlamaCpp instance (creates one if needed)
 */
export function getDefaultLlamaCpp() {
    if (!defaultLlamaCpp) {
        defaultLlamaCpp = new LlamaCpp();
    }
    return defaultLlamaCpp;
}
/**
 * Set a custom default LlamaCpp instance (useful for testing)
 */
export function setDefaultLlamaCpp(llm) {
    defaultLlamaCpp = llm;
}
/**
 * Dispose the default LlamaCpp instance if it exists.
 * Call this before process exit to prevent NAPI crashes.
 */
export async function disposeDefaultLlamaCpp() {
    if (defaultLlamaCpp) {
        await defaultLlamaCpp.dispose();
        defaultLlamaCpp = null;
    }
}