| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
2771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509 |
- /**
- * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
- *
- * Provides embeddings, text generation, and reranking using local GGUF models.
- */
- import {
- getLlama,
- resolveModelFile,
- LlamaChatSession,
- LlamaLogLevel,
- type Llama,
- type LlamaModel,
- type LlamaEmbeddingContext,
- type Token as LlamaToken,
- } from "node-llama-cpp";
- import { homedir } from "os";
- import { join } from "path";
- import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
- // =============================================================================
- // Embedding Formatting Functions
- // =============================================================================
/**
 * Report whether a model URI names a Qwen embedding model.
 *
 * The check is case-insensitive and succeeds when the URI contains "qwen"
 * followed later by "embed", or "embed" followed later by "qwen". Such models
 * are prompted differently from the nomic/embeddinggemma style.
 *
 * @param modelUri Model URI or file name to inspect.
 * @returns `true` when the URI matches either ordering.
 */
export function isQwen3EmbeddingModel(modelUri: string): boolean {
  const qwenThenEmbed = /qwen.*embed/i;
  const embedThenQwen = /embed.*qwen/i;
  if (qwenThenEmbed.test(modelUri)) {
    return true;
  }
  return embedThenQwen.test(modelUri);
}
/**
 * Build the string that gets embedded for a search query.
 *
 * The active model is `modelUri` when given, otherwise the QMD_EMBED_MODEL
 * environment variable, otherwise the built-in default. Qwen embedding models
 * receive the "Instruct / Query" layout; every other model receives the
 * task-prefix layout.
 *
 * @param query Raw query text.
 * @param modelUri Optional model URI overriding the environment/default.
 * @returns The query wrapped in the prompt layout for the active model.
 */
export function formatQueryForEmbedding(query: string, modelUri?: string): string {
  const activeModel = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
  return isQwen3EmbeddingModel(activeModel)
    ? `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`
    : `task: search result | query: ${query}`;
}
/**
 * Build the string that gets embedded for a document.
 *
 * The active model is resolved the same way as for queries (`modelUri`, then
 * QMD_EMBED_MODEL, then the default). Non-Qwen models get a
 * "title: ... | text: ..." layout, with the literal "none" standing in for a
 * missing or empty title. Qwen embedding models get the raw text, preceded by
 * the title on its own line when one is supplied.
 *
 * @param text Document body.
 * @param title Optional document title.
 * @param modelUri Optional model URI overriding the environment/default.
 * @returns The document wrapped in the layout for the active model.
 */
export function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string {
  const activeModel = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
  if (!isQwen3EmbeddingModel(activeModel)) {
    return `title: ${title || "none"} | text: ${text}`;
  }
  // Qwen path: no task prefix, just the (optionally titled) text.
  if (title) {
    return `${title}\n${text}`;
  }
  return text;
}
- // =============================================================================
- // Types
- // =============================================================================
- /**
- * Token with log probability
- *
- * Element type of the optional `logprobs` list on GenerateResult.
- */
- export type TokenLogProb = {
- token: string;
- logprob: number;
- };
- /**
- * Embedding result
- *
- * LlamaCpp.embed fills `embedding` with the context's vector copied into a
- * plain number array and `model` with the configured embed model URI.
- */
- export type EmbeddingResult = {
- embedding: number[];
- model: string;
- };
- /**
- * Generation result with optional logprobs
- *
- * The part of LlamaCpp.generate visible in this file returns `text`, `model`
- * (the generate model URI) and `done: true`; it does not set `logprobs`.
- */
- export type GenerateResult = {
- text: string;
- model: string;
- logprobs?: TokenLogProb[];
- done: boolean;
- };
- /**
- * Rerank result for a single document
- *
- * Per the LLM.rerank contract, a higher `score` means more relevant.
- */
- export type RerankDocumentResult = {
- file: string; // presumably the `file` of the RerankDocument that was scored - confirm in rerank()
- score: number;
- index: number; // presumably the document's position in the input list - confirm in rerank()
- };
- /**
- * Batch rerank result
- *
- * One RerankDocumentResult per scored document plus the model that produced them.
- */
- export type RerankResult = {
- results: RerankDocumentResult[];
- model: string;
- };
- /**
- * Model info
- *
- * Return shape of LLM.modelExists.
- */
- export type ModelInfo = {
- name: string;
- exists: boolean;
- path?: string; // optional; presumably the local path when the model exists - confirm in modelExists()
- };
- /**
- * Options for embedding
- *
- * NOTE(review): LlamaCpp.embed accepts this object but, as written in this
- * file, does not read any of its fields.
- */
- export type EmbedOptions = {
- model?: string;
- isQuery?: boolean;
- title?: string;
- };
- /**
- * Options for text generation
- *
- * LlamaCpp.generate falls back to maxTokens 150 and temperature 0.7 when the
- * corresponding field is omitted.
- */
- export type GenerateOptions = {
- model?: string;
- maxTokens?: number;
- temperature?: number;
- };
- /**
- * Options for reranking
- */
- export type RerankOptions = {
- model?: string;
- };
- /**
- * Options for LLM sessions
- *
- * All fields are optional.
- */
- export type LLMSessionOptions = {
- /** Max session duration in ms (default: 10 minutes) */
- maxDuration?: number;
- /** External abort signal */
- signal?: AbortSignal;
- /** Debug name for logging */
- name?: string;
- };
- /**
- * Session interface for scoped LLM access with lifecycle guarantees
- *
- * Exposes embed, embedBatch, expandQuery and rerank with the same call
- * signatures as the LLM interface, plus two read-only members describing the
- * session's own state.
- */
- export interface ILLMSession {
- embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
- embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;
- expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]>;
- rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
- /** Whether this session is still valid (not released or aborted) */
- readonly isValid: boolean;
- /** Abort signal for this session (aborts on release or maxDuration) */
- readonly signal: AbortSignal;
- }
- /**
- * Supported query types for different search backends
- *
- * NOTE(review): presumably 'lex' = lexical, 'vec' = vector and 'hyde' =
- * hypothetical-document search - confirm against the search code.
- */
- export type QueryType = 'lex' | 'vec' | 'hyde';
- /**
- * A single query and its target backend type
- *
- * Element type of the list returned by expandQuery.
- */
- export type Queryable = {
- type: QueryType;
- text: string;
- };
- /**
- * Document to rerank
- *
- * Input element type for rerank(); `title` is optional.
- */
- export type RerankDocument = {
- file: string;
- text: string;
- title?: string;
- };
- // =============================================================================
- // Model Configuration
- // =============================================================================
- // HuggingFace model URIs for node-llama-cpp
- // Format: hf:<user>/<repo>/<file>
- // Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf); the variable is read once, when this module is evaluated
- const DEFAULT_EMBED_MODEL = process.env.QMD_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
- const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
- // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
- const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
- // Alternative generation models for query expansion (exported, but not used as defaults in this file):
- // LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
- // Use these as base for fine-tuning with configs/sft_lfm2.yaml
- export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
- export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
- export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL; // public alias; already reflects any QMD_EMBED_MODEL override
- export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL; // public alias of the rerank default
- export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL; // public alias of the generation default
- // Local model cache directory: <home>/.cache/qmd/models
- const MODEL_CACHE_DIR = join(homedir(), ".cache", "qmd", "models");
- export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR; // public alias of the cache directory
- export type PullResult = { // one entry per model passed to pullModels
- model: string; // the model URI exactly as requested
- path: string; // local path returned by resolveModelFile
- sizeBytes: number; // size of the file at `path`, or 0 when that path does not exist
- refreshed: boolean; // true when previously cached files were deleted before resolving
- };
/** Repository ("user/repo") and in-repo file path taken from an `hf:` model URI. */
type HfRef = {
  repo: string;
  file: string;
};

/**
 * Split an `hf:<user>/<repo>/<file>` URI into its repository and file path.
 *
 * The first two slash-separated segments form the repository; everything
 * after them (including further slashes) is the file path.
 *
 * @param model Model URI to parse.
 * @returns The parsed reference, or `null` when the URI lacks the `hf:` prefix
 *          or has fewer than three segments.
 */
function parseHfUri(model: string): HfRef | null {
  const scheme = "hf:";
  if (!model.startsWith(scheme)) {
    return null;
  }
  const [user, repoName, ...fileSegments] = model.slice(scheme.length).split("/");
  if (fileSegments.length === 0) {
    return null;
  }
  return { repo: `${user}/${repoName}`, file: fileSegments.join("/") };
}
/**
 * Ask HuggingFace for the ETag of a model file without downloading it.
 *
 * Sends a HEAD request to the file's `resolve/main` URL.
 *
 * @param ref Repository and file path of the model.
 * @returns The `etag` response header, or `null` when the response is not OK,
 *          the header is missing or empty, or the request throws.
 */
async function getRemoteEtag(ref: HfRef): Promise<string | null> {
  try {
    const response = await fetch(`https://huggingface.co/${ref.repo}/resolve/main/${ref.file}`, {
      method: "HEAD",
    });
    if (response.ok) {
      return response.headers.get("etag") || null;
    }
    return null;
  } catch {
    // Best effort: any network failure is reported as "no ETag available".
    return null;
  }
}
/**
 * Make sure each model is present in the local cache, replacing stale copies.
 *
 * For `hf:` models the remote ETag is compared with a `<filename>.etag` sidecar
 * stored next to the model. Cached files are deleted (and then re-resolved)
 * when `options.refresh` is set, when no remote ETag could be obtained, when
 * the ETags differ, or when nothing is cached. For other models, cached files
 * are deleted only when `options.refresh` is set. Every model is then resolved
 * through `resolveModelFile`, and for `hf:` models the current remote ETag is
 * written back to the sidecar.
 *
 * @param models Model URIs to pull.
 * @param options.refresh Force deletion of cached copies before resolving.
 * @param options.cacheDir Cache directory; defaults to the module's cache dir.
 * @returns One PullResult per requested model, in input order.
 */
export async function pullModels(
  models: string[],
  options: { refresh?: boolean; cacheDir?: string } = {}
): Promise<PullResult[]> {
  const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
  if (!existsSync(cacheDir)) {
    mkdirSync(cacheDir, { recursive: true });
  }

  const results: PullResult[] = [];
  for (const model of models) {
    let refreshed = false;
    const hfRef = parseHfUri(model);
    const filename = model.split("/").pop();
    const entries = readdirSync(cacheDir, { withFileTypes: true });
    const cached = filename
      ? entries
          .filter(
            (entry) =>
              entry.isFile() &&
              entry.name.includes(filename) &&
              // The "<filename>.etag" sidecar also contains the filename but is
              // not a model file; counting it made an empty cache look populated
              // and made `refreshed` report true when no model was replaced.
              !entry.name.endsWith(".etag")
          )
          .map((entry) => join(cacheDir, entry.name))
      : [];

    if (hfRef && filename) {
      const etagPath = join(cacheDir, `${filename}.etag`);
      const remoteEtag = await getRemoteEtag(hfRef);
      const localEtag = existsSync(etagPath)
        ? readFileSync(etagPath, "utf-8").trim()
        : null;
      // NOTE(review): a failed HEAD request (remoteEtag === null) also triggers
      // deletion, so pulling while offline removes the cached copy before the
      // download is attempted. Kept as-is; confirm this is the intended policy.
      const shouldRefresh =
        options.refresh || !remoteEtag || remoteEtag !== localEtag || cached.length === 0;
      if (shouldRefresh) {
        for (const candidate of cached) {
          if (existsSync(candidate)) unlinkSync(candidate);
        }
        // The sidecar is removed explicitly now that it is no longer in `cached`.
        if (existsSync(etagPath)) unlinkSync(etagPath);
        refreshed = cached.length > 0;
      }
    } else if (options.refresh && filename) {
      for (const candidate of cached) {
        if (existsSync(candidate)) unlinkSync(candidate);
        refreshed = true;
      }
    }

    const path = await resolveModelFile(model, cacheDir);
    const sizeBytes = existsSync(path) ? statSync(path).size : 0;

    if (hfRef && filename) {
      // Record the ETag as seen after the download, for the next comparison.
      const remoteEtag = await getRemoteEtag(hfRef);
      if (remoteEtag) {
        const etagPath = join(cacheDir, `${filename}.etag`);
        writeFileSync(etagPath, remoteEtag + "\n", "utf-8");
      }
    }

    results.push({ model, path, sizeBytes, refreshed });
  }
  return results;
}
- // =============================================================================
- // LLM Interface
- // =============================================================================
- /**
- * Abstract LLM interface - implement this for different backends
- *
- * LlamaCpp in this file is one implementation. Every method is asynchronous.
- */
- export interface LLM {
- /**
- * Get embeddings for text
- *
- * Resolves to null when no embedding could be produced (LlamaCpp.embed logs
- * the error and returns null instead of throwing).
- */
- embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
- /**
- * Generate text completion
- *
- * The declared result may be null; see GenerateOptions for the tunable fields.
- */
- generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
- /**
- * Check if a model exists/is available
- *
- * Resolves to a ModelInfo rather than a bare boolean.
- */
- modelExists(model: string): Promise<ModelInfo>;
- /**
- * Expand a search query into multiple variations for different backends.
- * Returns a list of Queryable objects.
- * Each Queryable carries the backend type ('lex' | 'vec' | 'hyde') and the query text.
- */
- expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise<Queryable[]>;
- /**
- * Rerank documents by relevance to a query
- * Returns list of documents with relevance scores (higher = more relevant)
- * The scores are wrapped in a RerankResult together with the model used.
- */
- rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
- /**
- * Dispose of resources
- *
- * Asynchronous; await it before treating the resources as released.
- */
- dispose(): Promise<void>;
- }
- // =============================================================================
- // node-llama-cpp Implementation
- // =============================================================================
- export type LlamaCppConfig = { // every field is optional; the LlamaCpp constructor supplies the defaults
- embedModel?: string; // embedding model URI; falls back to the module default when unset or empty
- generateModel?: string; // generation model URI; falls back to the module default when unset or empty
- rerankModel?: string; // rerank model URI; falls back to the module default when unset or empty
- modelCacheDir?: string; // where model files are resolved to; falls back to the module cache dir
- /**
- * Context size used for query expansion generation contexts.
- * Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
- * Must be a positive integer; the constructor throws otherwise.
- */
- expandContextSize?: number;
- /**
- * Inactivity timeout in ms before unloading contexts (default: 5 minutes, 0 to disable).
- *
- * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
- * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
- * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
- */
- inactivityTimeoutMs?: number;
- /**
- * Whether to dispose models on inactivity (default: false).
- *
- * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
- * memory reclaim.
- */
- disposeModelsOnInactivity?: boolean;
- };
- /**
- * LLM implementation using node-llama-cpp
- */
// Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;

/**
 * Decide the context size used for query-expansion generation.
 *
 * Precedence: explicit config value, then the QMD_EXPAND_CONTEXT_SIZE
 * environment variable, then the built-in default (2048).
 *
 * @param configValue Value from LlamaCppConfig, if any.
 * @returns A positive integer context size.
 * @throws Error when `configValue` is given but is not a positive integer.
 *         An invalid environment value never throws; it writes a warning to
 *         stderr and falls back to the default.
 */
function resolveExpandContextSize(configValue?: number): number {
  if (configValue !== undefined) {
    if (!Number.isInteger(configValue) || configValue <= 0) {
      throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
    }
    return configValue;
  }

  const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
  if (!envValue) return DEFAULT_EXPAND_CONTEXT_SIZE;

  // parseInt stops at the first non-digit ("1e3" -> 1, "512abc" -> 512), so a
  // malformed value used to be accepted silently. Require a plain decimal
  // integer (optional leading "+") before parsing.
  const parsed = /^\+?\d+$/.test(envValue) ? Number.parseInt(envValue, 10) : Number.NaN;
  if (!Number.isSafeInteger(parsed) || parsed <= 0) {
    process.stderr.write(
      `QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`
    );
    return DEFAULT_EXPAND_CONTEXT_SIZE;
  }
  return parsed;
}
- export class LlamaCpp implements LLM {
- private readonly _ciMode = !!process.env.CI; // true when the CI env var is non-empty; embedBatch and generate throw in that mode
- private llama: Llama | null = null; // created lazily by ensureLlama
- private embedModel: LlamaModel | null = null; // loaded lazily by ensureEmbedModel
- private embedContexts: LlamaEmbeddingContext[] = []; // pool filled by ensureEmbedContexts
- private generateModel: LlamaModel | null = null; // loaded lazily by ensureGenerateModel
- private rerankModel: LlamaModel | null = null; // loaded lazily by ensureRerankModel
- private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = []; // pool filled by ensureRerankContexts
- private embedModelUri: string; // the four URI/dir fields below are set in the constructor
- private generateModelUri: string;
- private rerankModelUri: string;
- private modelCacheDir: string;
- private expandContextSize: number; // validated by resolveExpandContextSize
- // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
- private embedModelLoadPromise: Promise<LlamaModel> | null = null; // non-null only while a load is in flight
- private generateModelLoadPromise: Promise<LlamaModel> | null = null;
- private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
- // Inactivity timer for auto-unloading idle resources (contexts; models too only when opted in)
- private inactivityTimer: ReturnType<typeof setTimeout> | null = null; // re-armed by touchActivity
- private inactivityTimeoutMs: number; // 0 disables the timer
- private disposeModelsOnInactivity: boolean;
- // Track disposal state to prevent double-dispose
- private disposed = false; // unloadIdleResources is a no-op once this is true
/**
 * Capture configuration; no model or llama instance is loaded here.
 *
 * Empty or missing model URIs and cache dir fall back to the module defaults.
 * The inactivity timeout defaults to DEFAULT_INACTIVITY_TIMEOUT_MS and model
 * disposal on inactivity defaults to off.
 *
 * @throws Error when `config.expandContextSize` is not a positive integer.
 */
constructor(config: LlamaCppConfig = {}) {
  const {
    embedModel,
    generateModel,
    rerankModel,
    modelCacheDir,
    expandContextSize,
    inactivityTimeoutMs,
    disposeModelsOnInactivity,
  } = config;

  this.embedModelUri = embedModel || DEFAULT_EMBED_MODEL;
  this.generateModelUri = generateModel || DEFAULT_GENERATE_MODEL;
  this.rerankModelUri = rerankModel || DEFAULT_RERANK_MODEL;
  this.modelCacheDir = modelCacheDir || MODEL_CACHE_DIR;
  this.expandContextSize = resolveExpandContextSize(expandContextSize);
  this.inactivityTimeoutMs = inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
  this.disposeModelsOnInactivity = disposeModelsOnInactivity ?? false;
}
/**
 * Restart the inactivity countdown. Called around model operations.
 *
 * Any pending timer is cancelled. A new one is armed only when the timeout is
 * enabled (> 0) and at least one context is loaded. When it fires, idle
 * resources are unloaded unless `canUnloadLLM` exists and reports that
 * unloading is not allowed, in which case the countdown simply starts again.
 */
private touchActivity(): void {
  if (this.inactivityTimer) {
    clearTimeout(this.inactivityTimer);
    this.inactivityTimer = null;
  }

  const timerWanted = this.inactivityTimeoutMs > 0 && this.hasLoadedContexts();
  if (!timerWanted) {
    return;
  }

  const onIdle = (): void => {
    // canUnloadLLM is declared later in this file; the typeof guard tolerates
    // it being unavailable at the time the timer fires.
    const unloadBlocked = typeof canUnloadLLM === 'function' && !canUnloadLLM();
    if (unloadBlocked) {
      this.touchActivity();
      return;
    }
    this.unloadIdleResources().catch(err => {
      console.error("Error unloading idle resources:", err);
    });
  };

  const timer = setTimeout(onIdle, this.inactivityTimeoutMs);
  // The timer alone must not keep the process alive.
  timer.unref();
  this.inactivityTimer = timer;
}
/**
 * Tell whether any embedding or rerank context is currently held, i.e.
 * whether there is anything for the inactivity timer to unload.
 */
private hasLoadedContexts(): boolean {
  const loadedCount = this.embedContexts.length + this.rerankContexts.length;
  return loadedCount > 0;
}
/**
 * Unload idle resources but keep the instance usable.
 *
 * Always disposes the embedding and rerank contexts. Models are disposed only
 * when `disposeModelsOnInactivity` is set. The llama instance itself is kept.
 * Does nothing once the instance has been disposed.
 *
 * Objects are detached from `this` before their `dispose()` is awaited, so a
 * concurrent `ensure*` call during the await creates fresh objects instead of
 * receiving one that is mid-disposal. A `dispose()` that throws also cannot
 * leave already-disposed entries behind in the pools.
 *
 * @throws Whatever a context or model `dispose()` rejects with.
 */
async unloadIdleResources(): Promise<void> {
  if (this.disposed) {
    return;
  }

  if (this.inactivityTimer) {
    clearTimeout(this.inactivityTimer);
    this.inactivityTimer = null;
  }

  // Contexts first (model -> context -> sequence lifecycle).
  const idleContexts = [...this.embedContexts, ...this.rerankContexts];
  this.embedContexts = [];
  this.rerankContexts = [];
  for (const ctx of idleContexts) {
    await ctx.dispose();
  }

  // Models only on explicit opt-in.
  if (this.disposeModelsOnInactivity) {
    const idleModels = [this.embedModel, this.generateModel, this.rerankModel];
    this.embedModel = null;
    this.generateModel = null;
    this.rerankModel = null;
    // Reset load promises so models can be reloaded later.
    this.embedModelLoadPromise = null;
    this.generateModelLoadPromise = null;
    this.rerankModelLoadPromise = null;
    for (const model of idleModels) {
      if (model) {
        await model.dispose();
      }
    }
  }
}
/**
 * Create the model cache directory (and any missing parents) when absent.
 */
private ensureModelCacheDir(): void {
  const dir = this.modelCacheDir;
  if (existsSync(dir)) {
    return;
  }
  mkdirSync(dir, { recursive: true });
}
/**
 * Initialize the llama instance on first use and reuse it afterwards.
 *
 * Concurrent first calls share one in-flight initialization, matching the
 * guards used by the model loaders in this class. Previously each concurrent
 * caller ran `getLlama` on its own and printed the CPU warning again.
 *
 * A warning is written to stderr when the instance reports no GPU.
 *
 * @returns The shared llama instance.
 * @throws Whatever `getLlama` rejects with; the next call retries.
 */
private llamaInitPromise: Promise<Llama> | null = null;
private async ensureLlama(): Promise<Llama> {
  if (this.llama) {
    return this.llama;
  }
  if (this.llamaInitPromise) {
    return await this.llamaInitPromise;
  }

  const pending = (async () => {
    const llama = await getLlama({
      // attempt to build
      build: "autoAttempt",
      logLevel: LlamaLogLevel.error
    });
    if (llama.gpu === false) {
      process.stderr.write(
        "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
      );
    }
    this.llama = llama;
    return llama;
  })();
  this.llamaInitPromise = pending;

  try {
    return await pending;
  } finally {
    // Clear only our own in-flight marker; the instance stays cached in `this.llama`.
    if (this.llamaInitPromise === pending) {
      this.llamaInitPromise = null;
    }
  }
}
/**
 * Turn a model URI into a local file path inside the cache directory.
 *
 * The cache directory is created first when missing. The lookup itself is
 * delegated to `resolveModelFile`, which handles HF URIs and downloads.
 *
 * @param modelUri Model URI to resolve.
 * @returns Local path of the model file.
 */
private async resolveModel(modelUri: string): Promise<string> {
  this.ensureModelCacheDir();
  const localPath = await resolveModelFile(modelUri, this.modelCacheDir);
  return localPath;
}
/**
 * Return the embedding model, loading it on first use.
 *
 * A call that arrives while a load is already in flight awaits that same load
 * instead of starting a second one. A finished load counts as activity.
 */
private async ensureEmbedModel(): Promise<LlamaModel> {
  if (this.embedModel) {
    return this.embedModel;
  }
  const inFlight = this.embedModelLoadPromise;
  if (inFlight) {
    return await inFlight;
  }

  const loadEmbedModel = async (): Promise<LlamaModel> => {
    const llama = await this.ensureLlama();
    const modelPath = await this.resolveModel(this.embedModelUri);
    const loaded = await llama.loadModel({ modelPath });
    this.embedModel = loaded;
    // Loading counts as activity, so restart the idle countdown.
    this.touchActivity();
    return loaded;
  };

  const pending = loadEmbedModel();
  this.embedModelLoadPromise = pending;
  try {
    return await pending;
  } finally {
    // The model itself stays cached; only the in-flight marker is dropped.
    this.embedModelLoadPromise = null;
  }
}
/**
 * Decide how many parallel contexts to create.
 *
 * GPU: as many contexts as fit in 25% of the free VRAM at `perContextMB`
 * each, clamped to 1..8. Falls back to 2 when the VRAM state cannot be read.
 * CPU: one context per 4 math cores (4 cores assumed when the count is
 * unavailable), clamped to 1..4.
 *
 * @param perContextMB Estimated memory footprint of one context, in MB.
 * @returns Number of contexts to create.
 */
private async computeParallelism(perContextMB: number): Promise<number> {
  const llama = await this.ensureLlama();

  if (!llama.gpu) {
    const mathCores = llama.cpuMathCores || 4;
    const byCores = Math.floor(mathCores / 4);
    return Math.max(1, Math.min(4, byCores));
  }

  try {
    const { free } = await llama.getVramState();
    const freeMB = free / (1024 * 1024);
    const byVram = Math.floor((freeMB * 0.25) / perContextMB);
    return Math.max(1, Math.min(8, byVram));
  } catch {
    return 2;
  }
}
/**
 * Work out the thread count for each of `parallelism` contexts.
 *
 * @param parallelism Number of contexts that will run side by side.
 * @returns 0 on GPU (the library picks). On CPU, the math cores divided
 *          evenly across the contexts, never less than 1 (4 cores assumed
 *          when the count is unavailable).
 */
private async threadsPerContext(parallelism: number): Promise<number> {
  const llama = await this.ensureLlama();
  if (llama.gpu) {
    return 0;
  }
  const mathCores = llama.cpuMathCores || 4;
  const evenShare = Math.floor(mathCores / parallelism);
  return Math.max(1, evenShare);
}
/**
 * Return the pool of embedding contexts, creating it on first use.
 *
 * The pool size comes from computeParallelism (150 MB assumed per context).
 * If creating a context fails after at least one succeeded, the pool is left
 * at its current size. If the very first one fails, an error is thrown.
 * Concurrent callers share one in-flight creation through
 * `embedContextsCreatePromise`.
 */
private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;
private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
  if (this.embedContexts.length > 0) {
    this.touchActivity();
    return this.embedContexts;
  }
  const inFlight = this.embedContextsCreatePromise;
  if (inFlight) {
    return await inFlight;
  }

  const buildPool = async (): Promise<LlamaEmbeddingContext[]> => {
    const model = await this.ensureEmbedModel();
    // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
    const poolSize = await this.computeParallelism(150);
    const threads = await this.threadsPerContext(poolSize);
    let created = 0;
    while (created < poolSize) {
      try {
        this.embedContexts.push(await model.createEmbeddingContext({
          ...(threads > 0 ? { threads } : {}),
        }));
      } catch {
        if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
        break;
      }
      created += 1;
    }
    this.touchActivity();
    return this.embedContexts;
  };

  const pending = buildPool();
  this.embedContextsCreatePromise = pending;
  try {
    return await pending;
  } finally {
    this.embedContextsCreatePromise = null;
  }
}
/**
 * Return one embedding context for single-text calls: the first in the pool.
 */
private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
  const [first] = await this.ensureEmbedContexts();
  return first!;
}
/**
 * Return the generation model, loading it on first use. Contexts are not
 * cached here; callers create one per generation call.
 *
 * A call that arrives while a load is in flight returns that load's result
 * directly. Otherwise the idle countdown is restarted before returning.
 *
 * @throws Error("Generate model not loaded") if the model is still missing
 *         after the load completed.
 */
private async ensureGenerateModel(): Promise<LlamaModel> {
  if (!this.generateModel) {
    const inFlight = this.generateModelLoadPromise;
    if (inFlight) {
      return await inFlight;
    }

    const pending = (async () => {
      const llama = await this.ensureLlama();
      const modelPath = await this.resolveModel(this.generateModelUri);
      const loaded = await llama.loadModel({ modelPath });
      this.generateModel = loaded;
      return loaded;
    })();
    this.generateModelLoadPromise = pending;
    try {
      await pending;
    } finally {
      this.generateModelLoadPromise = null;
    }
  }

  this.touchActivity();
  const model = this.generateModel;
  if (!model) {
    throw new Error("Generate model not loaded");
  }
  return model;
}
/**
 * Return the rerank model, loading it on first use.
 *
 * A call that arrives while a load is already in flight awaits that same load
 * instead of starting a second one. A finished load counts as activity.
 */
private async ensureRerankModel(): Promise<LlamaModel> {
  if (this.rerankModel) {
    return this.rerankModel;
  }
  const inFlight = this.rerankModelLoadPromise;
  if (inFlight) {
    return await inFlight;
  }

  const loadRerankModel = async (): Promise<LlamaModel> => {
    const llama = await this.ensureLlama();
    const modelPath = await this.resolveModel(this.rerankModelUri);
    const loaded = await llama.loadModel({ modelPath });
    this.rerankModel = loaded;
    // Loading counts as activity, so restart the idle countdown.
    this.touchActivity();
    return loaded;
  };

  const pending = loadRerankModel();
  this.rerankModelLoadPromise = pending;
  try {
    return await pending;
  } finally {
    this.rerankModelLoadPromise = null;
  }
}
/**
 * Return the pool of rerank contexts, creating it on first use.
 *
 * Pool size: computeParallelism(1000) capped at 4. Each context is created
 * with contextSize RERANK_CONTEXT_SIZE and flash attention. If the very first
 * creation fails, it is retried once without flash attention (which may be
 * unsupported), and the pool then stays at that single context. If a later
 * creation fails, the pool is left at its current size.
 *
 * Concurrent callers share one in-flight creation, mirroring
 * ensureEmbedContexts. Without this guard, two overlapping first calls each
 * saw an empty pool and each allocated a full set of contexts.
 *
 * Sizing notes carried over from the original author's comments: the Qwen3
 * reranker template adds ~200 tokens of overhead and chunks are at most ~800
 * tokens, so a typical input is ~1100 tokens; 2048 leaves a safety margin
 * while being far smaller than the automatic size (40960). A context is
 * estimated at ~960 MB with flash attention at this size.
 *
 * @throws Error("Failed to create any rerank context") when no context could
 *         be created even without flash attention.
 */
private static readonly RERANK_CONTEXT_SIZE = 2048;
private rerankContextsCreatePromise: Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> | null = null;
private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
  if (this.rerankContexts.length > 0) {
    this.touchActivity();
    return this.rerankContexts;
  }
  if (this.rerankContextsCreatePromise) {
    return await this.rerankContextsCreatePromise;
  }

  const pending = (async () => {
    const model = await this.ensureRerankModel();
    // ~960 MB per context with flash attention at contextSize 2048
    const n = Math.min(await this.computeParallelism(1000), 4);
    const threads = await this.threadsPerContext(n);
    for (let i = 0; i < n; i++) {
      try {
        this.rerankContexts.push(await model.createRankingContext({
          contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
          flashAttention: true,
          ...(threads > 0 ? { threads } : {}),
        } as any));
      } catch {
        if (this.rerankContexts.length === 0) {
          // Flash attention might not be supported — retry without it
          try {
            this.rerankContexts.push(await model.createRankingContext({
              contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
              ...(threads > 0 ? { threads } : {}),
            }));
          } catch {
            throw new Error("Failed to create any rerank context");
          }
        }
        break;
      }
    }
    this.touchActivity();
    return this.rerankContexts;
  })();
  this.rerankContextsCreatePromise = pending;

  try {
    return await pending;
  } finally {
    // Clear only our own in-flight marker.
    if (this.rerankContextsCreatePromise === pending) {
      this.rerankContextsCreatePromise = null;
    }
  }
}
- // ==========================================================================
- // Tokenization
- // ==========================================================================
/**
 * Tokenize text with the embedding model's tokenizer.
 *
 * Ensures an embedding context exists first, which also loads the model.
 *
 * @returns The tokenizer's tokens (an opaque type from node-llama-cpp).
 * @throws Error("Embed model not loaded") if the model is still missing.
 */
async tokenize(text: string): Promise<readonly LlamaToken[]> {
  await this.ensureEmbedContext();
  const model = this.embedModel;
  if (model) {
    return model.tokenize(text);
  }
  throw new Error("Embed model not loaded");
}
/**
 * Count how many tokens the embedding model's tokenizer produces for `text`.
 */
async countTokens(text: string): Promise<number> {
  return (await this.tokenize(text)).length;
}
/**
 * Convert tokens back to text with the embedding model's tokenizer.
 *
 * Ensures an embedding context exists first, which also loads the model.
 *
 * @throws Error("Embed model not loaded") if the model is still missing.
 */
async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
  await this.ensureEmbedContext();
  const model = this.embedModel;
  if (model) {
    return model.detokenize(tokens);
  }
  throw new Error("Embed model not loaded");
}
- // ==========================================================================
- // Core API methods
- // ==========================================================================
/**
 * Embed one text with the first context of the embedding pool.
 *
 * The text is embedded exactly as given; `options` is accepted for interface
 * compatibility but is not read here.
 *
 * @returns The embedding and the embed model URI, or `null` when anything
 *          fails (the error is logged, not thrown).
 */
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
  // Mark activity up front so the idle timer does not fire mid-operation.
  this.touchActivity();
  try {
    const ctx = await this.ensureEmbedContext();
    const produced = await ctx.getEmbeddingFor(text);
    const result: EmbeddingResult = {
      embedding: Array.from(produced.vector),
      model: this.embedModelUri,
    };
    return result;
  } catch (error) {
    console.error("Embedding error:", error);
    return null;
  }
}
/**
 * Embed many texts, spreading them across the embedding context pool.
 *
 * The texts are cut into one contiguous slice per context. Each context works
 * through its slice sequentially while the contexts run in parallel, and the
 * slices are joined back in input order. With a single context this amounts
 * to a plain sequential pass.
 *
 * @returns One entry per input text, in order. An entry is `null` when that
 *          text failed (the error is logged). If the pool cannot be obtained,
 *          every entry is `null`.
 * @throws Error when running in CI mode.
 */
async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
  if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
  // Mark activity up front so the idle timer does not fire mid-operation.
  this.touchActivity();
  if (texts.length === 0) return [];

  // Embed a slice on one context, one text at a time, tolerating per-text failures.
  const embedSlice = async (
    ctx: LlamaEmbeddingContext,
    slice: string[]
  ): Promise<(EmbeddingResult | null)[]> => {
    const out: (EmbeddingResult | null)[] = [];
    for (const item of slice) {
      try {
        const produced = await ctx.getEmbeddingFor(item);
        this.touchActivity();
        out.push({ embedding: Array.from(produced.vector), model: this.embedModelUri });
      } catch (err) {
        console.error("Embedding error for text:", err);
        out.push(null);
      }
    }
    return out;
  };

  try {
    const contexts = await this.ensureEmbedContexts();
    const perContext = Math.ceil(texts.length / contexts.length);
    const jobs = contexts.map((ctx, i) =>
      embedSlice(ctx, texts.slice(i * perContext, (i + 1) * perContext))
    );
    const sliceResults = await Promise.all(jobs);
    return sliceResults.flat();
  } catch (error) {
    console.error("Batch embedding error:", error);
    return texts.map(() => null);
  }
}
/**
 * Generate a completion for `prompt` using the generation model.
 *
 * A fresh context, sequence and chat session are created for every call and
 * the context is disposed afterwards, including when session setup or
 * prompting fails.
 *
 * @param prompt - Prompt text sent to the chat session.
 * @param options - `maxTokens` (default 150) and `temperature` (default 0.7).
 * @returns The generated text, the generate model URI and `done: true`.
 * @throws Error when CI mode is enabled, when the generate model is not
 *   loaded after `ensureGenerateModel()`, or when prompting fails.
 */
async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
  if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
  // Ping activity at start to keep models alive during this operation
  this.touchActivity();
  // Ensure model is loaded
  await this.ensureGenerateModel();
  const model = this.generateModel;
  if (!model) {
    throw new Error("Generate model not loaded");
  }
  const maxTokens = options.maxTokens ?? 150;
  // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
  // DO NOT use greedy decoding (temp=0) - causes repetition loops
  const temperature = options.temperature ?? 0.7;
  // Create fresh context -> sequence -> session for each call
  const context = await model.createContext();
  try {
    // Sequence and session setup happen inside the try so the context is
    // still disposed if either of them throws.
    const sequence = context.getSequence();
    const session = new LlamaChatSession({ contextSequence: sequence });
    let result = "";
    await session.prompt(prompt, {
      maxTokens,
      temperature,
      topK: 20,
      topP: 0.8,
      onTextChunk: (text: string) => {
        result += text;
      },
    });
    return {
      text: result,
      model: this.generateModelUri,
      done: true,
    };
  } finally {
    // Dispose context (which disposes dependent sequences/sessions per lifecycle rules)
    await context.dispose();
  }
}
/**
 * Report whether a model reference can be resolved.
 *
 * URIs starting with `hf:` are assumed to exist without any check. Anything
 * else is treated as a local path and checked on disk.
 *
 * @param modelUri - `hf:` URI or local file path.
 * @returns The model name, an `exists` flag and, for a local file that
 *   exists, its `path`.
 */
async modelExists(modelUri: string): Promise<ModelInfo> {
  const isHuggingFaceUri = modelUri.startsWith("hf:");
  if (!isHuggingFaceUri) {
    const found = existsSync(modelUri);
    const path = found ? modelUri : undefined;
    return { name: modelUri, exists: found, path };
  }
  return { name: modelUri, exists: true };
}
- // ==========================================================================
- // High-level abstractions
- // ==========================================================================
/**
 * Expand a search query into typed sub-queries (`lex`, `vec`, `hyde`).
 *
 * The generation model is prompted under a grammar that forces
 * `type: content` lines. Lines whose content shares no term with the original
 * query are discarded. If nothing usable remains, or if prompting fails, a
 * fallback built from the original query is returned instead of throwing.
 *
 * @param query - The user's search query.
 * @param options - `includeLexical` (default true) controls whether `lex`
 *   entries are returned; `intent` is appended to the prompt when set.
 *   `context` is accepted but not used by this method.
 * @returns The expanded queries, or a fallback derived from `query`.
 * @throws Error when CI mode is enabled, when the generate model is not
 *   loaded, or when grammar/context/session setup fails.
 */
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
  if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
  // Ping activity at start to keep models alive during this operation
  this.touchActivity();
  const llama = await this.ensureLlama();
  await this.ensureGenerateModel();
  const model = this.generateModel;
  if (!model) {
    throw new Error("Generate model not loaded");
  }
  const includeLexical = options.includeLexical ?? true;
  // Drop lex entries if not requested
  const applyLexicalFilter = (items: Queryable[]): Queryable[] =>
    includeLexical ? items : items.filter(q => q.type !== 'lex');
  const grammar = await llama.createGrammar({
    grammar: `
      root ::= line+
      line ::= type ": " content "\\n"
      type ::= "lex" | "vec" | "hyde"
      content ::= [^\\n]+
    `
  });
  const intent = options.intent;
  const prompt = intent
    ? `/no_think Expand this search query: ${query}\nQuery intent: ${intent}`
    : `/no_think Expand this search query: ${query}`;
  // Create a bounded context for expansion to prevent large default VRAM allocations.
  const genContext = await model.createContext({
    contextSize: this.expandContextSize,
  });
  try {
    // Session setup sits inside the outer try so the context is disposed even
    // if it throws; such setup errors still propagate to the caller.
    const sequence = genContext.getSequence();
    const session = new LlamaChatSession({ contextSequence: sequence });
    try {
      // Qwen3 recommended settings for non-thinking mode:
      // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
      // DO NOT use greedy decoding (temp=0) - causes infinite loops
      const result = await session.prompt(prompt, {
        grammar,
        maxTokens: 600,
        temperature: 0.7,
        topK: 20,
        topP: 0.8,
        repeatPenalty: {
          lastTokens: 64,
          presencePenalty: 0.5,
        },
      });
      const queryTerms = query
        .toLowerCase()
        .replace(/[^a-z0-9\s]/g, " ")
        .split(/\s+/)
        .filter(Boolean);
      // A line is kept only if it mentions at least one query term; when the
      // query yields no terms at all, every line is kept.
      const hasQueryTerm = (text: string): boolean => {
        if (queryTerms.length === 0) return true;
        const lower = text.toLowerCase();
        return queryTerms.some(term => lower.includes(term));
      };
      const queryables: Queryable[] = [];
      for (const line of result.trim().split("\n")) {
        const colonIdx = line.indexOf(":");
        if (colonIdx === -1) continue;
        const type = line.slice(0, colonIdx).trim();
        if (type !== 'lex' && type !== 'vec' && type !== 'hyde') continue;
        const text = line.slice(colonIdx + 1).trim();
        if (!hasQueryTerm(text)) continue;
        queryables.push({ type: type as QueryType, text });
      }
      const filtered = applyLexicalFilter(queryables);
      if (filtered.length > 0) return filtered;
      return applyLexicalFilter([
        { type: 'hyde', text: `Information about ${query}` },
        { type: 'lex', text: query },
        { type: 'vec', text: query },
      ]);
    } catch (error) {
      console.error("Structured query expansion failed:", error);
      // Fallback to original query
      const fallback: Queryable[] = [{ type: 'vec', text: query }];
      if (includeLexical) fallback.unshift({ type: 'lex', text: query });
      return fallback;
    }
  } finally {
    await genContext.dispose();
  }
}
// Qwen3 reranker chat template overhead (system prompt, tags, separators)
private static readonly RERANK_TEMPLATE_OVERHEAD = 200;
private static readonly RERANK_TARGET_DOCS_PER_CONTEXT = 10;
/**
 * Score `documents` against `query` with the rerank model.
 *
 * Steps: truncate each document to the token budget left after the query and
 * template overhead, deduplicate identical texts so each is scored once,
 * spread the unique texts over the rerank contexts for concurrent scoring,
 * then fan the scores back out to every document that shared a text.
 *
 * @param query - Query to rank against.
 * @param documents - Candidate documents; `index` in each result refers to
 *   the position in this array.
 * @param options - Accepted for interface compatibility; not read by this method.
 * @returns Results ordered by descending score, plus the rerank model URI.
 *   An empty `documents` array returns an empty result without loading models.
 * @throws Error when CI mode is enabled or no rerank context is available.
 */
async rerank(
  query: string,
  documents: RerankDocument[],
  options: RerankOptions = {}
): Promise<RerankResult> {
  if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
  // Ping activity at start to keep models alive during this operation
  this.touchActivity();
  // Nothing to score: skip model/context loading entirely.
  if (documents.length === 0) {
    return { results: [], model: this.rerankModelUri };
  }
  const contexts = await this.ensureRerankContexts();
  const model = await this.ensureRerankModel();
  if (contexts.length === 0) {
    // Without a context no scores would be produced and every result would
    // carry an undefined score.
    throw new Error("No rerank contexts available");
  }
  // Truncate documents that would exceed the rerank context size.
  // Budget = contextSize - template overhead - query tokens
  const queryTokens = model.tokenize(query).length;
  // Clamp to at least one token: a negative budget would turn
  // `slice(0, budget)` into "drop the last N tokens" instead of truncating.
  const maxDocTokens = Math.max(
    1,
    LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens
  );
  // Memoize by original text so repeated chunks are tokenized only once.
  const truncationCache = new Map<string, string>();
  const truncatedDocs = documents.map((doc) => {
    let truncatedText = truncationCache.get(doc.text);
    if (truncatedText === undefined) {
      const tokens = model.tokenize(doc.text);
      truncatedText = tokens.length <= maxDocTokens
        ? doc.text
        : model.detokenize(tokens.slice(0, maxDocTokens));
      truncationCache.set(doc.text, truncatedText);
    }
    return truncatedText === doc.text ? doc : { ...doc, text: truncatedText };
  });
  // Deduplicate identical effective texts before scoring.
  // This avoids redundant work for repeated chunks and fixes collisions where
  // multiple docs map to the same chunk text.
  const textToDocs = new Map<string, { file: string; index: number }[]>();
  truncatedDocs.forEach((doc, index) => {
    const existing = textToDocs.get(doc.text);
    if (existing) {
      existing.push({ file: doc.file, index });
    } else {
      textToDocs.set(doc.text, [{ file: doc.file, index }]);
    }
  });
  // Extract just the text for ranking
  const texts = Array.from(textToDocs.keys());
  // Split documents across contexts for parallel evaluation.
  // Each context has its own sequence with a lock, so parallelism comes
  // from multiple contexts evaluating different chunks simultaneously.
  const activeContextCount = Math.max(
    1,
    Math.min(
      contexts.length,
      Math.ceil(texts.length / LlamaCpp.RERANK_TARGET_DOCS_PER_CONTEXT)
    )
  );
  const activeContexts = contexts.slice(0, activeContextCount);
  const chunkSize = Math.ceil(texts.length / activeContexts.length);
  // Empty chunks can only occur at the tail, so after filtering chunk i still
  // lines up with activeContexts[i].
  const chunks = Array.from({ length: activeContexts.length }, (_, i) =>
    texts.slice(i * chunkSize, (i + 1) * chunkSize)
  ).filter(chunk => chunk.length > 0);
  const allScores = await Promise.all(
    chunks.map((chunk, i) => activeContexts[i]!.rankAll(query, chunk))
  );
  // Reassemble scores in original order and sort
  const flatScores = allScores.flat();
  const ranked = texts
    .map((text, i) => ({ document: text, score: flatScores[i]! }))
    .sort((a, b) => b.score - a.score);
  // Map back to our result format.
  const results: RerankDocumentResult[] = [];
  for (const item of ranked) {
    const docInfos = textToDocs.get(item.document) ?? [];
    for (const docInfo of docInfos) {
      results.push({
        file: docInfo.file,
        score: item.score,
        index: docInfo.index,
      });
    }
  }
  return {
    results,
    model: this.rerankModelUri,
  };
}
/**
 * Collect device/GPU details for status display.
 *
 * Obtains the llama instance through `ensureLlama()`. VRAM figures are only
 * queried when `llama.gpu` is truthy, and a failure while querying them is
 * swallowed and leaves `vram` undefined.
 *
 * @returns GPU type, offloading support, GPU device names, optional VRAM
 *   totals and the CPU math core count.
 */
async getDeviceInfo(): Promise<{
  gpu: string | false;
  gpuOffloading: boolean;
  gpuDevices: string[];
  vram?: { total: number; used: number; free: number };
  cpuCores: number;
}> {
  const llama = await this.ensureLlama();
  const gpuDevices = await llama.getGpuDeviceNames();

  // Best-effort VRAM lookup: any error simply means "no vram info".
  const readVram = async (): Promise<{ total: number; used: number; free: number } | undefined> => {
    try {
      const { total, used, free } = await llama.getVramState();
      return { total, used, free };
    } catch {
      return undefined;
    }
  };
  const vram = llama.gpu ? await readVram() : undefined;

  return {
    gpu: llama.gpu,
    gpuOffloading: llama.supportsGpuOffloading,
    gpuDevices,
    vram,
    cpuCores: llama.cpuMathCores,
  };
}
/**
 * Release the llama instance and drop every cached model/context reference.
 *
 * Safe to call more than once; later calls return immediately. Waiting for
 * `llama.dispose()` is capped at one second. References are cleared even if
 * that call rejects, in which case the rejection is rethrown afterwards.
 */
async dispose(): Promise<void> {
  // Prevent double-dispose
  if (this.disposed) {
    return;
  }
  this.disposed = true;
  // Clear inactivity timer
  if (this.inactivityTimer) {
    clearTimeout(this.inactivityTimer);
    this.inactivityTimer = null;
  }
  try {
    // Disposing llama cascades to models and contexts automatically
    // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
    // Note: llama.dispose() can hang indefinitely, so we use a timeout
    if (this.llama) {
      let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
      const timeoutPromise = new Promise<void>((resolve) => {
        timeoutHandle = setTimeout(resolve, 1000);
      });
      try {
        await Promise.race([this.llama.dispose(), timeoutPromise]);
      } finally {
        // Cancel the guard timer so it cannot hold the event loop open for
        // the rest of the second once dispose has already settled.
        clearTimeout(timeoutHandle);
      }
    }
  } finally {
    // Clear references
    this.embedContexts = [];
    this.rerankContexts = [];
    this.embedModel = null;
    this.generateModel = null;
    this.rerankModel = null;
    this.llama = null;
    // Clear any in-flight load/create promises
    this.embedModelLoadPromise = null;
    this.embedContextsCreatePromise = null;
    this.generateModelLoadPromise = null;
    this.rerankModelLoadPromise = null;
  }
}
- }
- // =============================================================================
- // Session Management Layer
- // =============================================================================
/**
 * Reference-counting bookkeeping for LLM sessions bound to one LlamaCpp instance.
 *
 * Tracks two independent counters, open sessions and in-flight operations,
 * and reports through `canUnload()` whether both are zero.
 */
class LLMSessionManager {
  private _activeSessionCount = 0;
  private _inFlightOperations = 0;

  constructor(private readonly llm: LlamaCpp) {}

  /** Number of sessions currently acquired and not yet released. */
  get activeSessionCount(): number {
    return this._activeSessionCount;
  }

  /** Number of operations started and not yet ended. */
  get inFlightOperations(): number {
    return this._inFlightOperations;
  }

  /**
   * True only while there are no open sessions and no running operations.
   */
  canUnload(): boolean {
    const busy = this._activeSessionCount !== 0 || this._inFlightOperations !== 0;
    return !busy;
  }

  /** Register one more open session. */
  acquire(): void {
    this._activeSessionCount += 1;
  }

  /** Unregister one session; the counter never drops below zero. */
  release(): void {
    if (this._activeSessionCount > 0) {
      this._activeSessionCount -= 1;
    }
  }

  /** Register the start of an operation. */
  operationStart(): void {
    this._inFlightOperations += 1;
  }

  /** Register the end of an operation; the counter never drops below zero. */
  operationEnd(): void {
    if (this._inFlightOperations > 0) {
      this._inFlightOperations -= 1;
    }
  }

  /** The LlamaCpp instance this manager was created for. */
  getLlamaCpp(): LlamaCpp {
    return this.llm;
  }
}
/**
 * Raised when a session is used after it has been released or aborted.
 * The `name` property is set to "SessionReleasedError".
 */
export class SessionReleasedError extends Error {
  override name = "SessionReleasedError";

  constructor(message = "LLM session has been released or aborted") {
    super(message);
  }
}
/**
 * Scoped LLM session with automatic lifecycle management.
 *
 * Construction acquires a lease on the manager; `release()` gives it back.
 * Every wrapped call is counted as an in-flight operation on the manager and
 * is refused up front once the session is released or aborted. Abort is only
 * checked before an operation starts; an operation already running is not
 * interrupted.
 */
class LLMSession implements ILLMSession {
  private manager: LLMSessionManager;
  private released = false;
  private abortController: AbortController;
  private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
  private name: string;
  // Removes the listener registered on the caller-supplied signal, if any.
  private detachExternalSignal: (() => void) | null = null;

  /**
   * @param manager - Manager whose counters track this session.
   * @param options - `name` (default "unnamed"), an optional external abort
   *   `signal`, and `maxDuration` in ms (default 10 minutes; a value <= 0
   *   disables the timer).
   */
  constructor(manager: LLMSessionManager, options: LLMSessionOptions = {}) {
    this.manager = manager;
    this.name = options.name || "unnamed";
    this.abortController = new AbortController();
    // Link external abort signal if provided
    const externalSignal = options.signal;
    if (externalSignal) {
      if (externalSignal.aborted) {
        this.abortController.abort(externalSignal.reason);
      } else {
        const onExternalAbort = (): void => {
          this.abortController.abort(externalSignal.reason);
        };
        externalSignal.addEventListener("abort", onExternalAbort, { once: true });
        // Remember how to unhook, so a long-lived external signal does not
        // accumulate one listener per session that ever used it.
        this.detachExternalSignal = () => {
          externalSignal.removeEventListener("abort", onExternalAbort);
        };
      }
    }
    // Set up max duration timer
    const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
    if (maxDuration > 0) {
      this.maxDurationTimer = setTimeout(() => {
        this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
      }, maxDuration);
      this.maxDurationTimer.unref(); // Don't keep process alive
    }
    // Acquire session lease
    this.manager.acquire();
  }

  /** True while the session is neither released nor aborted. */
  get isValid(): boolean {
    return !this.released && !this.abortController.signal.aborted;
  }

  /** Abort signal that fires on external abort, max-duration expiry or release. */
  get signal(): AbortSignal {
    return this.abortController.signal;
  }

  /**
   * Release the session and decrement ref count.
   * Idempotent. Clears the max-duration timer, detaches from the external
   * signal, aborts the session's own signal and releases the manager lease.
   */
  release(): void {
    if (this.released) return;
    this.released = true;
    if (this.maxDurationTimer) {
      clearTimeout(this.maxDurationTimer);
      this.maxDurationTimer = null;
    }
    if (this.detachExternalSignal) {
      this.detachExternalSignal();
      this.detachExternalSignal = null;
    }
    this.abortController.abort(new Error("Session released"));
    this.manager.release();
  }

  /**
   * Run `fn` as a tracked operation.
   *
   * @throws SessionReleasedError if the session was released, or if it was
   *   aborted (the message then carries the abort reason when one is available).
   */
  private async withOperation<T>(fn: () => Promise<T>): Promise<T> {
    if (this.released) {
      throw new SessionReleasedError();
    }
    const { signal } = this.abortController;
    if (signal.aborted) {
      // Surface why the session was aborted (e.g. max duration exceeded).
      const reason: unknown = signal.reason;
      const detail =
        reason instanceof Error ? reason.message : typeof reason === "string" ? reason : "";
      throw new SessionReleasedError(detail || "Session aborted");
    }
    this.manager.operationStart();
    try {
      return await fn();
    } finally {
      this.manager.operationEnd();
    }
  }

  /** Tracked wrapper around `LlamaCpp.embed`. */
  async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
    return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
  }

  /** Tracked wrapper around `LlamaCpp.embedBatch`. */
  async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
    return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
  }

  /** Tracked wrapper around `LlamaCpp.expandQuery`; all options are forwarded unchanged. */
  async expandQuery(
    query: string,
    options?: { context?: string; includeLexical?: boolean; intent?: string }
  ): Promise<Queryable[]> {
    return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
  }

  /** Tracked wrapper around `LlamaCpp.rerank`. */
  async rerank(
    query: string,
    documents: RerankDocument[],
    options?: RerankOptions
  ): Promise<RerankResult> {
    return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
  }
}
// Cached session manager paired with the default LlamaCpp instance
let defaultSessionManager: LLMSessionManager | null = null;
/**
 * Return the session manager bound to the current default LlamaCpp instance.
 * The cached manager is reused while it still wraps that same instance;
 * otherwise a new manager is created and cached.
 */
function getSessionManager(): LLMSessionManager {
  const currentLlm = getDefaultLlamaCpp();
  const cached = defaultSessionManager;
  if (cached && cached.getLlamaCpp() === currentLlm) {
    return cached;
  }
  const fresh = new LLMSessionManager(currentLlm);
  defaultSessionManager = fresh;
  return fresh;
}
/**
 * Run `fn` inside a scoped LLM session on the default LlamaCpp instance.
 * The session is released once `fn` settles, whether it resolves or throws.
 *
 * @param fn - Callback that receives the session.
 * @param options - Session options passed to the session constructor.
 * @returns Whatever `fn` resolves to.
 *
 * @example
 * ```typescript
 * await withLLMSession(async (session) => {
 *   const expanded = await session.expandQuery(query);
 *   const embeddings = await session.embedBatch(texts);
 *   const reranked = await session.rerank(query, docs);
 *   return reranked;
 * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
 * ```
 */
export async function withLLMSession<T>(
  fn: (session: ILLMSession) => Promise<T>,
  options?: LLMSessionOptions
): Promise<T> {
  const session = new LLMSession(getSessionManager(), options);
  try {
    const outcome = await fn(session);
    return outcome;
  } finally {
    session.release();
  }
}
/**
 * Run `fn` inside a scoped LLM session bound to the given LlamaCpp instance
 * instead of the global default.
 *
 * A new LLMSessionManager is created on every call, so this session is not
 * counted by the default session manager.
 *
 * @param llm - LlamaCpp instance the session operates on.
 * @param fn - Callback that receives the session.
 * @param options - Session options passed to the session constructor.
 * @returns Whatever `fn` resolves to; the session is released either way.
 */
export async function withLLMSessionForLlm<T>(
  llm: LlamaCpp,
  fn: (session: ILLMSession) => Promise<T>,
  options?: LLMSessionOptions
): Promise<T> {
  const session = new LLMSession(new LLMSessionManager(llm), options);
  try {
    const outcome = await fn(session);
    return outcome;
  } finally {
    session.release();
  }
}
/**
 * Report whether the default session manager has no active sessions and no
 * in-flight operations. Returns true when no default manager exists yet.
 */
export function canUnloadLLM(): boolean {
  return defaultSessionManager ? defaultSessionManager.canUnload() : true;
}
- // =============================================================================
- // Singleton for default LlamaCpp instance
- // =============================================================================
let defaultLlamaCpp: LlamaCpp | null = null;
/**
 * Return the shared default LlamaCpp instance, creating it on first use.
 * When the QMD_EMBED_MODEL environment variable is non-empty, its value is
 * passed to the constructor as `embedModel`.
 */
export function getDefaultLlamaCpp(): LlamaCpp {
  if (defaultLlamaCpp) {
    return defaultLlamaCpp;
  }
  const embedModel = process.env.QMD_EMBED_MODEL;
  const created = new LlamaCpp(embedModel ? { embedModel } : {});
  defaultLlamaCpp = created;
  return created;
}
/**
 * Replace the default LlamaCpp instance (useful for testing).
 * Passing `null` clears it, so the next `getDefaultLlamaCpp()` call creates a
 * new one. The instance being replaced is not disposed here.
 */
export function setDefaultLlamaCpp(llm: LlamaCpp | null): void {
  defaultLlamaCpp = llm;
}
/**
 * Dispose the default LlamaCpp instance if it exists.
 * Call this before process exit to prevent NAPI crashes.
 *
 * The singleton reference is cleared before disposal starts. A rejected
 * dispose therefore cannot leave a disposed instance installed as the
 * default, and a default installed while disposal is in progress is not
 * overwritten. Any error from `dispose()` still propagates.
 */
export async function disposeDefaultLlamaCpp(): Promise<void> {
  const llm = defaultLlamaCpp;
  if (!llm) {
    return;
  }
  defaultLlamaCpp = null;
  await llm.dispose();
}
|