llm.d.ts 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. /**
  2. * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
  3. *
  4. * Provides embeddings, text generation, and reranking using local GGUF models.
  5. */
  6. import { type Token as LlamaToken } from "node-llama-cpp";
  /**
   * Reports whether the `QMD_DISABLE_LOCAL_LLM=1` opt-out is active.
   *
   * While the opt-out is set, `LlamaCpp.ensureLlama()` throws the first time it
   * is invoked. It is meant for remote-only deployments, where any `getLlama()`
   * call signals an unintended local fallback — for example a cron host that
   * lacks libvulkan-dev/glslc (issue i-c28wngnd).
   *
   * @param env - Environment to inspect. Optional; the fallback used when it is
   *   omitted is not visible in this declaration (presumably `process.env`).
   * @returns `true` when the local LLM has been disabled.
   */
  export declare function isLocalLlmDisabled(env?: NodeJS.ProcessEnv): boolean;
  /**
   * Picks the GPU mode handed to `getLlama()`. Rules, in priority order:
   *
   * 1. An explicit `QMD_LLAMA_GPU=off|none|0|...` yields `"cpu"`.
   * 2. An explicit `QMD_LLAMA_GPU=auto` yields `"auto"`.
   * 3. With no explicit setting, a defined `QMD_EMBED_ENDPOINT` yields `"cpu"`:
   *    embedding goes to the remote provider and never touches the local LLM,
   *    while rerank/expand keep using the prebuilt CPU binary, so neither a
   *    Vulkan probe nor a cmake build is needed.
   * 4. Anything else (the legacy local-only setup) yields `"auto"`.
   *
   * @param env - Environment to inspect. Optional; the fallback used when it is
   *   omitted is not visible in this declaration (presumably `process.env`).
   * @returns The resolved mode, either `"cpu"` or `"auto"`.
   */
  export declare function resolveLlamaGpuMode(env?: NodeJS.ProcessEnv): "auto" | "cpu";
  /**
   * Tells whether a model URI refers to a model in the Qwen3-Embedding format.
   *
   * The distinction matters because Qwen3-Embedding is prompted differently
   * from nomic/embeddinggemma-style models.
   *
   * @param modelUri - URI of the model to classify.
   * @returns `true` if the URI is recognised as a Qwen3-Embedding model.
   */
  export declare function isQwen3EmbeddingModel(modelUri: string): boolean;
  /**
   * Wraps a search query in the prompt format the embedding model expects.
   *
   * - Default (embeddinggemma): nomic-style task prefix format.
   * - Qwen embedding model active: Qwen3-Embedding instruct format.
   *
   * @param query - Raw query text.
   * @param modelUri - Optional URI of the embedding model, used to choose the
   *   format.
   * @returns The query formatted for embedding.
   */
  export declare function formatQueryForEmbedding(query: string, modelUri?: string): string;
  /**
   * Wraps a document in the prompt format the embedding model expects.
   *
   * - Default: nomic-style format with title and text fields.
   * - Qwen3-Embedding: the document is encoded as raw text, with no special
   *   prefixes.
   *
   * @param text - Document body.
   * @param title - Optional document title.
   * @param modelUri - Optional URI of the embedding model, used to choose the
   *   format.
   * @returns The document formatted for embedding.
   */
  export declare function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string;
  /**
   * A single token paired with its log probability.
   */
  export type TokenLogProb = {
    /** Log probability reported for `token`. */
    logprob: number;
    /** The token text. */
    token: string;
  };
  /**
   * Result of embedding one piece of text.
   */
  export type EmbeddingResult = {
    /** Name of the model the result is attributed to. */
    model: string;
    /** The embedding vector. */
    embedding: Array<number>;
  };
  /**
   * Result of a text generation call; log probabilities are optional.
   */
  export type GenerateResult = {
    /** Name of the model the result is attributed to. */
    model: string;
    /** Generated text. */
    text: string;
    /** Completion flag (exact semantics live in the implementation). */
    done: boolean;
    /** Per-token log probabilities, when provided. */
    logprobs?: Array<TokenLogProb>;
  };
  /**
   * Rerank outcome for one document.
   */
  export type RerankDocumentResult = {
    /** File identifier of the document. */
    file: string;
    /** Index of the document (presumably its position in the rerank input — confirm). */
    index: number;
    /** Score assigned to the document. */
    score: number;
  };
  /**
   * Result of reranking a batch of documents.
   */
  export type RerankResult = {
    /** Name of the model the result is attributed to. */
    model: string;
    /** Per-document results. */
    results: Array<RerankDocumentResult>;
  };
  /**
   * Information about a model.
   */
  export type ModelInfo = {
    /** Whether the model exists / is available. */
    exists: boolean;
    /** Model name. */
    name: string;
    /** Path of the model, when known. */
    path?: string;
  };
  /**
   * Optional settings accepted by the embedding calls.
   */
  export type EmbedOptions = {
    /** Marks the text as a query (presumably selects query vs. document formatting — confirm). */
    isQuery?: boolean;
    /** Model to use. */
    model?: string;
    /** Title accompanying the text. */
    title?: string;
  };
  /**
   * Optional settings accepted by text generation.
   */
  export type GenerateOptions = {
    /** Sampling temperature. */
    temperature?: number;
    /** Upper bound on the number of generated tokens. */
    maxTokens?: number;
    /** Model to use. */
    model?: string;
  };
  /**
   * Optional settings accepted by reranking.
   */
  export type RerankOptions = {
    /** Model to use. */
    model?: string;
  };
  /**
   * Optional settings for an LLM session.
   */
  export type LLMSessionOptions = {
    /** Name shown in logs, for debugging. */
    name?: string;
    /** Abort signal supplied by the caller. */
    signal?: AbortSignal;
    /** Maximum session duration in milliseconds (default: 10 minutes). */
    maxDuration?: number;
  };
  /**
   * Scoped handle for LLM access that comes with lifecycle guarantees.
   */
  export interface ILLMSession {
    /** `true` while the session has been neither released nor aborted. */
    readonly isValid: boolean;
    /** Signal for this session; it aborts on release or when `maxDuration` elapses. */
    readonly signal: AbortSignal;
    /** Embeds a single text; may resolve to `null`. */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /** Embeds several texts; each entry of the resolved array may be `null`. */
    embedBatch(texts: string[], options?: EmbedOptions): Promise<Array<EmbeddingResult | null>>;
    /** Expands a query into a list of `Queryable` entries. */
    expandQuery(
      query: string,
      options?: {
        context?: string;
        includeLexical?: boolean;
      },
    ): Promise<Array<Queryable>>;
    /** Reranks `documents` against `query`. */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
  }
  /**
   * Query types supported across the different search backends.
   */
  export type QueryType = "lex" | "vec" | "hyde";
  /**
   * One query together with the backend type it targets.
   */
  export type Queryable = {
    /** Query text. */
    text: string;
    /** Backend type the query is meant for. */
    type: QueryType;
  };
  /**
   * A document submitted for reranking.
   */
  export type RerankDocument = {
    /** File identifier of the document. */
    file: string;
    /** Optional document title. */
    title?: string;
    /** Document text. */
    text: string;
  };
  // Model URIs. The `hf:` prefix presumably denotes Hugging Face-hosted GGUF
  // files — confirm against the resolver in llm.ts.

  /** LFM2 1.2B generation model URI. */
  export declare const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
  /** LFM2.5 1.2B instruct model URI. */
  export declare const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";

  /** Default embedding model URI (embeddinggemma 300M). */
  export declare const DEFAULT_EMBED_MODEL_URI = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
  /** Default rerank model URI (Qwen3-Reranker 0.6B). */
  export declare const DEFAULT_RERANK_MODEL_URI = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  /** Default generation model URI (qmd query-expansion 1.7B). */
  export declare const DEFAULT_GENERATE_MODEL_URI = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";

  /** Default model cache directory; its value is computed in the implementation. */
  export declare const DEFAULT_MODEL_CACHE_DIR: string;
  /**
   * Entry type of the array that `pullModels` resolves with.
   */
  export type PullResult = {
    /** Model the entry refers to. */
    model: string;
    /** Path associated with the model (presumably the local file — confirm). */
    path: string;
    /** Whether the model was refreshed (presumably re-downloaded — confirm). */
    refreshed: boolean;
    /** Size in bytes. */
    sizeBytes: number;
  };
  /**
   * Pulls the given models and resolves with a list of `PullResult` entries.
   *
   * @param models - Models to pull.
   * @param options - Optional settings.
   *   `refresh` presumably forces a re-fetch and `cacheDir` presumably
   *   overrides the cache directory — confirm both against the implementation.
   * @returns A promise for the pull results.
   */
  export declare function pullModels(
    models: string[],
    options?: {
      refresh?: boolean;
      cacheDir?: string;
    },
  ): Promise<Array<PullResult>>;
  /**
   * Backend-agnostic LLM contract; implement it to plug in a different backend.
   */
  export interface LLM {
    /**
     * Computes the embedding for a text.
     */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /**
     * Produces a text completion for a prompt.
     */
    generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
    /**
     * Reports whether a model exists / is available.
     */
    modelExists(model: string): Promise<ModelInfo>;
    /**
     * Turns one search query into several variations aimed at different
     * backends.
     *
     * @returns The variations as a list of `Queryable` objects.
     */
    expandQuery(
      query: string,
      options?: {
        context?: string;
        includeLexical?: boolean;
      },
    ): Promise<Array<Queryable>>;
    /**
     * Orders documents by how relevant they are to a query.
     *
     * @returns The documents with relevance scores (higher = more relevant).
     */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /**
     * Releases the resources held by the backend.
     */
    dispose(): Promise<void>;
  }
  /**
   * Constructor options for `LlamaCpp`. Every field is optional.
   */
  export type LlamaCppConfig = {
    /** Embedding model to use. */
    embedModel?: string;
    /** Generation model to use. */
    generateModel?: string;
    /** Rerank model to use. */
    rerankModel?: string;
    /** Directory used as the model cache. */
    modelCacheDir?: string;
    /**
     * Context size for the generation contexts used by query expansion.
     * Default: 2048. `QMD_EXPAND_CONTEXT_SIZE` can set it as well.
     */
    expandContextSize?: number;
    /**
     * Idle time in ms after which contexts are unloaded (default: 2 minutes;
     * 0 disables the timeout).
     *
     * Following node-llama-cpp lifecycle guidance, models are preferably kept
     * loaded and only contexts are disposed when idle, because contexts (and
     * their sequences) are the heavy per-session objects.
     * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
     */
    inactivityTimeoutMs?: number;
    /**
     * Also dispose models on inactivity (default: false).
     *
     * Leaving models loaded avoids repeated VRAM thrash; enable this only when
     * aggressive memory reclaim is required.
     */
    disposeModelsOnInactivity?: boolean;
  };
  /**
   * `LLM` implementation declared for the llama.cpp backend.
   *
   * The llama instance, the models and the context pools are all created
   * lazily by the private `ensure*` members and may be unloaded again after a
   * period of inactivity.
   */
  export declare class LlamaCpp implements LLM {
    // Tuning constants; their values live in the implementation.
    private static readonly RERANK_CONTEXT_SIZE;
    private static readonly EMBED_CONTEXT_SIZE;
    private static readonly RERANK_TEMPLATE_OVERHEAD;
    private static readonly RERANK_TARGET_DOCS_PER_CONTEXT;

    private readonly _ciMode;

    // Llama instance, models and context pools.
    private llama;
    private embedModel;
    private embedContexts;
    private generateModel;
    private rerankModel;
    private rerankContexts;

    // Configuration.
    private embedModelUri;
    private generateModelUri;
    private rerankModelUri;
    private modelCacheDir;
    private expandContextSize;

    // In-flight load/creation promises.
    private embedModelLoadPromise;
    private generateModelLoadPromise;
    private rerankModelLoadPromise;
    private embedContextsCreatePromise;

    // Inactivity handling and disposal state.
    private inactivityTimer;
    private inactivityTimeoutMs;
    private disposeModelsOnInactivity;
    private disposed;

    constructor(config?: LlamaCppConfig);

    get embedModelName(): string;

    /**
     * Restarts the inactivity timer; invoked after every model operation.
     * Once the timer fires, idle resources are unloaded to free memory,
     * provided there are no active sessions.
     */
    private touchActivity;
    /**
     * Tells whether any contexts are currently loaded, i.e. whether there is
     * anything worth unloading on inactivity.
     */
    private hasLoadedContexts;
    /**
     * Frees idle resources while leaving this instance usable afterwards.
     *
     * Unless configured otherwise, contexts (and the sequences that depend on
     * them) are disposed and models stay loaded. That follows the intended
     * lifecycle model → context → sequence, in which contexts are per-session.
     */
    unloadIdleResources(): Promise<void>;
    /**
     * Makes sure the model cache directory exists.
     */
    private ensureModelCacheDir;
    /**
     * Lazily initializes the llama instance.
     *
     * Environment controls (i-c28wngnd):
     * - `QMD_DISABLE_LOCAL_LLM=1`: hard-disable. The first `ensureLlama()` call
     *   throws. For deployments that must NEVER load node-llama-cpp, e.g. a
     *   headless cron on a host without libvulkan-dev/glslc.
     * - `QMD_LLAMA_GPU=off|none|...`: force CPU-only and skip the Vulkan probe.
     * - `QMD_LLAMA_GPU=auto`: explicitly opt in to the GPU probe even when
     *   `QMD_EMBED_ENDPOINT` is set (rare; useful for hybrid local-rerank +
     *   remote-embed).
     *
     * Auto-detection: when `QMD_EMBED_ENDPOINT` is set (HTTP embed provider,
     * e.g. cron on `code` → ai.mm.mk → models:8082) the default is CPU-only.
     * The embed path then runs over HTTP, and the only remaining local LLM
     * consumers (rerank/query-expansion) work fine on the prebuilt CPU binary
     * without ever invoking cmake-js-llama. This silences ~30s/run of Vulkan
     * probe + cmake noise on headless LXCs.
     */
    private ensureLlama;
    /**
     * Maps a model URI to a local path, downloading the model when necessary.
     */
    private resolveModel;
    /**
     * Lazily loads the embedding model.
     */
    private ensureEmbedModel;
    /**
     * Works out how many parallel contexts to create.
     *
     * GPU: limited by VRAM (25% of free, capped at 8).
     * CPU: limited by cores. Dividing threads between contexts gives true
     * parallelism because each context runs on its own cores. At most half of
     * the math cores are used, with no fewer than 4 threads per context.
     */
    private computeParallelism;
    /**
     * Thread count for each context when N contexts run in parallel: the
     * available math cores are divided evenly between them.
     */
    private threadsPerContext;
    /**
     * Lazily loads the embedding contexts, creating several so embedding can
     * run in parallel. A promise guard prevents a race when context creation
     * is requested concurrently.
     */
    private ensureEmbedContexts;
    /**
     * Returns one embed context for single-embed calls: the first in the pool.
     */
    private ensureEmbedContext;
    /**
     * Lazily loads the generation model. Its context is created fresh for
     * every call.
     */
    private ensureGenerateModel;
    /**
     * Lazily loads the rerank model.
     */
    private ensureRerankModel;
    /**
     * Lazily loads the rerank contexts, creating several so ranking can run in
     * parallel. Every context owns its sequence, so they evaluate
     * independently.
     *
     * Tuning choices:
     * - contextSize 1024: reranking chunks are ~800 tokens max, so 1024 is
     *   plenty.
     * - flashAttention: ~20% less VRAM per context (568 vs 711 MB).
     * - Combined: from 11.6 GB (auto, no flash) down to 568 MB per context
     *   (20×).
     */
    private ensureRerankContexts;
    /**
     * Tokenizes text with the embedding model's tokenizer.
     *
     * @returns Tokenizer tokens (an opaque type from node-llama-cpp).
     */
    tokenize(text: string): Promise<ReadonlyArray<LlamaToken>>;
    /**
     * Counts the tokens in a text with the embedding model's tokenizer.
     */
    countTokens(text: string): Promise<number>;
    /**
     * Converts tokens back into text.
     */
    detokenize(tokens: ReadonlyArray<LlamaToken>): Promise<string>;
    /**
     * Shortens text so it fits the embedding model's context window.
     *
     * Token counting uses the model's own tokenizer for accuracy; when
     * truncation is needed the kept tokens are detokenized back to text.
     * Yields the (possibly truncated) text plus whether truncation happened.
     */
    private truncateToContextSize;
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /**
     * Embeds several texts efficiently in one call.
     *
     * Per the original note, embedding runs in parallel via `Promise.all` and
     * node-llama-cpp handles batching internally.
     */
    embedBatch(texts: string[], options?: EmbedOptions): Promise<Array<EmbeddingResult | null>>;
    generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
    modelExists(modelUri: string): Promise<ModelInfo>;
    expandQuery(
      query: string,
      options?: {
        context?: string;
        includeLexical?: boolean;
        intent?: string;
      },
    ): Promise<Array<Queryable>>;
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /**
     * Returns device/GPU information for status display, initializing llama
     * first if that has not happened yet.
     */
    getDeviceInfo(): Promise<{
      gpu: string | false;
      gpuOffloading: boolean;
      gpuDevices: Array<string>;
      vram?: {
        total: number;
        used: number;
        free: number;
      };
      cpuCores: number;
    }>;
    dispose(): Promise<void>;
  }
  /**
   * Thrown when an operation is attempted on a session that has already been
   * released or aborted.
   */
  export declare class SessionReleasedError extends Error {
    /**
     * @param message - Optional error message.
     */
    constructor(message?: string);
  }
  /**
   * Runs `fn` inside a scoped LLM session.
   *
   * The session carries lifecycle guarantees: resources are not disposed while
   * an operation is still in progress.
   *
   * @param fn - Callback that receives the session.
   * @param options - Optional session settings.
   * @returns A promise of type `T`, the type `fn` resolves with.
   *
   * @example
   * ```typescript
   * await withLLMSession(async (session) => {
   *   const expanded = await session.expandQuery(query);
   *   const embeddings = await session.embedBatch(texts);
   *   const reranked = await session.rerank(query, docs);
   *   return reranked;
   * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
   * ```
   */
  export declare function withLLMSession<T>(
    fn: (session: ILLMSession) => Promise<T>,
    options?: LLMSessionOptions,
  ): Promise<T>;
  /**
   * Runs `fn` inside a scoped LLM session bound to the supplied `LlamaCpp`
   * instance. In contrast to `withLLMSession`, the global singleton is not
   * used.
   *
   * @param llm - Instance the session operates on.
   * @param fn - Callback that receives the session.
   * @param options - Optional session settings.
   * @returns A promise of type `T`, the type `fn` resolves with.
   */
  export declare function withLLMSessionForLlm<T>(
    llm: LlamaCpp,
    fn: (session: ILLMSession) => Promise<T>,
    options?: LLMSessionOptions,
  ): Promise<T>;
  /**
   * Reports whether an idle unload is currently safe, meaning there are no
   * active sessions or operations. The `LlamaCpp` idle timer uses it
   * internally.
   *
   * @returns `true` when unloading is safe.
   */
  export declare function canUnloadLLM(): boolean;
  /**
   * Returns the default `LlamaCpp` instance, creating it when none exists yet.
   *
   * @returns The default instance.
   */
  export declare function getDefaultLlamaCpp(): LlamaCpp;
  /**
   * Installs a custom default `LlamaCpp` instance; handy in tests.
   *
   * @param llm - Instance to install. `null` is accepted as well (presumably
   *   it clears the current default — confirm against the implementation).
   */
  export declare function setDefaultLlamaCpp(llm: LlamaCpp | null): void;
  /**
   * Disposes the default `LlamaCpp` instance, if there is one.
   *
   * Invoke it before the process exits so that NAPI crashes are avoided.
   *
   * @returns A promise for the completion of the disposal.
   */
  export declare function disposeDefaultLlamaCpp(): Promise<void>;