llm.d.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  /**
   * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
   *
   * Provides embeddings, text generation, and reranking using local GGUF models.
   */
  6. import { type Token as LlamaToken } from "node-llama-cpp";
  /**
   * Detect whether a model URI uses the Qwen3-Embedding format.
   *
   * Qwen3-Embedding uses a different prompting style than
   * nomic/embeddinggemma.
   *
   * @param modelUri - Model URI to inspect.
   * @returns `true` when the URI is recognised as a Qwen3-Embedding model.
   */
  export declare function isQwen3EmbeddingModel(modelUri: string): boolean;
  /**
   * Format a query for embedding.
   *
   * Uses the nomic-style task prefix format for embeddinggemma (default), and
   * the Qwen3-Embedding instruct format when a Qwen embedding model is active.
   *
   * @param query - Raw query text.
   * @param modelUri - Optional model URI used to choose the prompt format.
   * @returns The formatted query string.
   */
  export declare function formatQueryForEmbedding(query: string, modelUri?: string): string;
  /**
   * Format a document for embedding.
   *
   * Uses the nomic-style format with title and text fields (default).
   * Qwen3-Embedding encodes documents as raw text without special prefixes.
   *
   * @param text - Document body.
   * @param title - Optional document title.
   * @param modelUri - Optional model URI used to choose the format.
   * @returns The formatted document string.
   */
  export declare function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string;
  /**
   * Token with log probability.
   */
  export type TokenLogProb = {
    /** Token text. */
    token: string;
    /** Log probability associated with the token. */
    logprob: number;
  };
  /**
   * Embedding result.
   */
  export type EmbeddingResult = {
    /** Embedding vector. */
    embedding: number[];
    /** Model identifier reported with the result (presumably the model URI or name; verify against the implementation). */
    model: string;
  };
  /**
   * Generation result with optional logprobs.
   */
  export type GenerateResult = {
    /** Generated text. */
    text: string;
    /** Model identifier reported with the result. */
    model: string;
    /** Per-token log probabilities, when available. */
    logprobs?: TokenLogProb[];
    /** Completion flag. NOTE(review): exact semantics are not visible in this declaration file. */
    done: boolean;
  };
  /**
   * Rerank result for a single document.
   */
  export type RerankDocumentResult = {
    /** File identifier of the document. */
    file: string;
    /** Relevance score. */
    score: number;
    /** Document index (presumably the position in the input list; verify against the implementation). */
    index: number;
  };
  /**
   * Batch rerank result.
   */
  export type RerankResult = {
    /** One entry per reranked document. */
    results: RerankDocumentResult[];
    /** Model identifier reported with the result. */
    model: string;
  };
  /**
   * Model info.
   */
  export type ModelInfo = {
    /** Model name. */
    name: string;
    /** Whether the model exists / is available. */
    exists: boolean;
    /** Path to the model, when known. */
    path?: string;
  };
  /**
   * Options for embedding.
   */
  export type EmbedOptions = {
    /** Model to use instead of the default. */
    model?: string;
    /** Marks the text as a query rather than a document (presumably selects query vs. document formatting; confirm). */
    isQuery?: boolean;
    /** Title accompanying the text. */
    title?: string;
  };
  /**
   * Options for text generation.
   */
  export type GenerateOptions = {
    /** Model to use instead of the default. */
    model?: string;
    /** Upper bound on the number of generated tokens. */
    maxTokens?: number;
    /** Sampling temperature. */
    temperature?: number;
  };
  /**
   * Options for reranking.
   */
  export type RerankOptions = {
    /** Model to use instead of the default. */
    model?: string;
  };
  /**
   * Options for LLM sessions.
   */
  export type LLMSessionOptions = {
    /** Max session duration in ms (default: 10 minutes) */
    maxDuration?: number;
    /** External abort signal */
    signal?: AbortSignal;
    /** Debug name for logging */
    name?: string;
  };
  /**
   * Session interface for scoped LLM access with lifecycle guarantees.
   */
  export interface ILLMSession {
    /** Embed a single text. Resolves to `null` when no embedding is produced. */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /** Embed several texts; the result array may contain `null` entries. */
    embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]>;
    /**
     * Expand a search query into a list of Queryable objects.
     *
     * NOTE(review): `LlamaCpp.expandQuery` additionally accepts an `intent`
     * option that is not exposed here. Confirm whether sessions should
     * forward it.
     */
    expandQuery(query: string, options?: {
      context?: string;
      includeLexical?: boolean;
    }): Promise<Queryable[]>;
    /** Rerank documents by relevance to a query. */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /** Whether this session is still valid (not released or aborted) */
    readonly isValid: boolean;
    /** Abort signal for this session (aborts on release or maxDuration) */
    readonly signal: AbortSignal;
  }
  /**
   * Supported query types for different search backends.
   */
  export type QueryType = 'lex' | 'vec' | 'hyde';
  /**
   * A single query and its target backend type.
   */
  export type Queryable = {
    /** Backend type this query targets. */
    type: QueryType;
    /** Query text. */
    text: string;
  };
  /**
   * Document to rerank.
   */
  export type RerankDocument = {
    /** File identifier of the document. */
    file: string;
    /** Document text to score against the query. */
    text: string;
    /** Optional document title. */
    title?: string;
  };
  /** `hf:` URI of the LFM2 1.2B GGUF model (Q4_K_M quantisation). */
  export declare const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
  /** `hf:` URI of the LFM2.5 1.2B Instruct GGUF model (Q4_K_M quantisation). */
  export declare const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
  /** Default embedding model URI (embeddinggemma 300M, Q8_0). */
  export declare const DEFAULT_EMBED_MODEL_URI = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
  /** Default rerank model URI (Qwen3-Reranker 0.6B, Q8_0). */
  export declare const DEFAULT_RERANK_MODEL_URI = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  /** Default generation model URI (query-expansion 1.7B, q4_k_m). */
  export declare const DEFAULT_GENERATE_MODEL_URI = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
  /** Default model cache directory; the value is computed by the implementation and is not visible here. */
  export declare const DEFAULT_MODEL_CACHE_DIR: string;
  /**
   * Result of pulling a single model.
   */
  export type PullResult = {
    /** Model that was requested. */
    model: string;
    /** Path of the model file. */
    path: string;
    /** Size of the model file in bytes. */
    sizeBytes: number;
    /** Refresh flag (presumably whether the model was re-downloaded; verify against the implementation). */
    refreshed: boolean;
  };
  /**
   * Pull the given models.
   *
   * @param models - Model identifiers to pull.
   * @param options - Optional settings: `refresh` and `cacheDir`
   *   (presumably force a re-download and override the cache directory; confirm).
   * @returns One PullResult per pulled model.
   */
  export declare function pullModels(models: string[], options?: {
    refresh?: boolean;
    cacheDir?: string;
  }): Promise<PullResult[]>;
  /**
   * Abstract LLM interface - implement this for different backends.
   */
  export interface LLM {
    /**
     * Get embeddings for text.
     */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /**
     * Generate text completion.
     */
    generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
    /**
     * Check if a model exists/is available.
     */
    modelExists(model: string): Promise<ModelInfo>;
    /**
     * Expand a search query into multiple variations for different backends.
     * Returns a list of Queryable objects.
     *
     * NOTE(review): `LlamaCpp.expandQuery` additionally accepts an `intent`
     * option that this interface does not declare. Confirm whether it
     * belongs in the shared contract.
     */
    expandQuery(query: string, options?: {
      context?: string;
      includeLexical?: boolean;
    }): Promise<Queryable[]>;
    /**
     * Rerank documents by relevance to a query.
     * Returns list of documents with relevance scores (higher = more relevant).
     */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /**
     * Dispose of resources.
     */
    dispose(): Promise<void>;
  }
  /**
   * Configuration accepted by the LlamaCpp constructor. All fields are optional.
   */
  export type LlamaCppConfig = {
    /** Embedding model to use. */
    embedModel?: string;
    /** Generation model to use. */
    generateModel?: string;
    /** Rerank model to use. */
    rerankModel?: string;
    /** Directory used as the model cache. */
    modelCacheDir?: string;
    /**
     * Context size used for query expansion generation contexts.
     * Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
     */
    expandContextSize?: number;
    /**
     * Inactivity timeout in ms before unloading contexts (default: 2 minutes, 0 to disable).
     *
     * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
     * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
     * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
     */
    inactivityTimeoutMs?: number;
    /**
     * Whether to dispose models on inactivity (default: false).
     *
     * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
     * memory reclaim.
     */
    disposeModelsOnInactivity?: boolean;
  };
  /**
   * LLM implementation declared against node-llama-cpp token types.
   *
   * Exposes embedding, generation, query expansion, reranking and tokenizer
   * helpers. Private members are listed without types, as emitted in
   * declaration files.
   */
  export declare class LlamaCpp implements LLM {
    private readonly _ciMode;
    private llama;
    private embedModel;
    private embedContexts;
    private generateModel;
    private rerankModel;
    private rerankContexts;
    private embedModelUri;
    private generateModelUri;
    private rerankModelUri;
    private modelCacheDir;
    private expandContextSize;
    private embedModelLoadPromise;
    private generateModelLoadPromise;
    private rerankModelLoadPromise;
    private inactivityTimer;
    private inactivityTimeoutMs;
    private disposeModelsOnInactivity;
    private disposed;
    /**
     * @param config - Optional configuration; see LlamaCppConfig.
     */
    constructor(config?: LlamaCppConfig);
    /** Name of the embedding model in use. */
    get embedModelName(): string;
    /**
     * Reset the inactivity timer. Called after each model operation.
     * When timer fires, models are unloaded to free memory (if no active sessions).
     *
     * NOTE(review): unloadIdleResources documents that only contexts are
     * disposed by default while models stay loaded; confirm which statement
     * is current.
     */
    private touchActivity;
    /**
     * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
     */
    private hasLoadedContexts;
    /**
     * Unload idle resources but keep the instance alive for future use.
     *
     * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
     * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
     */
    unloadIdleResources(): Promise<void>;
    /**
     * Ensure model cache directory exists.
     */
    private ensureModelCacheDir;
    /**
     * Initialize the llama instance (lazy).
     */
    private ensureLlama;
    /**
     * Resolve a model URI to a local path, downloading if needed.
     */
    private resolveModel;
    /**
     * Load embedding model (lazy).
     */
    private ensureEmbedModel;
    /**
     * Compute how many parallel contexts to create.
     *
     * GPU: constrained by VRAM (25% of free, capped at 8).
     * CPU: constrained by cores. Splitting threads across contexts enables
     * true parallelism (each context runs on its own cores). Use at most
     * half the math cores, with at least 4 threads per context.
     */
    private computeParallelism;
    /**
     * Get the number of threads each context should use, given N parallel contexts.
     * Splits available math cores evenly across contexts.
     */
    private threadsPerContext;
    /**
     * Promise guard used to prevent a concurrent context creation race
     * condition when loading embedding contexts.
     */
    private embedContextsCreatePromise;
    /**
     * Load embedding contexts (lazy). Creates multiple for parallel embedding.
     */
    private ensureEmbedContexts;
    /**
     * Get a single embed context (for single-embed calls). Uses first from pool.
     */
    private ensureEmbedContext;
    /**
     * Load generation model (lazy) - context is created fresh per call.
     */
    private ensureGenerateModel;
    /**
     * Load rerank model (lazy).
     */
    private ensureRerankModel;
    /** Context size for rerank contexts; see the tuning notes on ensureRerankContexts. */
    private static readonly RERANK_CONTEXT_SIZE;
    /** Context size for embedding contexts; the value is not visible in this declaration file. */
    private static readonly EMBED_CONTEXT_SIZE;
    /**
     * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
     * Each context has its own sequence, so they can evaluate independently.
     *
     * Tuning choices:
     * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
     * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
     * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
     */
    private ensureRerankContexts;
    /**
     * Tokenize text using the embedding model's tokenizer.
     * Returns tokenizer tokens (opaque type from node-llama-cpp).
     */
    tokenize(text: string): Promise<readonly LlamaToken[]>;
    /**
     * Count tokens in text using the embedding model's tokenizer.
     */
    countTokens(text: string): Promise<number>;
    /**
     * Detokenize token IDs back to text.
     */
    detokenize(tokens: readonly LlamaToken[]): Promise<string>;
    /**
     * Truncate text to fit within the embedding model's context window.
     * Uses the model's own tokenizer for accurate token counting, then
     * detokenizes back to text if truncation is needed.
     * Returns the (possibly truncated) text and whether truncation occurred.
     */
    private truncateToContextSize;
    /** Get embeddings for text. Resolves to `null` when no embedding is produced. */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /**
     * Batch embed multiple texts efficiently.
     * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally.
     */
    embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]>;
    /** Generate a text completion. Resolves to `null` when no result is produced. */
    generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
    /** Check whether the model identified by `modelUri` exists / is available. */
    modelExists(modelUri: string): Promise<ModelInfo>;
    /**
     * Expand a search query into a list of Queryable objects.
     *
     * Accepts an `intent` option in addition to the `context` and
     * `includeLexical` options declared on the LLM interface.
     */
    expandQuery(query: string, options?: {
      context?: string;
      includeLexical?: boolean;
      intent?: string;
    }): Promise<Queryable[]>;
    private static readonly RERANK_TEMPLATE_OVERHEAD;
    private static readonly RERANK_TARGET_DOCS_PER_CONTEXT;
    /** Rerank documents by relevance to a query. */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /**
     * Get device/GPU info for status display.
     * Initializes llama if not already done.
     */
    getDeviceInfo(): Promise<{
      gpu: string | false;
      gpuOffloading: boolean;
      gpuDevices: string[];
      vram?: {
        total: number;
        used: number;
        free: number;
      };
      cpuCores: number;
    }>;
    /** Dispose of resources held by this instance. */
    dispose(): Promise<void>;
  }
  /**
   * Error thrown when an operation is attempted on a released or aborted session.
   */
  export declare class SessionReleasedError extends Error {
    /**
     * @param message - Optional error message.
     */
    constructor(message?: string);
  }
  /**
   * Execute a function with a scoped LLM session.
   * The session provides lifecycle guarantees - resources won't be disposed mid-operation.
   *
   * @param fn - Callback that receives the session; its result is returned.
   * @param options - Optional session settings (max duration, abort signal, debug name).
   * @returns The value resolved by `fn`.
   *
   * @example
   * ```typescript
   * await withLLMSession(async (session) => {
   *   const expanded = await session.expandQuery(query);
   *   const embeddings = await session.embedBatch(texts);
   *   const reranked = await session.rerank(query, docs);
   *   return reranked;
   * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
   * ```
   */
  export declare function withLLMSession<T>(fn: (session: ILLMSession) => Promise<T>, options?: LLMSessionOptions): Promise<T>;
  /**
   * Execute a function with a scoped LLM session using a specific LlamaCpp instance.
   * Unlike withLLMSession, this does not use the global singleton.
   *
   * @param llm - LlamaCpp instance that backs the session.
   * @param fn - Callback that receives the session; its result is returned.
   * @param options - Optional session settings.
   * @returns The value resolved by `fn`.
   */
  export declare function withLLMSessionForLlm<T>(llm: LlamaCpp, fn: (session: ILLMSession) => Promise<T>, options?: LLMSessionOptions): Promise<T>;
  /**
   * Check if idle unload is safe (no active sessions or operations).
   * Used internally by LlamaCpp idle timer.
   *
   * @returns `true` when unloading is safe.
   */
  export declare function canUnloadLLM(): boolean;
  /**
   * Get the default LlamaCpp instance (creates one if needed).
   *
   * @returns The default LlamaCpp instance.
   */
  export declare function getDefaultLlamaCpp(): LlamaCpp;
  /**
   * Set a custom default LlamaCpp instance (useful for testing).
   *
   * @param llm - Instance to install as the default, or `null`
   *   (presumably clears the current default; verify against the implementation).
   */
  export declare function setDefaultLlamaCpp(llm: LlamaCpp | null): void;
  /**
   * Dispose the default LlamaCpp instance if it exists.
   * Call this before process exit to prevent NAPI crashes.
   *
   * @returns A promise that settles once disposal has finished.
   */
  export declare function disposeDefaultLlamaCpp(): Promise<void>;