llm.js 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199
  1. /**
  2. * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
  3. *
  4. * Provides embeddings, text generation, and reranking using local GGUF models.
  5. */
  6. import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
  7. import { homedir } from "os";
  8. import { join } from "path";
  9. import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
  10. // =============================================================================
  11. // Embedding Formatting Functions
  12. // =============================================================================
  /**
   * Report whether a model URI refers to a Qwen embedding model.
   *
   * The check is case-insensitive and succeeds when "qwen" is followed anywhere
   * later by "embed", or "embed" is followed anywhere later by "qwen". Such
   * models are prompted differently from the nomic/embeddinggemma style.
   *
   * @param {string} modelUri - Model URI or path to inspect.
   * @returns {boolean} True when the URI looks like a Qwen embedding model.
   */
  export function isQwen3EmbeddingModel(modelUri) {
    const qwenEmbedPatterns = [/qwen.*embed/i, /embed.*qwen/i];
    return qwenEmbedPatterns.some((pattern) => pattern.test(modelUri));
  }
  /**
   * Build the text that gets embedded for a search query.
   *
   * The active model is `modelUri`, else the QMD_EMBED_MODEL environment
   * variable, else the built-in default. Qwen embedding models receive the
   * "Instruct: ... / Query: ..." layout; every other model receives the
   * "task: search result | query: ..." prefix.
   *
   * @param {string} query - The user's search query.
   * @param {string} [modelUri] - Embedding model URI overriding env/default.
   * @returns {string} The formatted query text.
   */
  export function formatQueryForEmbedding(query, modelUri) {
    const activeModel = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    return isQwen3EmbeddingModel(activeModel)
      ? `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`
      : `task: search result | query: ${query}`;
  }
  /**
   * Build the text that gets embedded for a document chunk.
   *
   * The active model is `modelUri`, else the QMD_EMBED_MODEL environment
   * variable, else the built-in default. Qwen embedding models get raw text
   * (title on its own first line when present); every other model gets the
   * "title: ... | text: ..." layout, with "none" standing in for a missing title.
   *
   * @param {string} text - Document text.
   * @param {string} [title] - Optional document title.
   * @param {string} [modelUri] - Embedding model URI overriding env/default.
   * @returns {string} The formatted document text.
   */
  export function formatDocForEmbedding(text, title, modelUri) {
    const activeModel = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    if (!isQwen3EmbeddingModel(activeModel)) {
      return `title: ${title || "none"} | text: ${text}`;
    }
    // Qwen embedding models: no task prefix, just the optional title and the text.
    if (title) {
      return `${title}\n${text}`;
    }
    return text;
  }
  45. // =============================================================================
  46. // Model Configuration
  47. // =============================================================================
  // Models are addressed with node-llama-cpp HuggingFace URIs: hf:<user>/<repo>/<file>
  // The embedding model can be overridden with the QMD_EMBED_MODEL env var
  // (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf).
  const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
  const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  // Generation model used for query expansion.
  // Earlier candidate, kept for reference: hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf
  const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
  // Public aliases of the defaults above.
  export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
  export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
  export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
  // Alternative generation models for query expansion:
  // LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference.
  // Use these as a base for fine-tuning with configs/sft_lfm2.yaml.
  export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
  export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
  // Local model cache directory: <XDG_CACHE_HOME>/qmd/models when XDG_CACHE_HOME is set
  // to a non-empty value, otherwise ~/.cache/qmd/models.
  const MODEL_CACHE_DIR = join(process.env.XDG_CACHE_HOME || join(homedir(), ".cache"), "qmd", "models");
  export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
  /**
   * Split a HuggingFace model URI of the form "hf:<user>/<repo>/<file>".
   *
   * @param {string} model - Model URI to parse.
   * @returns {{ repo: string, file: string } | null} "<user>/<repo>" plus the
   *   file path (which may itself contain slashes), or null when the "hf:"
   *   prefix is missing or fewer than three slash-separated segments follow it.
   */
  function parseHfUri(model) {
    const HF_PREFIX = "hf:";
    if (!model.startsWith(HF_PREFIX)) {
      return null;
    }
    const [user, repoName, ...fileSegments] = model.slice(HF_PREFIX.length).split("/");
    if (fileSegments.length === 0) {
      return null;
    }
    return { repo: `${user}/${repoName}`, file: fileSegments.join("/") };
  }
  /**
   * Fetch the ETag of a model file on huggingface.co with a HEAD request.
   *
   * Best effort: any network error, a non-OK response, or a missing/empty
   * "etag" header all yield null rather than an exception.
   *
   * @param {{ repo: string, file: string }} ref - Parsed HuggingFace reference.
   * @returns {Promise<string | null>} The ETag header value, or null.
   */
  async function getRemoteEtag(ref) {
    const fileUrl = `https://huggingface.co/${ref.repo}/resolve/main/${ref.file}`;
    try {
      const response = await fetch(fileUrl, { method: "HEAD" });
      return response.ok ? (response.headers.get("etag") || null) : null;
    }
    catch {
      return null;
    }
  }
  /**
   * Download (or refresh) models into the local cache directory.
   *
   * For HuggingFace URIs ("hf:<user>/<repo>/<file>") the remote ETag is compared
   * with a "<file>.etag" sidecar stored in the cache directory. Cached copies are
   * deleted and downloaded again when:
   *   - `options.refresh` is set,
   *   - the remote ETag is known and differs from the stored one, or
   *   - no cached model file exists.
   * When the remote ETag cannot be determined (offline, HTTP error) an existing
   * cached copy is kept instead of being deleted, unless a refresh was requested.
   * For other URIs cached copies are only deleted when `options.refresh` is set.
   *
   * @param {string[]} models - Model URIs or paths to resolve.
   * @param {{ refresh?: boolean, cacheDir?: string }} [options] - `refresh`
   *   forces a re-download; `cacheDir` overrides the default cache directory.
   * @returns {Promise<Array<{ model: string, path: string, sizeBytes: number, refreshed: boolean }>>}
   *   One entry per model; `refreshed` is true only when an existing cached model
   *   file was removed before resolving.
   */
  export async function pullModels(models, options = {}) {
    const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
    if (!existsSync(cacheDir)) {
      mkdirSync(cacheDir, { recursive: true });
    }
    const removeIfPresent = (filePath) => {
      if (existsSync(filePath)) {
        unlinkSync(filePath);
      }
    };
    const results = [];
    for (const model of models) {
      let refreshed = false;
      let remoteEtag = null;
      const hfRef = parseHfUri(model);
      const filename = model.split("/").pop();
      const etagPath = filename ? join(cacheDir, `${filename}.etag`) : null;
      // Cached files are matched by substring, so the "<file>.etag" sidecar has to be
      // excluded explicitly: it is bookkeeping, not a cached model.
      const cached = filename
        ? readdirSync(cacheDir, { withFileTypes: true })
          .filter((entry) => entry.isFile() && entry.name.includes(filename) && !entry.name.endsWith(".etag"))
          .map((entry) => join(cacheDir, entry.name))
        : [];
      if (hfRef && filename) {
        remoteEtag = await getRemoteEtag(hfRef);
        const localEtag = existsSync(etagPath)
          ? readFileSync(etagPath, "utf-8").trim()
          : null;
        // An unknown remote ETag is not evidence of a change: keep what we have.
        const remoteChanged = remoteEtag !== null && remoteEtag !== localEtag;
        const shouldRefresh = options.refresh || remoteChanged || cached.length === 0;
        if (shouldRefresh) {
          for (const candidate of cached) {
            removeIfPresent(candidate);
          }
          removeIfPresent(etagPath);
          refreshed = cached.length > 0;
        }
      }
      else if (options.refresh && filename) {
        for (const candidate of cached) {
          removeIfPresent(candidate);
        }
        refreshed = cached.length > 0;
      }
      const path = await resolveModelFile(model, cacheDir);
      const sizeBytes = existsSync(path) ? statSync(path).size : 0;
      if (hfRef && filename) {
        // Reuse the ETag observed before the download; only ask again if that
        // first lookup failed.
        const etagToStore = remoteEtag ?? (await getRemoteEtag(hfRef));
        if (etagToStore) {
          writeFileSync(etagPath, `${etagToStore}\n`, "utf-8");
        }
      }
      results.push({ model, path, sizeBytes, refreshed });
    }
    return results;
  }
  145. /**
  146. * LLM implementation using node-llama-cpp
  147. */
  // Idle time before loaded resources are released: 5 minutes, long enough to keep
  // models warm during a typical search session.
  const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60_000;
  // Fallback expand context size used by resolveExpandContextSize() when neither the
  // config value nor QMD_EXPAND_CONTEXT_SIZE supplies one.
  const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
  /**
   * Decide which expand context size to use.
   *
   * Precedence: explicit config value, then the QMD_EXPAND_CONTEXT_SIZE
   * environment variable, then DEFAULT_EXPAND_CONTEXT_SIZE.
   *
   * @param {number} [configValue] - Value from the constructor config.
   * @returns {number} A positive integer context size.
   * @throws {Error} When `configValue` is given but is not a positive integer.
   *   An invalid environment value never throws: it is reported on stderr and
   *   the default is used.
   */
  function resolveExpandContextSize(configValue) {
    if (configValue !== undefined) {
      if (!Number.isInteger(configValue) || configValue <= 0) {
        throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
      }
      return configValue;
    }
    const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
    if (!envValue) {
      return DEFAULT_EXPAND_CONTEXT_SIZE;
    }
    // The whole value must be decimal digits. Number.parseInt on its own accepts a
    // numeric prefix and would silently read "4k" as 4 or "12.5" as 12.
    const parsed = /^\d+$/.test(envValue) ? Number.parseInt(envValue, 10) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed <= 0) {
      process.stderr.write(`QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`);
      return DEFAULT_EXPAND_CONTEXT_SIZE;
    }
    return parsed;
  }
  168. export class LlamaCpp {
  // True when the CI environment variable is set to a non-empty value; embedBatch(),
  // generate() and expandQuery() refuse to run in that case.
  _ciMode = Boolean(process.env.CI);
  // Shared llama runtime, created on first use by ensureLlama().
  llama = null;
  // Lazily loaded models and their context pools.
  embedModel = null;
  embedContexts = [];
  generateModel = null;
  rerankModel = null;
  rerankContexts = [];
  // Configuration resolved in the constructor.
  embedModelUri;
  generateModelUri;
  rerankModelUri;
  modelCacheDir;
  expandContextSize;
  // In-flight model loads: concurrent callers share one promise so the same model is
  // not loaded twice (which can allocate duplicate VRAM).
  embedModelLoadPromise = null;
  generateModelLoadPromise = null;
  rerankModelLoadPromise = null;
  // Timer that unloads idle resources, and the settings that control it.
  inactivityTimer = null;
  inactivityTimeoutMs;
  disposeModelsOnInactivity;
  // Disposal flag; unloadIdleResources() becomes a no-op once it is set.
  disposed = false;
  191. constructor(config = {}) {
  192. this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
  193. this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
  194. this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
  195. this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
  196. this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
  197. this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
  198. this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
  199. }
  200. get embedModelName() {
  201. return this.embedModelUri;
  202. }
  203. /**
  204. * Reset the inactivity timer. Called after each model operation.
  205. * When timer fires, models are unloaded to free memory (if no active sessions).
  206. */
  207. touchActivity() {
  208. // Clear existing timer
  209. if (this.inactivityTimer) {
  210. clearTimeout(this.inactivityTimer);
  211. this.inactivityTimer = null;
  212. }
  213. // Only set timer if we have disposable contexts and timeout is enabled
  214. if (this.inactivityTimeoutMs > 0 && this.hasLoadedContexts()) {
  215. this.inactivityTimer = setTimeout(() => {
  216. // Check if session manager allows unloading
  217. // canUnloadLLM is defined later in this file - it checks the session manager
  218. // We use dynamic import pattern to avoid circular dependency issues
  219. if (typeof canUnloadLLM === 'function' && !canUnloadLLM()) {
  220. // Active sessions/operations - reschedule timer
  221. this.touchActivity();
  222. return;
  223. }
  224. this.unloadIdleResources().catch(err => {
  225. console.error("Error unloading idle resources:", err);
  226. });
  227. }, this.inactivityTimeoutMs);
  228. // Don't keep process alive just for this timer
  229. this.inactivityTimer.unref();
  230. }
  231. }
  232. /**
  233. * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
  234. */
  235. hasLoadedContexts() {
  236. return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
  237. }
  238. /**
  239. * Unload idle resources but keep the instance alive for future use.
  240. *
  241. * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
  242. * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
  243. */
  244. async unloadIdleResources() {
  245. // Don't unload if already disposed
  246. if (this.disposed) {
  247. return;
  248. }
  249. // Clear timer
  250. if (this.inactivityTimer) {
  251. clearTimeout(this.inactivityTimer);
  252. this.inactivityTimer = null;
  253. }
  254. // Dispose contexts first
  255. for (const ctx of this.embedContexts) {
  256. await ctx.dispose();
  257. }
  258. this.embedContexts = [];
  259. for (const ctx of this.rerankContexts) {
  260. await ctx.dispose();
  261. }
  262. this.rerankContexts = [];
  263. // Optionally dispose models too (opt-in)
  264. if (this.disposeModelsOnInactivity) {
  265. if (this.embedModel) {
  266. await this.embedModel.dispose();
  267. this.embedModel = null;
  268. }
  269. if (this.generateModel) {
  270. await this.generateModel.dispose();
  271. this.generateModel = null;
  272. }
  273. if (this.rerankModel) {
  274. await this.rerankModel.dispose();
  275. this.rerankModel = null;
  276. }
  277. // Reset load promises so models can be reloaded later
  278. this.embedModelLoadPromise = null;
  279. this.generateModelLoadPromise = null;
  280. this.rerankModelLoadPromise = null;
  281. }
  282. // Note: We keep llama instance alive - it's lightweight
  283. }
  284. /**
  285. * Ensure model cache directory exists
  286. */
  287. ensureModelCacheDir() {
  288. if (!existsSync(this.modelCacheDir)) {
  289. mkdirSync(this.modelCacheDir, { recursive: true });
  290. }
  291. }
  292. /**
  293. * Initialize the llama instance (lazy)
  294. */
  295. async ensureLlama() {
  296. if (!this.llama) {
  297. // Allow override via QMD_LLAMA_GPU: "false" | "off" | "none" forces CPU
  298. const gpuOverride = (process.env.QMD_LLAMA_GPU ?? "").toLowerCase();
  299. const forceCpu = ["false", "off", "none", "disable", "disabled", "0"].includes(gpuOverride);
  300. const loadLlama = async (gpu) => await getLlama({
  301. build: "autoAttempt",
  302. logLevel: LlamaLogLevel.error,
  303. gpu,
  304. });
  305. let llama;
  306. if (forceCpu) {
  307. llama = await loadLlama(false);
  308. }
  309. else {
  310. try {
  311. llama = await loadLlama("auto");
  312. }
  313. catch (err) {
  314. // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
  315. // Fall back to CPU so qmd still works.
  316. process.stderr.write(`QMD Warning: GPU init failed (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
  317. llama = await loadLlama(false);
  318. }
  319. }
  320. if (llama.gpu === false) {
  321. process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
  322. }
  323. this.llama = llama;
  324. }
  325. return this.llama;
  326. }
  327. /**
  328. * Resolve a model URI to a local path, downloading if needed
  329. */
  330. async resolveModel(modelUri) {
  331. this.ensureModelCacheDir();
  332. // resolveModelFile handles HF URIs and downloads to the cache dir
  333. return await resolveModelFile(modelUri, this.modelCacheDir);
  334. }
  335. /**
  336. * Load embedding model (lazy)
  337. */
  338. async ensureEmbedModel() {
  339. if (this.embedModel) {
  340. return this.embedModel;
  341. }
  342. if (this.embedModelLoadPromise) {
  343. return await this.embedModelLoadPromise;
  344. }
  345. this.embedModelLoadPromise = (async () => {
  346. const llama = await this.ensureLlama();
  347. const modelPath = await this.resolveModel(this.embedModelUri);
  348. const model = await llama.loadModel({ modelPath });
  349. this.embedModel = model;
  350. // Model loading counts as activity - ping to keep alive
  351. this.touchActivity();
  352. return model;
  353. })();
  354. try {
  355. return await this.embedModelLoadPromise;
  356. }
  357. finally {
  358. // Keep the resolved model cached; clear only the in-flight promise.
  359. this.embedModelLoadPromise = null;
  360. }
  361. }
  362. /**
  363. * Compute how many parallel contexts to create.
  364. *
  365. * GPU: constrained by VRAM (25% of free, capped at 8).
  366. * CPU: constrained by cores. Splitting threads across contexts enables
  367. * true parallelism (each context runs on its own cores). Use at most
  368. * half the math cores, with at least 4 threads per context.
  369. */
  370. async computeParallelism(perContextMB) {
  371. const llama = await this.ensureLlama();
  372. if (llama.gpu) {
  373. try {
  374. const vram = await llama.getVramState();
  375. const freeMB = vram.free / (1024 * 1024);
  376. const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
  377. return Math.max(1, Math.min(8, maxByVram));
  378. }
  379. catch {
  380. return 2;
  381. }
  382. }
  383. // CPU: split cores across contexts. At least 4 threads per context.
  384. const cores = llama.cpuMathCores || 4;
  385. const maxContexts = Math.floor(cores / 4);
  386. return Math.max(1, Math.min(4, maxContexts));
  387. }
  388. /**
  389. * Get the number of threads each context should use, given N parallel contexts.
  390. * Splits available math cores evenly across contexts.
  391. */
  392. async threadsPerContext(parallelism) {
  393. const llama = await this.ensureLlama();
  394. if (llama.gpu)
  395. return 0; // GPU: let the library decide
  396. const cores = llama.cpuMathCores || 4;
  397. return Math.max(1, Math.floor(cores / parallelism));
  398. }
  399. /**
  400. * Load embedding contexts (lazy). Creates multiple for parallel embedding.
  401. * Uses promise guard to prevent concurrent context creation race condition.
  402. */
  403. embedContextsCreatePromise = null;
  404. async ensureEmbedContexts() {
  405. if (this.embedContexts.length > 0) {
  406. this.touchActivity();
  407. return this.embedContexts;
  408. }
  409. if (this.embedContextsCreatePromise) {
  410. return await this.embedContextsCreatePromise;
  411. }
  412. this.embedContextsCreatePromise = (async () => {
  413. const model = await this.ensureEmbedModel();
  414. // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
  415. const n = await this.computeParallelism(150);
  416. const threads = await this.threadsPerContext(n);
  417. for (let i = 0; i < n; i++) {
  418. try {
  419. this.embedContexts.push(await model.createEmbeddingContext({
  420. contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
  421. ...(threads > 0 ? { threads } : {}),
  422. }));
  423. }
  424. catch {
  425. if (this.embedContexts.length === 0)
  426. throw new Error("Failed to create any embedding context");
  427. break;
  428. }
  429. }
  430. this.touchActivity();
  431. return this.embedContexts;
  432. })();
  433. try {
  434. return await this.embedContextsCreatePromise;
  435. }
  436. finally {
  437. this.embedContextsCreatePromise = null;
  438. }
  439. }
  440. /**
  441. * Get a single embed context (for single-embed calls). Uses first from pool.
  442. */
  443. async ensureEmbedContext() {
  444. const contexts = await this.ensureEmbedContexts();
  445. return contexts[0];
  446. }
  447. /**
  448. * Load generation model (lazy) - context is created fresh per call
  449. */
  450. async ensureGenerateModel() {
  451. if (!this.generateModel) {
  452. if (this.generateModelLoadPromise) {
  453. return await this.generateModelLoadPromise;
  454. }
  455. this.generateModelLoadPromise = (async () => {
  456. const llama = await this.ensureLlama();
  457. const modelPath = await this.resolveModel(this.generateModelUri);
  458. const model = await llama.loadModel({ modelPath });
  459. this.generateModel = model;
  460. return model;
  461. })();
  462. try {
  463. await this.generateModelLoadPromise;
  464. }
  465. finally {
  466. this.generateModelLoadPromise = null;
  467. }
  468. }
  469. this.touchActivity();
  470. if (!this.generateModel) {
  471. throw new Error("Generate model not loaded");
  472. }
  473. return this.generateModel;
  474. }
  475. /**
  476. * Load rerank model (lazy)
  477. */
  478. async ensureRerankModel() {
  479. if (this.rerankModel) {
  480. return this.rerankModel;
  481. }
  482. if (this.rerankModelLoadPromise) {
  483. return await this.rerankModelLoadPromise;
  484. }
  485. this.rerankModelLoadPromise = (async () => {
  486. const llama = await this.ensureLlama();
  487. const modelPath = await this.resolveModel(this.rerankModelUri);
  488. const model = await llama.loadModel({ modelPath });
  489. this.rerankModel = model;
  490. // Model loading counts as activity - ping to keep alive
  491. this.touchActivity();
  492. return model;
  493. })();
  494. try {
  495. return await this.rerankModelLoadPromise;
  496. }
  497. finally {
  498. this.rerankModelLoadPromise = null;
  499. }
  500. }
  /**
   * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
   * Each context has its own sequence, so they can evaluate independently.
   *
   * Tuning choices:
   * - contextSize: originally 1024 (reranking chunks are ~800 tokens max, so 1024 was
   *   plenty); the default is now 4096 via RERANK_CONTEXT_SIZE, so the per-context
   *   figures below presumably predate that change - TODO re-measure
   * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
   * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
   */
  510. // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
  511. // Default 2048 was too small for longer documents (e.g. session transcripts,
  512. // CJK text, or large markdown files) — callers hit "input lengths exceed
  513. // context size" errors even after truncation because the overhead estimate
  514. // was insufficient. 4096 comfortably fits the largest real-world chunks
  515. // while staying well below the 40 960-token auto size.
  516. // Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
  517. static RERANK_CONTEXT_SIZE = (() => {
  518. const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
  519. return Number.isFinite(v) && v > 0 ? v : 4096;
  520. })();
  521. static EMBED_CONTEXT_SIZE = (() => {
  522. const v = parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10);
  523. return Number.isFinite(v) && v > 0 ? v : 2048;
  524. })();
  525. async ensureRerankContexts() {
  526. if (this.rerankContexts.length === 0) {
  527. const model = await this.ensureRerankModel();
  528. // ~960 MB per context with flash attention at contextSize 2048
  529. const n = Math.min(await this.computeParallelism(1000), 4);
  530. const threads = await this.threadsPerContext(n);
  531. for (let i = 0; i < n; i++) {
  532. try {
  533. this.rerankContexts.push(await model.createRankingContext({
  534. contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
  535. flashAttention: true,
  536. ...(threads > 0 ? { threads } : {}),
  537. }));
  538. }
  539. catch {
  540. if (this.rerankContexts.length === 0) {
  541. // Flash attention might not be supported — retry without it
  542. try {
  543. this.rerankContexts.push(await model.createRankingContext({
  544. contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
  545. ...(threads > 0 ? { threads } : {}),
  546. }));
  547. }
  548. catch {
  549. throw new Error("Failed to create any rerank context");
  550. }
  551. }
  552. break;
  553. }
  554. }
  555. }
  556. this.touchActivity();
  557. return this.rerankContexts;
  558. }
  559. // ==========================================================================
  560. // Tokenization
  561. // ==========================================================================
  562. /**
  563. * Tokenize text using the embedding model's tokenizer
  564. * Returns tokenizer tokens (opaque type from node-llama-cpp)
  565. */
  566. async tokenize(text) {
  567. await this.ensureEmbedContext(); // Ensure model is loaded
  568. if (!this.embedModel) {
  569. throw new Error("Embed model not loaded");
  570. }
  571. return this.embedModel.tokenize(text);
  572. }
  573. /**
  574. * Count tokens in text using the embedding model's tokenizer
  575. */
  576. async countTokens(text) {
  577. const tokens = await this.tokenize(text);
  578. return tokens.length;
  579. }
  580. /**
  581. * Detokenize token IDs back to text
  582. */
  583. async detokenize(tokens) {
  584. await this.ensureEmbedContext();
  585. if (!this.embedModel) {
  586. throw new Error("Embed model not loaded");
  587. }
  588. return this.embedModel.detokenize(tokens);
  589. }
  590. // ==========================================================================
  591. // Core API methods
  592. // ==========================================================================
  593. /**
  594. * Truncate text to fit within the embedding model's context window.
  595. * Uses the model's own tokenizer for accurate token counting, then
  596. * detokenizes back to text if truncation is needed.
  597. * Returns the (possibly truncated) text and whether truncation occurred.
  598. */
  599. async truncateToContextSize(text) {
  600. if (!this.embedModel)
  601. return { text, truncated: false };
  602. const maxTokens = this.embedModel.trainContextSize;
  603. if (maxTokens <= 0)
  604. return { text, truncated: false };
  605. const tokens = this.embedModel.tokenize(text);
  606. if (tokens.length <= maxTokens)
  607. return { text, truncated: false };
  608. // Leave a small margin (4 tokens) for BOS/EOS overhead
  609. const safeLimit = Math.max(1, maxTokens - 4);
  610. const truncatedTokens = tokens.slice(0, safeLimit);
  611. const truncatedText = this.embedModel.detokenize(truncatedTokens);
  612. return { text: truncatedText, truncated: true };
  613. }
  614. async embed(text, options = {}) {
  615. // Ping activity at start to keep models alive during this operation
  616. this.touchActivity();
  617. try {
  618. const context = await this.ensureEmbedContext();
  619. // Guard: truncate text that exceeds model context window to prevent GGML crash
  620. const { text: safeText, truncated } = await this.truncateToContextSize(text);
  621. if (truncated) {
  622. console.warn(`⚠ Text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
  623. }
  624. const embedding = await context.getEmbeddingFor(safeText);
  625. return {
  626. embedding: Array.from(embedding.vector),
  627. model: options.model ?? this.embedModelUri,
  628. };
  629. }
  630. catch (error) {
  631. console.error("Embedding error:", error);
  632. return null;
  633. }
  634. }
  635. /**
  636. * Batch embed multiple texts efficiently
  637. * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
  638. */
  639. async embedBatch(texts, options = {}) {
  640. if (this._ciMode)
  641. throw new Error("LLM operations are disabled in CI (set CI=true)");
  642. // Ping activity at start to keep models alive during this operation
  643. this.touchActivity();
  644. if (texts.length === 0)
  645. return [];
  646. try {
  647. const contexts = await this.ensureEmbedContexts();
  648. const n = contexts.length;
  649. if (n === 1) {
  650. // Single context: sequential (no point splitting)
  651. const context = contexts[0];
  652. const embeddings = [];
  653. for (const text of texts) {
  654. try {
  655. const { text: safeText, truncated } = await this.truncateToContextSize(text);
  656. if (truncated) {
  657. console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
  658. }
  659. const embedding = await context.getEmbeddingFor(safeText);
  660. this.touchActivity();
  661. embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
  662. }
  663. catch (err) {
  664. console.error("Embedding error for text:", err);
  665. embeddings.push(null);
  666. }
  667. }
  668. return embeddings;
  669. }
  670. // Multiple contexts: split texts across contexts for parallel evaluation
  671. const chunkSize = Math.ceil(texts.length / n);
  672. const chunks = Array.from({ length: n }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize));
  673. const chunkResults = await Promise.all(chunks.map(async (chunk, i) => {
  674. const ctx = contexts[i];
  675. const results = [];
  676. for (const text of chunk) {
  677. try {
  678. const { text: safeText, truncated } = await this.truncateToContextSize(text);
  679. if (truncated) {
  680. console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
  681. }
  682. const embedding = await ctx.getEmbeddingFor(safeText);
  683. this.touchActivity();
  684. results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
  685. }
  686. catch (err) {
  687. console.error("Embedding error for text:", err);
  688. results.push(null);
  689. }
  690. }
  691. return results;
  692. }));
  693. return chunkResults.flat();
  694. }
  695. catch (error) {
  696. console.error("Batch embedding error:", error);
  697. return texts.map(() => null);
  698. }
  699. }
  700. async generate(prompt, options = {}) {
  701. if (this._ciMode)
  702. throw new Error("LLM operations are disabled in CI (set CI=true)");
  703. // Ping activity at start to keep models alive during this operation
  704. this.touchActivity();
  705. // Ensure model is loaded
  706. await this.ensureGenerateModel();
  707. // Create fresh context -> sequence -> session for each call
  708. const context = await this.generateModel.createContext();
  709. const sequence = context.getSequence();
  710. const session = new LlamaChatSession({ contextSequence: sequence });
  711. const maxTokens = options.maxTokens ?? 150;
  712. // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
  713. // DO NOT use greedy decoding (temp=0) - causes repetition loops
  714. const temperature = options.temperature ?? 0.7;
  715. let result = "";
  716. try {
  717. await session.prompt(prompt, {
  718. maxTokens,
  719. temperature,
  720. topK: 20,
  721. topP: 0.8,
  722. onTextChunk: (text) => {
  723. result += text;
  724. },
  725. });
  726. return {
  727. text: result,
  728. model: this.generateModelUri,
  729. done: true,
  730. };
  731. }
  732. finally {
  733. // Dispose context (which disposes dependent sequences/sessions per lifecycle rules)
  734. await context.dispose();
  735. }
  736. }
  737. async modelExists(modelUri) {
  738. // For HuggingFace URIs, we assume they exist
  739. // For local paths, check if file exists
  740. if (modelUri.startsWith("hf:")) {
  741. return { name: modelUri, exists: true };
  742. }
  743. const exists = existsSync(modelUri);
  744. return {
  745. name: modelUri,
  746. exists,
  747. path: exists ? modelUri : undefined,
  748. };
  749. }
  750. // ==========================================================================
  751. // High-level abstractions
  752. // ==========================================================================
  753. async expandQuery(query, options = {}) {
  754. if (this._ciMode)
  755. throw new Error("LLM operations are disabled in CI (set CI=true)");
  756. // Ping activity at start to keep models alive during this operation
  757. this.touchActivity();
  758. const llama = await this.ensureLlama();
  759. await this.ensureGenerateModel();
  760. const includeLexical = options.includeLexical ?? true;
  761. const context = options.context;
  762. const grammar = await llama.createGrammar({
  763. grammar: `
  764. root ::= line+
  765. line ::= type ": " content "\\n"
  766. type ::= "lex" | "vec" | "hyde"
  767. content ::= [^\\n]+
  768. `
  769. });
  770. const intent = options.intent;
  771. const prompt = intent
  772. ? `/no_think Expand this search query: ${query}\nQuery intent: ${intent}`
  773. : `/no_think Expand this search query: ${query}`;
  774. // Create a bounded context for expansion to prevent large default VRAM allocations.
  775. const genContext = await this.generateModel.createContext({
  776. contextSize: this.expandContextSize,
  777. });
  778. const sequence = genContext.getSequence();
  779. const session = new LlamaChatSession({ contextSequence: sequence });
  780. try {
  781. // Qwen3 recommended settings for non-thinking mode:
  782. // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
  783. // DO NOT use greedy decoding (temp=0) - causes infinite loops
  784. const result = await session.prompt(prompt, {
  785. grammar,
  786. maxTokens: 600,
  787. temperature: 0.7,
  788. topK: 20,
  789. topP: 0.8,
  790. repeatPenalty: {
  791. lastTokens: 64,
  792. presencePenalty: 0.5,
  793. },
  794. });
  795. const lines = result.trim().split("\n");
  796. const queryLower = query.toLowerCase();
  797. const queryTerms = queryLower.replace(/[^a-z0-9\s]/g, " ").split(/\s+/).filter(Boolean);
  798. const hasQueryTerm = (text) => {
  799. const lower = text.toLowerCase();
  800. if (queryTerms.length === 0)
  801. return true;
  802. return queryTerms.some(term => lower.includes(term));
  803. };
  804. const queryables = lines.map(line => {
  805. const colonIdx = line.indexOf(":");
  806. if (colonIdx === -1)
  807. return null;
  808. const type = line.slice(0, colonIdx).trim();
  809. if (type !== 'lex' && type !== 'vec' && type !== 'hyde')
  810. return null;
  811. const text = line.slice(colonIdx + 1).trim();
  812. if (!hasQueryTerm(text))
  813. return null;
  814. return { type: type, text };
  815. }).filter((q) => q !== null);
  816. // Filter out lex entries if not requested
  817. const filtered = includeLexical ? queryables : queryables.filter(q => q.type !== 'lex');
  818. if (filtered.length > 0)
  819. return filtered;
  820. const fallback = [
  821. { type: 'hyde', text: `Information about ${query}` },
  822. { type: 'lex', text: query },
  823. { type: 'vec', text: query },
  824. ];
  825. return includeLexical ? fallback : fallback.filter(q => q.type !== 'lex');
  826. }
  827. catch (error) {
  828. console.error("Structured query expansion failed:", error);
  829. // Fallback to original query
  830. const fallback = [{ type: 'vec', text: query }];
  831. if (includeLexical)
  832. fallback.unshift({ type: 'lex', text: query });
  833. return fallback;
  834. }
  835. finally {
  836. await genContext.dispose();
  837. }
  838. }
  839. // Qwen3 reranker chat template overhead (system prompt, tags, separators).
  840. // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
  841. // the truncation budget never lets a document slip past the context limit.
  842. static RERANK_TEMPLATE_OVERHEAD = 512;
  843. static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
  844. async rerank(query, documents, options = {}) {
  845. if (this._ciMode)
  846. throw new Error("LLM operations are disabled in CI (set CI=true)");
  847. // Ping activity at start to keep models alive during this operation
  848. this.touchActivity();
  849. const contexts = await this.ensureRerankContexts();
  850. const model = await this.ensureRerankModel();
  851. // Truncate documents that would exceed the rerank context size.
  852. // Budget = contextSize - template overhead - query tokens
  853. const queryTokens = model.tokenize(query).length;
  854. const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
  855. const truncationCache = new Map();
  856. const truncatedDocs = documents.map((doc) => {
  857. const cached = truncationCache.get(doc.text);
  858. if (cached !== undefined) {
  859. return cached === doc.text ? doc : { ...doc, text: cached };
  860. }
  861. const tokens = model.tokenize(doc.text);
  862. const truncatedText = tokens.length <= maxDocTokens
  863. ? doc.text
  864. : model.detokenize(tokens.slice(0, maxDocTokens));
  865. truncationCache.set(doc.text, truncatedText);
  866. if (truncatedText === doc.text)
  867. return doc;
  868. return { ...doc, text: truncatedText };
  869. });
  870. // Deduplicate identical effective texts before scoring.
  871. // This avoids redundant work for repeated chunks and fixes collisions where
  872. // multiple docs map to the same chunk text.
  873. const textToDocs = new Map();
  874. truncatedDocs.forEach((doc, index) => {
  875. const existing = textToDocs.get(doc.text);
  876. if (existing) {
  877. existing.push({ file: doc.file, index });
  878. }
  879. else {
  880. textToDocs.set(doc.text, [{ file: doc.file, index }]);
  881. }
  882. });
  883. // Extract just the text for ranking
  884. const texts = Array.from(textToDocs.keys());
  885. // Split documents across contexts for parallel evaluation.
  886. // Each context has its own sequence with a lock, so parallelism comes
  887. // from multiple contexts evaluating different chunks simultaneously.
  888. const activeContextCount = Math.max(1, Math.min(contexts.length, Math.ceil(texts.length / LlamaCpp.RERANK_TARGET_DOCS_PER_CONTEXT)));
  889. const activeContexts = contexts.slice(0, activeContextCount);
  890. const chunkSize = Math.ceil(texts.length / activeContexts.length);
  891. const chunks = Array.from({ length: activeContexts.length }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize)).filter(chunk => chunk.length > 0);
  892. const allScores = await Promise.all(chunks.map((chunk, i) => activeContexts[i].rankAll(query, chunk)));
  893. // Reassemble scores in original order and sort
  894. const flatScores = allScores.flat();
  895. const ranked = texts
  896. .map((text, i) => ({ document: text, score: flatScores[i] }))
  897. .sort((a, b) => b.score - a.score);
  898. // Map back to our result format.
  899. const results = [];
  900. for (const item of ranked) {
  901. const docInfos = textToDocs.get(item.document) ?? [];
  902. for (const docInfo of docInfos) {
  903. results.push({
  904. file: docInfo.file,
  905. score: item.score,
  906. index: docInfo.index,
  907. });
  908. }
  909. }
  910. return {
  911. results,
  912. model: this.rerankModelUri,
  913. };
  914. }
  915. /**
  916. * Get device/GPU info for status display.
  917. * Initializes llama if not already done.
  918. */
  919. async getDeviceInfo() {
  920. const llama = await this.ensureLlama();
  921. const gpuDevices = await llama.getGpuDeviceNames();
  922. let vram;
  923. if (llama.gpu) {
  924. try {
  925. const state = await llama.getVramState();
  926. vram = { total: state.total, used: state.used, free: state.free };
  927. }
  928. catch { /* no vram info */ }
  929. }
  930. return {
  931. gpu: llama.gpu,
  932. gpuOffloading: llama.supportsGpuOffloading,
  933. gpuDevices,
  934. vram,
  935. cpuCores: llama.cpuMathCores,
  936. };
  937. }
  938. async dispose() {
  939. // Prevent double-dispose
  940. if (this.disposed) {
  941. return;
  942. }
  943. this.disposed = true;
  944. // Clear inactivity timer
  945. if (this.inactivityTimer) {
  946. clearTimeout(this.inactivityTimer);
  947. this.inactivityTimer = null;
  948. }
  949. // Disposing llama cascades to models and contexts automatically
  950. // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
  951. // Note: llama.dispose() can hang indefinitely, so we use a timeout
  952. if (this.llama) {
  953. const disposePromise = this.llama.dispose();
  954. const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 1000));
  955. await Promise.race([disposePromise, timeoutPromise]);
  956. }
  957. // Clear references
  958. this.embedContexts = [];
  959. this.rerankContexts = [];
  960. this.embedModel = null;
  961. this.generateModel = null;
  962. this.rerankModel = null;
  963. this.llama = null;
  964. // Clear any in-flight load/create promises
  965. this.embedModelLoadPromise = null;
  966. this.embedContextsCreatePromise = null;
  967. this.generateModelLoadPromise = null;
  968. this.rerankModelLoadPromise = null;
  969. }
  970. }
  971. // =============================================================================
  972. // Session Management Layer
  973. // =============================================================================
  /**
   * Reference-counting bookkeeper for LLM sessions.
   * Tracks open sessions and running operations for one LlamaCpp instance so
   * an idle unload can be skipped while either counter is non-zero.
   */
  class LLMSessionManager {
    llm;
    _activeSessionCount = 0;
    _inFlightOperations = 0;
    /**
     * @param llm The LlamaCpp instance this manager is bound to.
     */
    constructor(llm) {
      this.llm = llm;
    }
    /** Number of sessions currently holding a lease. */
    get activeSessionCount() {
      return this._activeSessionCount;
    }
    /** Number of operations currently running. */
    get inFlightOperations() {
      return this._inFlightOperations;
    }
    /**
     * Tells whether unloading is safe right now.
     * @returns true only when there are no sessions and no running operations.
     */
    canUnload() {
      const hasSessions = this._activeSessionCount !== 0;
      const hasOperations = this._inFlightOperations !== 0;
      return !hasSessions && !hasOperations;
    }
    /** Register one more active session. */
    acquire() {
      this._activeSessionCount += 1;
    }
    /** Drop one active session; the counter never goes below zero. */
    release() {
      const remaining = this._activeSessionCount - 1;
      this._activeSessionCount = remaining < 0 ? 0 : remaining;
    }
    /** Mark the start of an operation. */
    operationStart() {
      this._inFlightOperations += 1;
    }
    /** Mark the end of an operation; the counter never goes below zero. */
    operationEnd() {
      const remaining = this._inFlightOperations - 1;
      this._inFlightOperations = remaining < 0 ? 0 : remaining;
    }
    /** @returns The LlamaCpp instance passed to the constructor. */
    getLlamaCpp() {
      return this.llm;
    }
  }
  /**
   * Raised when a caller tries to use an LLM session that was already
   * released or aborted.
   */
  export class SessionReleasedError extends Error {
    name = "SessionReleasedError";
    /**
     * @param message Optional custom message; a generic one is used by default.
     */
    constructor(message = "LLM session has been released or aborted") {
      super(message);
    }
  }
  /**
   * Scoped LLM session with automatic lifecycle management.
   * Wraps LlamaCpp methods with operation tracking and abort checking.
   *
   * A session holds a lease on its manager from construction until release().
   * It becomes invalid when released, when the caller's signal aborts, or when
   * the max-duration timer fires. Abort state is checked before an operation
   * starts; an operation that is already running is not interrupted.
   */
  class LLMSession {
    manager;
    released = false;
    abortController;
    maxDurationTimer = null;
    name;
    // Caller-provided signal and the listener registered on it, kept so that
    // release() can detach the listener again.
    externalSignal = null;
    externalAbortListener = null;
    /**
     * @param manager Session manager that tracks leases and operations.
     * @param options Optional settings: `name` (default "unnamed"), `signal`
     *   (external AbortSignal to follow) and `maxDuration` in milliseconds
     *   (default 10 minutes; values <= 0 disable the timer).
     */
    constructor(manager, options = {}) {
      this.manager = manager;
      this.name = options.name || "unnamed";
      this.abortController = new AbortController();
      // Link external abort signal if provided
      const externalSignal = options.signal;
      if (externalSignal) {
        if (externalSignal.aborted) {
          this.abortController.abort(externalSignal.reason);
        }
        else {
          const onExternalAbort = () => {
            this.abortController.abort(externalSignal.reason);
          };
          externalSignal.addEventListener("abort", onExternalAbort, { once: true });
          this.externalSignal = externalSignal;
          this.externalAbortListener = onExternalAbort;
        }
      }
      // Set up max duration timer
      const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
      if (maxDuration > 0) {
        this.maxDurationTimer = setTimeout(() => {
          this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
        }, maxDuration);
        this.maxDurationTimer.unref(); // Don't keep process alive
      }
      // Acquire session lease
      this.manager.acquire();
    }
    /** True while the session is neither released nor aborted. */
    get isValid() {
      return !this.released && !this.abortController.signal.aborted;
    }
    /** Abort signal owned by this session. */
    get signal() {
      return this.abortController.signal;
    }
    /**
     * Release the session and decrement ref count.
     * Called automatically by withLLMSession when the callback completes.
     * Repeated calls are no-ops.
     */
    release() {
      if (this.released) {
        return;
      }
      this.released = true;
      if (this.maxDurationTimer) {
        clearTimeout(this.maxDurationTimer);
        this.maxDurationTimer = null;
      }
      // Detach from the caller's signal so a long-lived signal does not keep
      // accumulating listeners (and released sessions) over time.
      if (this.externalSignal && this.externalAbortListener) {
        this.externalSignal.removeEventListener("abort", this.externalAbortListener);
      }
      this.externalSignal = null;
      this.externalAbortListener = null;
      this.abortController.abort(new Error("Session released"));
      this.manager.release();
    }
    /**
     * Wrap an operation with tracking and abort checking.
     *
     * @param fn Async function to run while the operation is counted.
     * @returns Whatever `fn` resolves to.
     * @throws SessionReleasedError when the session was released (default
     *   message) or aborted (message taken from the abort reason when it has
     *   one, otherwise "Session aborted").
     */
    async withOperation(fn) {
      if (this.released) {
        throw new SessionReleasedError();
      }
      const { signal } = this.abortController;
      if (signal.aborted) {
        throw new SessionReleasedError(signal.reason?.message || "Session aborted");
      }
      this.manager.operationStart();
      try {
        return await fn();
      }
      finally {
        this.manager.operationEnd();
      }
    }
    /** Tracked wrapper around LlamaCpp `embed`. */
    async embed(text, options) {
      return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
    }
    /** Tracked wrapper around LlamaCpp `embedBatch`. */
    async embedBatch(texts, options) {
      return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
    }
    /** Tracked wrapper around LlamaCpp `expandQuery`. */
    async expandQuery(query, options) {
      return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
    }
    /** Tracked wrapper around LlamaCpp `rerank`. */
    async rerank(query, documents, options) {
      return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
    }
  }
  // Cached session manager belonging to the default LlamaCpp instance
  let defaultSessionManager = null;
  /**
   * Return the session manager bound to the current default LlamaCpp instance.
   * The cached manager is reused while it still wraps that instance; otherwise
   * a new one is created and cached.
   */
  function getSessionManager() {
    const currentLlm = getDefaultLlamaCpp();
    const cached = defaultSessionManager;
    if (cached && cached.getLlamaCpp() === currentLlm) {
      return cached;
    }
    defaultSessionManager = new LLMSessionManager(currentLlm);
    return defaultSessionManager;
  }
  /**
   * Run a callback inside a scoped LLM session bound to the default LlamaCpp
   * instance. The session is released once the callback settles, whether it
   * resolves or throws.
   *
   * @param fn Async callback that receives the session.
   * @param options Session options passed to LLMSession (e.g. `maxDuration`, `name`).
   * @returns The value the callback resolves to.
   *
   * @example
   * ```typescript
   * await withLLMSession(async (session) => {
   *   const expanded = await session.expandQuery(query);
   *   const embeddings = await session.embedBatch(texts);
   *   const reranked = await session.rerank(query, docs);
   *   return reranked;
   * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
   * ```
   */
  export async function withLLMSession(fn, options) {
    const session = new LLMSession(getSessionManager(), options);
    let outcome;
    try {
      outcome = await fn(session);
    }
    finally {
      session.release();
    }
    return outcome;
  }
  /**
   * Run a callback inside a scoped LLM session bound to the given LlamaCpp
   * instance instead of the global singleton. A new session manager is created
   * for each call, and the session is released once the callback settles.
   *
   * @param llm LlamaCpp instance the session should use.
   * @param fn Async callback that receives the session.
   * @param options Session options passed to LLMSession.
   * @returns The value the callback resolves to.
   */
  export async function withLLMSessionForLlm(llm, fn, options) {
    const session = new LLMSession(new LLMSessionManager(llm), options);
    let outcome;
    try {
      outcome = await fn(session);
    }
    finally {
      session.release();
    }
    return outcome;
  }
  /**
   * Tell whether an idle unload is currently safe for the default instance.
   * Intended for the LlamaCpp idle timer.
   *
   * @returns true when no default session manager exists yet; otherwise the
   *   manager's own `canUnload()` answer.
   */
  export function canUnloadLLM() {
    return defaultSessionManager ? defaultSessionManager.canUnload() : true;
  }
  1171. // =============================================================================
  1172. // Singleton for default LlamaCpp instance
  1173. // =============================================================================
  // Lazily created process-wide LlamaCpp instance (null until first requested).
  let defaultLlamaCpp = null;
  /**
   * Return the shared default LlamaCpp instance, constructing it on first use.
   */
  export function getDefaultLlamaCpp() {
    defaultLlamaCpp ||= new LlamaCpp();
    return defaultLlamaCpp;
  }
  /**
   * Replace the shared default LlamaCpp instance, e.g. with a test double.
   * The previous instance, if any, is not disposed by this call.
   *
   * @param llm Instance to use as the default from now on.
   */
  export function setDefaultLlamaCpp(llm) {
    defaultLlamaCpp = llm;
  }
  /**
   * Dispose the default LlamaCpp instance if it exists.
   * Call this before process exit to prevent NAPI crashes.
   *
   * The singleton reference is cleared before disposal is awaited. That way
   * nothing can obtain the instance while it is being torn down, a
   * replacement installed in the meantime is left alone, and a failed dispose
   * does not leave a disposed instance behind as the default.
   *
   * @throws Whatever the instance's `dispose()` rejects with.
   */
  export async function disposeDefaultLlamaCpp() {
    const instance = defaultLlamaCpp;
    if (!instance) {
      return;
    }
    defaultLlamaCpp = null;
    await instance.dispose();
  }