llm.js 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287
  1. /**
  2. * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
  3. *
  4. * Provides embeddings, text generation, and reranking using local GGUF models.
  5. */
  6. import { getLlama, resolveModelFile, LlamaChatSession, LlamaLogLevel, } from "node-llama-cpp";
  7. import { homedir } from "os";
  8. import { join } from "path";
  9. import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
  10. // =============================================================================
  11. // Local-LLM env-var policy (i-c28wngnd)
  12. // =============================================================================
  /**
   * Normalized (trimmed, lower-cased) env-var values that switch a
   * boolean-style flag on. The list is intentionally short so that unrelated
   * values never enable the flag.
   */
  const TRUTHY_ENV_VALUES = new Set(["1", "true", "yes", "on"]);
  /**
   * Normalized values of `QMD_LLAMA_GPU` that mean "do not use the GPU".
   */
  const QMD_LLAMA_GPU_OFF_VALUES = new Set([
    "false",
    "off",
    "none",
    "disable",
    "disabled",
    "0",
  ]);
  /**
   * Tell whether the `QMD_DISABLE_LOCAL_LLM` opt-out is active.
   *
   * The value is trimmed and lower-cased before it is compared against
   * `TRUTHY_ENV_VALUES`; an unset variable counts as "not disabled".
   *
   * @param {Record<string, string | undefined>} [env=process.env] Environment to inspect.
   * @returns {boolean} `true` only when the variable holds a truthy value.
   */
  export function isLocalLlmDisabled(env = process.env) {
    const flag = env.QMD_DISABLE_LOCAL_LLM;
    if (flag == null) {
      return false;
    }
    return TRUTHY_ENV_VALUES.has(flag.trim().toLowerCase());
  }
  /**
   * Decide which GPU mode to request from `getLlama()`.
   *
   * Precedence:
   *   1. A non-blank `QMD_LLAMA_GPU`: off-style values give "cpu"; every other
   *      value (including "auto", "true", "on" and unknown strings) gives "auto".
   *   2. Otherwise a non-blank `QMD_EMBED_ENDPOINT` gives "cpu".
   *   3. Otherwise "auto".
   *
   * @param {Record<string, string | undefined>} [env=process.env] Environment to inspect.
   * @returns {"cpu" | "auto"} The resolved mode.
   */
  export function resolveLlamaGpuMode(env = process.env) {
    const requested = env.QMD_LLAMA_GPU?.trim().toLowerCase();
    if (requested) {
      // An explicit setting always wins; anything not recognized as "off"
      // keeps the probing behavior.
      return QMD_LLAMA_GPU_OFF_VALUES.has(requested) ? "cpu" : "auto";
    }
    // No explicit setting: a configured embed endpoint selects CPU-only.
    const endpoint = env.QMD_EMBED_ENDPOINT?.trim();
    return endpoint ? "cpu" : "auto";
  }
  65. // =============================================================================
  66. // Embedding Formatting Functions
  67. // =============================================================================
  /**
   * Tell whether a model URI names a Qwen embedding model, i.e. contains both
   * "qwen" and "embed" on the same line, in either order, ignoring case.
   *
   * @param {string} modelUri Model URI or name to inspect.
   * @returns {boolean}
   */
  export function isQwen3EmbeddingModel(modelUri) {
    return /qwen.*embed|embed.*qwen/i.test(modelUri);
  }
  /**
   * Wrap a search query in the prompt format the embedding model expects.
   *
   * The model is taken from `modelUri`, then `QMD_EMBED_MODEL`, then the
   * built-in default. Qwen embedding models get the "Instruct/Query" form;
   * every other model gets the "task: search result" prefix.
   *
   * @param {string} query Raw query text.
   * @param {string} [modelUri] Model the query will be embedded with.
   * @returns {string} Prompt-formatted query.
   */
  export function formatQueryForEmbedding(query, modelUri) {
    const activeModel = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    return isQwen3EmbeddingModel(activeModel)
      ? `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`
      : `task: search result | query: ${query}`;
  }
  /**
   * Wrap a document in the prompt format the embedding model expects.
   *
   * Qwen embedding models receive plain text (title on its own first line when
   * a title is given). Other models receive `title: … | text: …`, with the
   * title replaced by "none" when it is empty or missing.
   *
   * @param {string} text Document body.
   * @param {string} [title] Optional document title.
   * @param {string} [modelUri] Model the document will be embedded with.
   * @returns {string} Prompt-formatted document.
   */
  export function formatDocForEmbedding(text, title, modelUri) {
    const activeModel = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    if (!isQwen3EmbeddingModel(activeModel)) {
      return `title: ${title || "none"} | text: ${text}`;
    }
    // Qwen: no task prefix, just the raw text.
    return title ? `${title}\n${text}` : text;
  }
  100. // =============================================================================
  101. // Model Configuration
  102. // =============================================================================
  // Models are addressed with node-llama-cpp HuggingFace URIs of the form
  //   hf:<user>/<repo>/<file>
  // The embedding default can be overridden with the QMD_EMBED_MODEL env var
  // (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf).
  const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
  const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  // Generation model used for query expansion. An earlier candidate, kept for
  // reference only: hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf
  const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
  // Alternative generation models for query expansion (LiquidAI LFM2 family).
  // The original notes describe them as bases for fine-tuning with
  // configs/sft_lfm2.yaml.
  export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
  export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
  // Public aliases of the private defaults above.
  export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
  export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
  export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
  // Local model cache: <XDG_CACHE_HOME>/qmd/models when XDG_CACHE_HOME is set
  // to a non-empty value, otherwise ~/.cache/qmd/models.
  const MODEL_CACHE_DIR = join(
    process.env.XDG_CACHE_HOME || join(homedir(), ".cache"),
    "qmd",
    "models",
  );
  export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
  /**
   * Split an `hf:<user>/<repo>/<file>` model URI into its repository and file.
   *
   * Anything after the second slash is treated as the file path, so files in
   * sub-directories are supported. URIs with an empty user, repo or file
   * segment (e.g. `hf:user/repo/`) are rejected instead of producing a
   * reference that would build a malformed download URL.
   *
   * @param {string} model Model URI.
   * @returns {{ repo: string, file: string } | null} Parsed reference, or
   *   `null` when `model` is not a complete `hf:` URI.
   */
  function parseHfUri(model) {
    if (!model.startsWith("hf:")) {
      return null;
    }
    const [user, repoName, ...fileParts] = model.slice(3).split("/");
    const file = fileParts.join("/");
    if (!user || !repoName || !file) {
      return null;
    }
    return { repo: `${user}/${repoName}`, file };
  }
  /**
   * Look up the current ETag of a HuggingFace model file with a HEAD request
   * against `https://huggingface.co/<repo>/resolve/main/<file>`.
   *
   * @param {{ repo: string, file: string }} ref Repository and file to query.
   * @returns {Promise<string | null>} The `etag` response header, or `null`
   *   when the request throws, the response is not OK, or the header is
   *   missing or empty.
   */
  async function getRemoteEtag(ref) {
    const { repo, file } = ref;
    const target = `https://huggingface.co/${repo}/resolve/main/${file}`;
    try {
      const response = await fetch(target, { method: "HEAD" });
      return response.ok ? response.headers.get("etag") || null : null;
    }
    catch {
      // Any failure is treated as "ETag unknown".
      return null;
    }
  }
  /**
   * Resolve (downloading when needed) each model into the local cache and
   * report where it ended up.
   *
   * For `hf:` models an `<filename>.etag` sidecar file records the remote ETag
   * of the cached copy. Cached copies are deleted and fetched again when
   * `options.refresh` is set, when no copy is cached, or when the remote ETag
   * is known and differs from the recorded one. When the remote ETag cannot be
   * determined (e.g. the HEAD request fails) the cached copy is kept, so a
   * network failure does not delete a model that cannot be downloaded again.
   *
   * @param {string[]} models Model URIs or paths to resolve.
   * @param {{ cacheDir?: string, refresh?: boolean }} [options]
   *   `cacheDir` overrides the default cache directory; `refresh` forces
   *   cached copies to be removed first.
   * @returns {Promise<Array<{ model: string, path: string, sizeBytes: number, refreshed: boolean }>>}
   *   One entry per model; `refreshed` is true when an existing cached copy
   *   was removed before resolving.
   */
  export async function pullModels(models, options = {}) {
    const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
    if (!existsSync(cacheDir)) {
      mkdirSync(cacheDir, { recursive: true });
    }
    const removeIfPresent = (file) => {
      if (existsSync(file)) {
        unlinkSync(file);
      }
    };
    const results = [];
    for (const model of models) {
      let refreshed = false;
      const hfRef = parseHfUri(model);
      const filename = model.split("/").pop();
      const etagPath = filename ? join(cacheDir, `${filename}.etag`) : null;
      // Cached copies are cache files whose name contains the model filename.
      // The ETag sidecar files are bookkeeping, not model copies, so they must
      // not count towards "is anything cached" or the `refreshed` flag.
      const cached = filename
        ? readdirSync(cacheDir, { withFileTypes: true })
          .filter((entry) => entry.isFile()
            && entry.name.includes(filename)
            && !entry.name.endsWith(".etag"))
          .map((entry) => join(cacheDir, entry.name))
        : [];
      let remoteEtag = null;
      if (hfRef && filename) {
        remoteEtag = await getRemoteEtag(hfRef);
        const localEtag = existsSync(etagPath)
          ? readFileSync(etagPath, "utf-8").trim()
          : null;
        // An unknown remote ETag is not evidence that the cache is stale.
        const isStale = remoteEtag !== null && remoteEtag !== localEtag;
        if (options.refresh || isStale || cached.length === 0) {
          cached.forEach(removeIfPresent);
          removeIfPresent(etagPath);
          refreshed = cached.length > 0;
        }
      }
      else if (options.refresh && filename) {
        cached.forEach(removeIfPresent);
        refreshed = cached.length > 0;
      }
      const path = await resolveModelFile(model, cacheDir);
      const sizeBytes = existsSync(path) ? statSync(path).size : 0;
      if (hfRef && filename) {
        // Prefer the ETag observed BEFORE resolving: if the remote file changes
        // mid-download this records the older tag, so the next pull re-checks
        // rather than trusting a file that may be stale. Only ask again when
        // the first lookup failed.
        const etagToStore = remoteEtag ?? (await getRemoteEtag(hfRef));
        if (etagToStore) {
          writeFileSync(etagPath, etagToStore + "\n", "utf-8");
        }
      }
      results.push({ model, path, sizeBytes, refreshed });
    }
    return results;
  }
  200. /**
  201. * LLM implementation using node-llama-cpp
  202. */
  // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
  const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
  // Expand context size used when neither the config nor the environment sets one.
  const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;
  /**
   * Resolve the expand context size from, in order: the explicit config value,
   * the `QMD_EXPAND_CONTEXT_SIZE` env var, the built-in default.
   *
   * The env var must consist solely of decimal digits (optionally preceded by
   * `+`). Values such as "4e3" or "12abc" are rejected with a warning instead
   * of being silently read as 4 or 12, which is what a bare `parseInt` does.
   *
   * @param {number} [configValue] Explicit size from the caller's config.
   * @returns {number} A positive integer context size.
   * @throws {Error} When `configValue` is given but is not a positive integer.
   */
  function resolveExpandContextSize(configValue) {
    if (configValue !== undefined) {
      if (!Number.isInteger(configValue) || configValue <= 0) {
        throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
      }
      return configValue;
    }
    const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
    if (!envValue) {
      return DEFAULT_EXPAND_CONTEXT_SIZE;
    }
    const parsed = /^\+?\d+$/.test(envValue) ? Number.parseInt(envValue, 10) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed <= 0) {
      // A bad env value is a configuration slip, not a programming error:
      // warn and carry on with the default.
      process.stderr.write(`QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`);
      return DEFAULT_EXPAND_CONTEXT_SIZE;
    }
    return parsed;
  }
  223. export class LlamaCpp {
  224. _ciMode = !!process.env.CI;
  225. llama = null;
  226. embedModel = null;
  227. embedContexts = [];
  228. generateModel = null;
  229. rerankModel = null;
  230. rerankContexts = [];
  231. embedModelUri;
  232. generateModelUri;
  233. rerankModelUri;
  234. modelCacheDir;
  235. expandContextSize;
  236. // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
  237. embedModelLoadPromise = null;
  238. generateModelLoadPromise = null;
  239. rerankModelLoadPromise = null;
  240. // Inactivity timer for auto-unloading models
  241. inactivityTimer = null;
  242. inactivityTimeoutMs;
  243. disposeModelsOnInactivity;
  244. // Track disposal state to prevent double-dispose
  245. disposed = false;
  246. constructor(config = {}) {
  247. this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
  248. this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
  249. this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
  250. this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
  251. this.expandContextSize = resolveExpandContextSize(config.expandContextSize);
  252. this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
  253. this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
  254. }
  255. get embedModelName() {
  256. return this.embedModelUri;
  257. }
  258. /**
  259. * Reset the inactivity timer. Called after each model operation.
  260. * When timer fires, models are unloaded to free memory (if no active sessions).
  261. */
  262. touchActivity() {
  263. // Clear existing timer
  264. if (this.inactivityTimer) {
  265. clearTimeout(this.inactivityTimer);
  266. this.inactivityTimer = null;
  267. }
  268. // Only set timer if we have disposable contexts and timeout is enabled
  269. if (this.inactivityTimeoutMs > 0 && this.hasLoadedContexts()) {
  270. this.inactivityTimer = setTimeout(() => {
  271. // Check if session manager allows unloading
  272. // canUnloadLLM is defined later in this file - it checks the session manager
  273. // We use dynamic import pattern to avoid circular dependency issues
  274. if (typeof canUnloadLLM === 'function' && !canUnloadLLM()) {
  275. // Active sessions/operations - reschedule timer
  276. this.touchActivity();
  277. return;
  278. }
  279. this.unloadIdleResources().catch(err => {
  280. console.error("Error unloading idle resources:", err);
  281. });
  282. }, this.inactivityTimeoutMs);
  283. // Don't keep process alive just for this timer
  284. this.inactivityTimer.unref();
  285. }
  286. }
  287. /**
  288. * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
  289. */
  290. hasLoadedContexts() {
  291. return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
  292. }
  293. /**
  294. * Unload idle resources but keep the instance alive for future use.
  295. *
  296. * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
  297. * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
  298. */
  299. async unloadIdleResources() {
  300. // Don't unload if already disposed
  301. if (this.disposed) {
  302. return;
  303. }
  304. // Clear timer
  305. if (this.inactivityTimer) {
  306. clearTimeout(this.inactivityTimer);
  307. this.inactivityTimer = null;
  308. }
  309. // Dispose contexts first
  310. for (const ctx of this.embedContexts) {
  311. await ctx.dispose();
  312. }
  313. this.embedContexts = [];
  314. for (const ctx of this.rerankContexts) {
  315. await ctx.dispose();
  316. }
  317. this.rerankContexts = [];
  318. // Optionally dispose models too (opt-in)
  319. if (this.disposeModelsOnInactivity) {
  320. if (this.embedModel) {
  321. await this.embedModel.dispose();
  322. this.embedModel = null;
  323. }
  324. if (this.generateModel) {
  325. await this.generateModel.dispose();
  326. this.generateModel = null;
  327. }
  328. if (this.rerankModel) {
  329. await this.rerankModel.dispose();
  330. this.rerankModel = null;
  331. }
  332. // Reset load promises so models can be reloaded later
  333. this.embedModelLoadPromise = null;
  334. this.generateModelLoadPromise = null;
  335. this.rerankModelLoadPromise = null;
  336. }
  337. // Note: We keep llama instance alive - it's lightweight
  338. }
  339. /**
  340. * Ensure model cache directory exists
  341. */
  342. ensureModelCacheDir() {
  343. if (!existsSync(this.modelCacheDir)) {
  344. mkdirSync(this.modelCacheDir, { recursive: true });
  345. }
  346. }
  347. /**
  348. * Initialize the llama instance (lazy)
  349. *
  350. * Env-var controls (i-c28wngnd):
  351. * - QMD_DISABLE_LOCAL_LLM=1 : hard-disable; throws on first ensureLlama()
  352. * call. Use when the deployment must NEVER
  353. * load node-llama-cpp (e.g. headless cron
  354. * on a host without libvulkan-dev/glslc).
  355. * - QMD_LLAMA_GPU=off|none|... : force CPU-only (skip Vulkan probe).
  356. * - QMD_LLAMA_GPU=auto : explicit opt-in to GPU probe even when
  357. * QMD_EMBED_ENDPOINT is set (rare; useful
  358. * for hybrid local-rerank + remote-embed).
  359. *
  360. * Auto-detect: when QMD_EMBED_ENDPOINT is set (HTTP embed provider, e.g.
  361. * cron on `code` → ai.mm.mk → models:8082), we default to CPU-only because
  362. * the embed path runs over HTTP and the only remaining local LLM consumers
  363. * are rerank/query-expansion, which work fine on the prebuilt CPU binary
  364. * and never need to invoke cmake-js-llama. This silences ~30s/run of
  365. * Vulkan probe + cmake noise on headless LXCs.
  366. */
  367. async ensureLlama() {
  368. if (!this.llama) {
  369. // Hard-disable opt-out — fails fast so the caller knows. Throw early
  370. // so any path that ignores the documented `EmbeddingProvider` route
  371. // and reaches for the local LLM gets a loud, actionable error rather
  372. // than a silent 30s Vulkan compile attempt.
  373. if (isLocalLlmDisabled(process.env)) {
  374. throw new Error("QMD_DISABLE_LOCAL_LLM=1 — local node-llama-cpp is disabled. " +
  375. "This deployment is configured for remote embeddings only; the " +
  376. "code path that reached `ensureLlama()` should route through an " +
  377. "EmbeddingProvider (set QMD_EMBED_ENDPOINT) instead. Unset " +
  378. "QMD_DISABLE_LOCAL_LLM to re-enable local rerank/expand.");
  379. }
  380. // Resolve GPU mode: explicit QMD_LLAMA_GPU wins, else auto-detect
  381. // remote-only deployment (CPU when QMD_EMBED_ENDPOINT is set), else
  382. // probe GPU normally for legacy local-only setups.
  383. const gpuMode = resolveLlamaGpuMode(process.env);
  384. const loadLlama = async (gpu) => await getLlama({
  385. build: "autoAttempt",
  386. logLevel: LlamaLogLevel.error,
  387. gpu,
  388. });
  389. let llama;
  390. if (gpuMode === "cpu") {
  391. llama = await loadLlama(false);
  392. }
  393. else {
  394. try {
  395. llama = await loadLlama("auto");
  396. }
  397. catch (err) {
  398. // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
  399. // Fall back to CPU so qmd still works.
  400. process.stderr.write(`QMD Warning: GPU init failed (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`);
  401. llama = await loadLlama(false);
  402. }
  403. }
  404. // Suppress the "running on CPU (slow)" warning when CPU was requested
  405. // explicitly or auto-selected for a remote-only deployment — there's
  406. // nothing the operator can do about it and the hint isn't relevant
  407. // (embed runs via HTTP; only rerank/expand use the local CPU path).
  408. if (llama.gpu === false && gpuMode === "auto") {
  409. process.stderr.write("QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n");
  410. }
  411. this.llama = llama;
  412. }
  413. return this.llama;
  414. }
  415. /**
  416. * Resolve a model URI to a local path, downloading if needed
  417. */
  418. async resolveModel(modelUri) {
  419. this.ensureModelCacheDir();
  420. // resolveModelFile handles HF URIs and downloads to the cache dir
  421. return await resolveModelFile(modelUri, this.modelCacheDir);
  422. }
  423. /**
  424. * Load embedding model (lazy)
  425. */
  426. async ensureEmbedModel() {
  427. if (this.embedModel) {
  428. return this.embedModel;
  429. }
  430. if (this.embedModelLoadPromise) {
  431. return await this.embedModelLoadPromise;
  432. }
  433. this.embedModelLoadPromise = (async () => {
  434. const llama = await this.ensureLlama();
  435. const modelPath = await this.resolveModel(this.embedModelUri);
  436. const model = await llama.loadModel({ modelPath });
  437. this.embedModel = model;
  438. // Model loading counts as activity - ping to keep alive
  439. this.touchActivity();
  440. return model;
  441. })();
  442. try {
  443. return await this.embedModelLoadPromise;
  444. }
  445. finally {
  446. // Keep the resolved model cached; clear only the in-flight promise.
  447. this.embedModelLoadPromise = null;
  448. }
  449. }
  450. /**
  451. * Compute how many parallel contexts to create.
  452. *
  453. * GPU: constrained by VRAM (25% of free, capped at 8).
  454. * CPU: constrained by cores. Splitting threads across contexts enables
  455. * true parallelism (each context runs on its own cores). Use at most
  456. * half the math cores, with at least 4 threads per context.
  457. */
  458. async computeParallelism(perContextMB) {
  459. const llama = await this.ensureLlama();
  460. if (llama.gpu) {
  461. try {
  462. const vram = await llama.getVramState();
  463. const freeMB = vram.free / (1024 * 1024);
  464. const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
  465. return Math.max(1, Math.min(8, maxByVram));
  466. }
  467. catch {
  468. return 2;
  469. }
  470. }
  471. // CPU: split cores across contexts. At least 4 threads per context.
  472. const cores = llama.cpuMathCores || 4;
  473. const maxContexts = Math.floor(cores / 4);
  474. return Math.max(1, Math.min(4, maxContexts));
  475. }
  476. /**
  477. * Get the number of threads each context should use, given N parallel contexts.
  478. * Splits available math cores evenly across contexts.
  479. */
  480. async threadsPerContext(parallelism) {
  481. const llama = await this.ensureLlama();
  482. if (llama.gpu)
  483. return 0; // GPU: let the library decide
  484. const cores = llama.cpuMathCores || 4;
  485. return Math.max(1, Math.floor(cores / parallelism));
  486. }
  487. /**
  488. * Load embedding contexts (lazy). Creates multiple for parallel embedding.
  489. * Uses promise guard to prevent concurrent context creation race condition.
  490. */
  491. embedContextsCreatePromise = null;
  492. async ensureEmbedContexts() {
  493. if (this.embedContexts.length > 0) {
  494. this.touchActivity();
  495. return this.embedContexts;
  496. }
  497. if (this.embedContextsCreatePromise) {
  498. return await this.embedContextsCreatePromise;
  499. }
  500. this.embedContextsCreatePromise = (async () => {
  501. const model = await this.ensureEmbedModel();
  502. // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
  503. const n = await this.computeParallelism(150);
  504. const threads = await this.threadsPerContext(n);
  505. for (let i = 0; i < n; i++) {
  506. try {
  507. this.embedContexts.push(await model.createEmbeddingContext({
  508. contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
  509. ...(threads > 0 ? { threads } : {}),
  510. }));
  511. }
  512. catch {
  513. if (this.embedContexts.length === 0)
  514. throw new Error("Failed to create any embedding context");
  515. break;
  516. }
  517. }
  518. this.touchActivity();
  519. return this.embedContexts;
  520. })();
  521. try {
  522. return await this.embedContextsCreatePromise;
  523. }
  524. finally {
  525. this.embedContextsCreatePromise = null;
  526. }
  527. }
  528. /**
  529. * Get a single embed context (for single-embed calls). Uses first from pool.
  530. */
  531. async ensureEmbedContext() {
  532. const contexts = await this.ensureEmbedContexts();
  533. return contexts[0];
  534. }
  535. /**
  536. * Load generation model (lazy) - context is created fresh per call
  537. */
  538. async ensureGenerateModel() {
  539. if (!this.generateModel) {
  540. if (this.generateModelLoadPromise) {
  541. return await this.generateModelLoadPromise;
  542. }
  543. this.generateModelLoadPromise = (async () => {
  544. const llama = await this.ensureLlama();
  545. const modelPath = await this.resolveModel(this.generateModelUri);
  546. const model = await llama.loadModel({ modelPath });
  547. this.generateModel = model;
  548. return model;
  549. })();
  550. try {
  551. await this.generateModelLoadPromise;
  552. }
  553. finally {
  554. this.generateModelLoadPromise = null;
  555. }
  556. }
  557. this.touchActivity();
  558. if (!this.generateModel) {
  559. throw new Error("Generate model not loaded");
  560. }
  561. return this.generateModel;
  562. }
  563. /**
  564. * Load rerank model (lazy)
  565. */
  566. async ensureRerankModel() {
  567. if (this.rerankModel) {
  568. return this.rerankModel;
  569. }
  570. if (this.rerankModelLoadPromise) {
  571. return await this.rerankModelLoadPromise;
  572. }
  573. this.rerankModelLoadPromise = (async () => {
  574. const llama = await this.ensureLlama();
  575. const modelPath = await this.resolveModel(this.rerankModelUri);
  576. const model = await llama.loadModel({ modelPath });
  577. this.rerankModel = model;
  578. // Model loading counts as activity - ping to keep alive
  579. this.touchActivity();
  580. return model;
  581. })();
  582. try {
  583. return await this.rerankModelLoadPromise;
  584. }
  585. finally {
  586. this.rerankModelLoadPromise = null;
  587. }
  588. }
  589. /**
  590. * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
  591. * Each context has its own sequence, so they can evaluate independently.
  592. *
  593. * Tuning choices:
  594. * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
  595. * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
  596. * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
  597. */
  598. // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
  599. // Default 2048 was too small for longer documents (e.g. session transcripts,
  600. // CJK text, or large markdown files) — callers hit "input lengths exceed
  601. // context size" errors even after truncation because the overhead estimate
  602. // was insufficient. 4096 comfortably fits the largest real-world chunks
  603. // while staying well below the 40 960-token auto size.
  604. // Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
  605. static RERANK_CONTEXT_SIZE = (() => {
  606. const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
  607. return Number.isFinite(v) && v > 0 ? v : 4096;
  608. })();
  609. static EMBED_CONTEXT_SIZE = (() => {
  610. const v = parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10);
  611. return Number.isFinite(v) && v > 0 ? v : 2048;
  612. })();
  613. async ensureRerankContexts() {
  614. if (this.rerankContexts.length === 0) {
  615. const model = await this.ensureRerankModel();
  616. // ~960 MB per context with flash attention at contextSize 2048
  617. const n = Math.min(await this.computeParallelism(1000), 4);
  618. const threads = await this.threadsPerContext(n);
  619. for (let i = 0; i < n; i++) {
  620. try {
  621. this.rerankContexts.push(await model.createRankingContext({
  622. contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
  623. flashAttention: true,
  624. ...(threads > 0 ? { threads } : {}),
  625. }));
  626. }
  627. catch {
  628. if (this.rerankContexts.length === 0) {
  629. // Flash attention might not be supported — retry without it
  630. try {
  631. this.rerankContexts.push(await model.createRankingContext({
  632. contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
  633. ...(threads > 0 ? { threads } : {}),
  634. }));
  635. }
  636. catch {
  637. throw new Error("Failed to create any rerank context");
  638. }
  639. }
  640. break;
  641. }
  642. }
  643. }
  644. this.touchActivity();
  645. return this.rerankContexts;
  646. }
  647. // ==========================================================================
  648. // Tokenization
  649. // ==========================================================================
  650. /**
  651. * Tokenize text using the embedding model's tokenizer
  652. * Returns tokenizer tokens (opaque type from node-llama-cpp)
  653. */
  654. async tokenize(text) {
  655. await this.ensureEmbedContext(); // Ensure model is loaded
  656. if (!this.embedModel) {
  657. throw new Error("Embed model not loaded");
  658. }
  659. return this.embedModel.tokenize(text);
  660. }
  661. /**
  662. * Count tokens in text using the embedding model's tokenizer
  663. */
  664. async countTokens(text) {
  665. const tokens = await this.tokenize(text);
  666. return tokens.length;
  667. }
  668. /**
  669. * Detokenize token IDs back to text
  670. */
  671. async detokenize(tokens) {
  672. await this.ensureEmbedContext();
  673. if (!this.embedModel) {
  674. throw new Error("Embed model not loaded");
  675. }
  676. return this.embedModel.detokenize(tokens);
  677. }
  678. // ==========================================================================
  679. // Core API methods
  680. // ==========================================================================
  681. /**
  682. * Truncate text to fit within the embedding model's context window.
  683. * Uses the model's own tokenizer for accurate token counting, then
  684. * detokenizes back to text if truncation is needed.
  685. * Returns the (possibly truncated) text and whether truncation occurred.
  686. */
  687. async truncateToContextSize(text) {
  688. if (!this.embedModel)
  689. return { text, truncated: false };
  690. const maxTokens = this.embedModel.trainContextSize;
  691. if (maxTokens <= 0)
  692. return { text, truncated: false };
  693. const tokens = this.embedModel.tokenize(text);
  694. if (tokens.length <= maxTokens)
  695. return { text, truncated: false };
  696. // Leave a small margin (4 tokens) for BOS/EOS overhead
  697. const safeLimit = Math.max(1, maxTokens - 4);
  698. const truncatedTokens = tokens.slice(0, safeLimit);
  699. const truncatedText = this.embedModel.detokenize(truncatedTokens);
  700. return { text: truncatedText, truncated: true };
  701. }
  702. async embed(text, options = {}) {
  703. // Ping activity at start to keep models alive during this operation
  704. this.touchActivity();
  705. try {
  706. const context = await this.ensureEmbedContext();
  707. // Guard: truncate text that exceeds model context window to prevent GGML crash
  708. const { text: safeText, truncated } = await this.truncateToContextSize(text);
  709. if (truncated) {
  710. console.warn(`⚠ Text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
  711. }
  712. const embedding = await context.getEmbeddingFor(safeText);
  713. return {
  714. embedding: Array.from(embedding.vector),
  715. model: options.model ?? this.embedModelUri,
  716. };
  717. }
  718. catch (error) {
  719. console.error("Embedding error:", error);
  720. return null;
  721. }
  722. }
  723. /**
  724. * Batch embed multiple texts efficiently
  725. * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
  726. */
  727. async embedBatch(texts, options = {}) {
  728. if (this._ciMode)
  729. throw new Error("LLM operations are disabled in CI (set CI=true)");
  730. // Ping activity at start to keep models alive during this operation
  731. this.touchActivity();
  732. if (texts.length === 0)
  733. return [];
  734. try {
  735. const contexts = await this.ensureEmbedContexts();
  736. const n = contexts.length;
  737. if (n === 1) {
  738. // Single context: sequential (no point splitting)
  739. const context = contexts[0];
  740. const embeddings = [];
  741. for (const text of texts) {
  742. try {
  743. const { text: safeText, truncated } = await this.truncateToContextSize(text);
  744. if (truncated) {
  745. console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
  746. }
  747. const embedding = await context.getEmbeddingFor(safeText);
  748. this.touchActivity();
  749. embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
  750. }
  751. catch (err) {
  752. console.error("Embedding error for text:", err);
  753. embeddings.push(null);
  754. }
  755. }
  756. return embeddings;
  757. }
  758. // Multiple contexts: split texts across contexts for parallel evaluation
  759. const chunkSize = Math.ceil(texts.length / n);
  760. const chunks = Array.from({ length: n }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize));
  761. const chunkResults = await Promise.all(chunks.map(async (chunk, i) => {
  762. const ctx = contexts[i];
  763. const results = [];
  764. for (const text of chunk) {
  765. try {
  766. const { text: safeText, truncated } = await this.truncateToContextSize(text);
  767. if (truncated) {
  768. console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
  769. }
  770. const embedding = await ctx.getEmbeddingFor(safeText);
  771. this.touchActivity();
  772. results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
  773. }
  774. catch (err) {
  775. console.error("Embedding error for text:", err);
  776. results.push(null);
  777. }
  778. }
  779. return results;
  780. }));
  781. return chunkResults.flat();
  782. }
  783. catch (error) {
  784. console.error("Batch embedding error:", error);
  785. return texts.map(() => null);
  786. }
  787. }
  788. async generate(prompt, options = {}) {
  789. if (this._ciMode)
  790. throw new Error("LLM operations are disabled in CI (set CI=true)");
  791. // Ping activity at start to keep models alive during this operation
  792. this.touchActivity();
  793. // Ensure model is loaded
  794. await this.ensureGenerateModel();
  795. // Create fresh context -> sequence -> session for each call
  796. const context = await this.generateModel.createContext();
  797. const sequence = context.getSequence();
  798. const session = new LlamaChatSession({ contextSequence: sequence });
  799. const maxTokens = options.maxTokens ?? 150;
  800. // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
  801. // DO NOT use greedy decoding (temp=0) - causes repetition loops
  802. const temperature = options.temperature ?? 0.7;
  803. let result = "";
  804. try {
  805. await session.prompt(prompt, {
  806. maxTokens,
  807. temperature,
  808. topK: 20,
  809. topP: 0.8,
  810. onTextChunk: (text) => {
  811. result += text;
  812. },
  813. });
  814. return {
  815. text: result,
  816. model: this.generateModelUri,
  817. done: true,
  818. };
  819. }
  820. finally {
  821. // Dispose context (which disposes dependent sequences/sessions per lifecycle rules)
  822. await context.dispose();
  823. }
  824. }
  825. async modelExists(modelUri) {
  826. // For HuggingFace URIs, we assume they exist
  827. // For local paths, check if file exists
  828. if (modelUri.startsWith("hf:")) {
  829. return { name: modelUri, exists: true };
  830. }
  831. const exists = existsSync(modelUri);
  832. return {
  833. name: modelUri,
  834. exists,
  835. path: exists ? modelUri : undefined,
  836. };
  837. }
  838. // ==========================================================================
  839. // High-level abstractions
  840. // ==========================================================================
  841. async expandQuery(query, options = {}) {
  842. if (this._ciMode)
  843. throw new Error("LLM operations are disabled in CI (set CI=true)");
  844. // Ping activity at start to keep models alive during this operation
  845. this.touchActivity();
  846. const llama = await this.ensureLlama();
  847. await this.ensureGenerateModel();
  848. const includeLexical = options.includeLexical ?? true;
  849. const context = options.context;
  850. const grammar = await llama.createGrammar({
  851. grammar: `
  852. root ::= line+
  853. line ::= type ": " content "\\n"
  854. type ::= "lex" | "vec" | "hyde"
  855. content ::= [^\\n]+
  856. `
  857. });
  858. const intent = options.intent;
  859. const prompt = intent
  860. ? `/no_think Expand this search query: ${query}\nQuery intent: ${intent}`
  861. : `/no_think Expand this search query: ${query}`;
  862. // Create a bounded context for expansion to prevent large default VRAM allocations.
  863. const genContext = await this.generateModel.createContext({
  864. contextSize: this.expandContextSize,
  865. });
  866. const sequence = genContext.getSequence();
  867. const session = new LlamaChatSession({ contextSequence: sequence });
  868. try {
  869. // Qwen3 recommended settings for non-thinking mode:
  870. // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
  871. // DO NOT use greedy decoding (temp=0) - causes infinite loops
  872. const result = await session.prompt(prompt, {
  873. grammar,
  874. maxTokens: 600,
  875. temperature: 0.7,
  876. topK: 20,
  877. topP: 0.8,
  878. repeatPenalty: {
  879. lastTokens: 64,
  880. presencePenalty: 0.5,
  881. },
  882. });
  883. const lines = result.trim().split("\n");
  884. const queryLower = query.toLowerCase();
  885. const queryTerms = queryLower.replace(/[^a-z0-9\s]/g, " ").split(/\s+/).filter(Boolean);
  886. const hasQueryTerm = (text) => {
  887. const lower = text.toLowerCase();
  888. if (queryTerms.length === 0)
  889. return true;
  890. return queryTerms.some(term => lower.includes(term));
  891. };
  892. const queryables = lines.map(line => {
  893. const colonIdx = line.indexOf(":");
  894. if (colonIdx === -1)
  895. return null;
  896. const type = line.slice(0, colonIdx).trim();
  897. if (type !== 'lex' && type !== 'vec' && type !== 'hyde')
  898. return null;
  899. const text = line.slice(colonIdx + 1).trim();
  900. if (!hasQueryTerm(text))
  901. return null;
  902. return { type: type, text };
  903. }).filter((q) => q !== null);
  904. // Filter out lex entries if not requested
  905. const filtered = includeLexical ? queryables : queryables.filter(q => q.type !== 'lex');
  906. if (filtered.length > 0)
  907. return filtered;
  908. const fallback = [
  909. { type: 'hyde', text: `Information about ${query}` },
  910. { type: 'lex', text: query },
  911. { type: 'vec', text: query },
  912. ];
  913. return includeLexical ? fallback : fallback.filter(q => q.type !== 'lex');
  914. }
  915. catch (error) {
  916. console.error("Structured query expansion failed:", error);
  917. // Fallback to original query
  918. const fallback = [{ type: 'vec', text: query }];
  919. if (includeLexical)
  920. fallback.unshift({ type: 'lex', text: query });
  921. return fallback;
  922. }
  923. finally {
  924. await genContext.dispose();
  925. }
  926. }
  927. // Qwen3 reranker chat template overhead (system prompt, tags, separators).
  928. // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
  929. // the truncation budget never lets a document slip past the context limit.
  930. static RERANK_TEMPLATE_OVERHEAD = 512;
  931. static RERANK_TARGET_DOCS_PER_CONTEXT = 10;
  932. async rerank(query, documents, options = {}) {
  933. if (this._ciMode)
  934. throw new Error("LLM operations are disabled in CI (set CI=true)");
  935. // Ping activity at start to keep models alive during this operation
  936. this.touchActivity();
  937. const contexts = await this.ensureRerankContexts();
  938. const model = await this.ensureRerankModel();
  939. // Truncate documents that would exceed the rerank context size.
  940. // Budget = contextSize - template overhead - query tokens
  941. const queryTokens = model.tokenize(query).length;
  942. const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
  943. const truncationCache = new Map();
  944. const truncatedDocs = documents.map((doc) => {
  945. const cached = truncationCache.get(doc.text);
  946. if (cached !== undefined) {
  947. return cached === doc.text ? doc : { ...doc, text: cached };
  948. }
  949. const tokens = model.tokenize(doc.text);
  950. const truncatedText = tokens.length <= maxDocTokens
  951. ? doc.text
  952. : model.detokenize(tokens.slice(0, maxDocTokens));
  953. truncationCache.set(doc.text, truncatedText);
  954. if (truncatedText === doc.text)
  955. return doc;
  956. return { ...doc, text: truncatedText };
  957. });
  958. // Deduplicate identical effective texts before scoring.
  959. // This avoids redundant work for repeated chunks and fixes collisions where
  960. // multiple docs map to the same chunk text.
  961. const textToDocs = new Map();
  962. truncatedDocs.forEach((doc, index) => {
  963. const existing = textToDocs.get(doc.text);
  964. if (existing) {
  965. existing.push({ file: doc.file, index });
  966. }
  967. else {
  968. textToDocs.set(doc.text, [{ file: doc.file, index }]);
  969. }
  970. });
  971. // Extract just the text for ranking
  972. const texts = Array.from(textToDocs.keys());
  973. // Split documents across contexts for parallel evaluation.
  974. // Each context has its own sequence with a lock, so parallelism comes
  975. // from multiple contexts evaluating different chunks simultaneously.
  976. const activeContextCount = Math.max(1, Math.min(contexts.length, Math.ceil(texts.length / LlamaCpp.RERANK_TARGET_DOCS_PER_CONTEXT)));
  977. const activeContexts = contexts.slice(0, activeContextCount);
  978. const chunkSize = Math.ceil(texts.length / activeContexts.length);
  979. const chunks = Array.from({ length: activeContexts.length }, (_, i) => texts.slice(i * chunkSize, (i + 1) * chunkSize)).filter(chunk => chunk.length > 0);
  980. const allScores = await Promise.all(chunks.map((chunk, i) => activeContexts[i].rankAll(query, chunk)));
  981. // Reassemble scores in original order and sort
  982. const flatScores = allScores.flat();
  983. const ranked = texts
  984. .map((text, i) => ({ document: text, score: flatScores[i] }))
  985. .sort((a, b) => b.score - a.score);
  986. // Map back to our result format.
  987. const results = [];
  988. for (const item of ranked) {
  989. const docInfos = textToDocs.get(item.document) ?? [];
  990. for (const docInfo of docInfos) {
  991. results.push({
  992. file: docInfo.file,
  993. score: item.score,
  994. index: docInfo.index,
  995. });
  996. }
  997. }
  998. return {
  999. results,
  1000. model: this.rerankModelUri,
  1001. };
  1002. }
  1003. /**
  1004. * Get device/GPU info for status display.
  1005. * Initializes llama if not already done.
  1006. */
  1007. async getDeviceInfo() {
  1008. const llama = await this.ensureLlama();
  1009. const gpuDevices = await llama.getGpuDeviceNames();
  1010. let vram;
  1011. if (llama.gpu) {
  1012. try {
  1013. const state = await llama.getVramState();
  1014. vram = { total: state.total, used: state.used, free: state.free };
  1015. }
  1016. catch { /* no vram info */ }
  1017. }
  1018. return {
  1019. gpu: llama.gpu,
  1020. gpuOffloading: llama.supportsGpuOffloading,
  1021. gpuDevices,
  1022. vram,
  1023. cpuCores: llama.cpuMathCores,
  1024. };
  1025. }
  1026. async dispose() {
  1027. // Prevent double-dispose
  1028. if (this.disposed) {
  1029. return;
  1030. }
  1031. this.disposed = true;
  1032. // Clear inactivity timer
  1033. if (this.inactivityTimer) {
  1034. clearTimeout(this.inactivityTimer);
  1035. this.inactivityTimer = null;
  1036. }
  1037. // Disposing llama cascades to models and contexts automatically
  1038. // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
  1039. // Note: llama.dispose() can hang indefinitely, so we use a timeout
  1040. if (this.llama) {
  1041. const disposePromise = this.llama.dispose();
  1042. const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 1000));
  1043. await Promise.race([disposePromise, timeoutPromise]);
  1044. }
  1045. // Clear references
  1046. this.embedContexts = [];
  1047. this.rerankContexts = [];
  1048. this.embedModel = null;
  1049. this.generateModel = null;
  1050. this.rerankModel = null;
  1051. this.llama = null;
  1052. // Clear any in-flight load/create promises
  1053. this.embedModelLoadPromise = null;
  1054. this.embedContextsCreatePromise = null;
  1055. this.generateModelLoadPromise = null;
  1056. this.rerankModelLoadPromise = null;
  1057. }
  1058. }
  1059. // =============================================================================
  1060. // Session Management Layer
  1061. // =============================================================================
  /**
   * Reference-counting bookkeeper for LLM sessions.
   *
   * Tracks how many sessions are open and how many operations are currently
   * running, so the LlamaCpp idle timeout can tell whether unloading is safe.
   * Both counters never drop below zero.
   */
  class LLMSessionManager {
    llm;
    _activeSessionCount = 0;
    _inFlightOperations = 0;
    /**
     * @param {*} llm - The LlamaCpp instance this manager is bound to.
     */
    constructor(llm) {
      this.llm = llm;
    }
    /** Number of sessions currently holding a lease. */
    get activeSessionCount() {
      return this._activeSessionCount;
    }
    /** Number of operations currently running. */
    get inFlightOperations() {
      return this._inFlightOperations;
    }
    /** The LlamaCpp instance passed to the constructor. */
    getLlamaCpp() {
      return this.llm;
    }
    /**
     * True only when there are no open sessions and no running operations.
     * Used by LlamaCpp to determine if idle unload is safe.
     */
    canUnload() {
      const noSessions = this._activeSessionCount === 0;
      return noSessions && this._inFlightOperations === 0;
    }
    /** Register a new session lease. */
    acquire() {
      this._activeSessionCount += 1;
    }
    /** Drop one session lease (clamped at zero). */
    release() {
      const remaining = this._activeSessionCount - 1;
      this._activeSessionCount = remaining < 0 ? 0 : remaining;
    }
    /** Mark the start of an operation. */
    operationStart() {
      this._inFlightOperations += 1;
    }
    /** Mark the end of an operation (clamped at zero). */
    operationEnd() {
      const remaining = this._inFlightOperations - 1;
      this._inFlightOperations = remaining < 0 ? 0 : remaining;
    }
  }
  /**
   * Raised when an operation is attempted on a session that has already been
   * released or aborted.
   */
  export class SessionReleasedError extends Error {
    name = "SessionReleasedError";
    /**
     * @param {string} [message] - Defaults to a generic released/aborted notice.
     */
    constructor(message = "LLM session has been released or aborted") {
      super(message);
    }
  }
  /**
   * Scoped LLM session with automatic lifecycle management.
   *
   * Holds a lease on the session manager from construction until release(),
   * and wraps LlamaCpp methods so each call is counted as an in-flight
   * operation. The session becomes invalid when it is released, when the
   * optional external signal aborts, or when the max-duration timer fires.
   */
  class LLMSession {
    manager;
    released = false;
    abortController;
    maxDurationTimer = null;
    name;
    // External signal and the listener attached to it, remembered so that
    // release() can detach the listener again.
    externalSignal = null;
    externalAbortListener = null;
    /**
     * @param {*} manager - Session manager providing acquire/release,
     *   operationStart/operationEnd and getLlamaCpp.
     * @param {{name?: string, signal?: AbortSignal, maxDuration?: number}} [options] -
     *   `name` is used in the max-duration error message (default "unnamed").
     *   `signal` aborts the session when it aborts. `maxDuration` is in
     *   milliseconds (default 10 minutes); a value <= 0 disables the timer.
     */
    constructor(manager, options = {}) {
      this.manager = manager;
      this.name = options.name || "unnamed";
      this.abortController = new AbortController();
      // Link external abort signal if provided
      const externalSignal = options.signal;
      if (externalSignal) {
        if (externalSignal.aborted) {
          this.abortController.abort(externalSignal.reason);
        }
        else {
          this.externalSignal = externalSignal;
          this.externalAbortListener = () => {
            this.abortController.abort(externalSignal.reason);
          };
          externalSignal.addEventListener("abort", this.externalAbortListener, { once: true });
        }
      }
      // Set up max duration timer
      const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
      if (maxDuration > 0) {
        this.maxDurationTimer = setTimeout(() => {
          this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
        }, maxDuration);
        this.maxDurationTimer.unref(); // Don't keep process alive
      }
      // Acquire session lease
      this.manager.acquire();
    }
    /** True while the session is neither released nor aborted. */
    get isValid() {
      return !this.released && !this.abortController.signal.aborted;
    }
    /** The session's own abort signal. */
    get signal() {
      return this.abortController.signal;
    }
    /**
     * Release the session and decrement ref count.
     * Called automatically by withLLMSession when the callback completes.
     * Idempotent. Also detaches the listener from the external signal so a
     * long-lived signal does not accumulate listeners for finished sessions.
     */
    release() {
      if (this.released)
        return;
      this.released = true;
      if (this.maxDurationTimer) {
        clearTimeout(this.maxDurationTimer);
        this.maxDurationTimer = null;
      }
      if (this.externalSignal) {
        this.externalSignal.removeEventListener("abort", this.externalAbortListener);
        this.externalSignal = null;
        this.externalAbortListener = null;
      }
      this.abortController.abort(new Error("Session released"));
      this.manager.release();
    }
    /**
     * Run an operation with in-flight tracking.
     *
     * @param {Function} fn - Callback producing the operation's result.
     * @returns {Promise<*>} Whatever `fn` resolves to.
     * @throws {SessionReleasedError} With the default message when the session
     *   was released. When it was aborted instead, the message is the abort
     *   reason's message (or "Session aborted"); this used to sit in an
     *   unreachable branch and was never surfaced.
     */
    async withOperation(fn) {
      if (this.released) {
        throw new SessionReleasedError();
      }
      const { signal } = this.abortController;
      if (signal.aborted) {
        throw new SessionReleasedError(signal.reason?.message || "Session aborted");
      }
      this.manager.operationStart();
      try {
        return await fn();
      }
      finally {
        this.manager.operationEnd();
      }
    }
    /** Tracked wrapper around LlamaCpp.embed. */
    async embed(text, options) {
      return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
    }
    /** Tracked wrapper around LlamaCpp.embedBatch. */
    async embedBatch(texts, options) {
      return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts, options));
    }
    /** Tracked wrapper around LlamaCpp.expandQuery. */
    async expandQuery(query, options) {
      return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
    }
    /** Tracked wrapper around LlamaCpp.rerank. */
    async rerank(query, documents, options) {
      return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
    }
  }
  // Lazily created session manager bound to the default LlamaCpp instance
  let defaultSessionManager = null;
  /**
   * Return the session manager for the default LlamaCpp instance.
   *
   * The cached manager is reused as long as it is still bound to the current
   * default instance; otherwise a new manager is created and cached.
   */
  function getSessionManager() {
    const currentLlm = getDefaultLlamaCpp();
    const isReusable = defaultSessionManager && defaultSessionManager.getLlamaCpp() === currentLlm;
    if (!isReusable) {
      defaultSessionManager = new LLMSessionManager(currentLlm);
    }
    return defaultSessionManager;
  }
  /**
   * Run a callback inside a scoped LLM session on the default LlamaCpp
   * instance. The session is released when the callback finishes, whether it
   * returns or throws, so resources are not disposed mid-operation.
   *
   * @example
   * ```typescript
   * await withLLMSession(async (session) => {
   *   const expanded = await session.expandQuery(query);
   *   const embeddings = await session.embedBatch(texts);
   *   const reranked = await session.rerank(query, docs);
   *   return reranked;
   * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
   * ```
   *
   * @param {Function} fn - Callback receiving the session.
   * @param {object} [options] - Passed through to the LLMSession constructor.
   * @returns {Promise<*>} Whatever `fn` resolves to.
   */
  export async function withLLMSession(fn, options) {
    const session = new LLMSession(getSessionManager(), options);
    try {
      const outcome = await fn(session);
      return outcome;
    }
    finally {
      session.release();
    }
  }
  /**
   * Run a callback inside a scoped LLM session bound to a specific LlamaCpp
   * instance. Unlike withLLMSession, this does not touch the global
   * singleton: a fresh session manager is created for the given instance on
   * every call.
   *
   * @param {*} llm - The LlamaCpp instance to run against.
   * @param {Function} fn - Callback receiving the session.
   * @param {object} [options] - Passed through to the LLMSession constructor.
   * @returns {Promise<*>} Whatever `fn` resolves to.
   */
  export async function withLLMSessionForLlm(llm, fn, options) {
    const session = new LLMSession(new LLMSessionManager(llm), options);
    try {
      const outcome = await fn(session);
      return outcome;
    }
    finally {
      session.release();
    }
  }
  /**
   * Tell whether idle unload is safe, i.e. the default session manager has
   * no active sessions or operations. When no default session manager has
   * been created yet, unloading is always considered safe.
   * Used internally by the LlamaCpp idle timer.
   *
   * @returns {boolean}
   */
  export function canUnloadLLM() {
    return defaultSessionManager ? defaultSessionManager.canUnload() : true;
  }
  1259. // =============================================================================
  1260. // Singleton for default LlamaCpp instance
  1261. // =============================================================================
  // Process-wide default instance; stays null until first requested or set
  let defaultLlamaCpp = null;
  /**
   * Return the default LlamaCpp instance, constructing it on first use.
   *
   * @returns {LlamaCpp} The shared default instance.
   */
  export function getDefaultLlamaCpp() {
    if (defaultLlamaCpp) {
      return defaultLlamaCpp;
    }
    defaultLlamaCpp = new LlamaCpp();
    return defaultLlamaCpp;
  }
  /**
   * Replace the default LlamaCpp instance (useful for testing).
   * The previous default, if any, is not disposed by this call.
   *
   * @param {*} llm - Instance to install as the default; `null` clears it.
   */
  export function setDefaultLlamaCpp(llm) {
    defaultLlamaCpp = llm;
  }
  /**
   * Dispose the default LlamaCpp instance if it exists.
   * Call this before process exit to prevent NAPI crashes.
   *
   * The instance is detached from the singleton slot before it is disposed.
   * A failing dispose() therefore cannot leave a dead instance installed as
   * the default, and a default installed while the dispose is in flight is
   * not wiped afterwards.
   *
   * @returns {Promise<void>} Rejects with whatever the instance's dispose()
   *   rejects with.
   */
  export async function disposeDefaultLlamaCpp() {
    const instance = defaultLlamaCpp;
    if (!instance) {
      return;
    }
    defaultLlamaCpp = null;
    await instance.dispose();
  }