llm.ts 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555
  1. /**
  2. * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
  3. *
  4. * Provides embeddings, text generation, and reranking using local GGUF models.
  5. */
  6. import {
  7. getLlama,
  8. resolveModelFile,
  9. LlamaChatSession,
  10. LlamaLogLevel,
  11. type Llama,
  12. type LlamaModel,
  13. type LlamaEmbeddingContext,
  14. type Token as LlamaToken,
  15. } from "node-llama-cpp";
  16. import { homedir } from "os";
  17. import { join } from "path";
  18. import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
  19. // =============================================================================
  20. // Embedding Formatting Functions
  21. // =============================================================================
  /**
   * Report whether a model URI refers to a Qwen embedding model.
   *
   * The check is case-insensitive and order-independent: the URI qualifies when
   * "qwen" is followed by "embed", or "embed" is followed by "qwen".
   * Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma.
   *
   * @param modelUri Model URI or file name to inspect.
   * @returns `true` when the URI looks like a Qwen embedding model.
   */
  export function isQwen3EmbeddingModel(modelUri: string): boolean {
    const patterns = [/qwen.*embed/i, /embed.*qwen/i];
    return patterns.some((pattern) => pattern.test(modelUri));
  }
  /**
   * Wrap a search query in the prompt format the active embedding model expects.
   *
   * The model is taken from `modelUri`, then the QMD_EMBED_MODEL environment
   * variable, then the built-in default. Qwen embedding models get the
   * "Instruct: ... / Query: ..." form; every other model gets the nomic-style
   * "task: search result | query: ..." prefix.
   *
   * @param query Raw query text.
   * @param modelUri Optional model URI overriding the environment/default.
   * @returns The query wrapped in the model-specific prompt.
   */
  export function formatQueryForEmbedding(query: string, modelUri?: string): string {
    const activeModel = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    return isQwen3EmbeddingModel(activeModel)
      ? `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`
      : `task: search result | query: ${query}`;
  }
  /**
   * Wrap a document in the prompt format the active embedding model expects.
   *
   * The model is taken from `modelUri`, then the QMD_EMBED_MODEL environment
   * variable, then the built-in default. Qwen embedding models receive raw text
   * (title on its own first line when present); every other model gets the
   * nomic-style "title: ... | text: ..." form, with "none" standing in for a
   * missing or empty title.
   *
   * @param text Document body.
   * @param title Optional document title.
   * @param modelUri Optional model URI overriding the environment/default.
   * @returns The document wrapped in the model-specific format.
   */
  export function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string {
    const activeModel = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
    if (!isQwen3EmbeddingModel(activeModel)) {
      return `title: ${title || "none"} | text: ${text}`;
    }
    // Qwen3-Embedding: documents are raw text, no task prefix
    if (title) {
      return `${title}\n${text}`;
    }
    return text;
  }
  54. // =============================================================================
  55. // Types
  56. // =============================================================================
  /**
   * A single token paired with its log probability.
   */
  export type TokenLogProb = {
    /** The token, as a string. */
    token: string;
    /** Log probability associated with the token. */
    logprob: number;
  };
  /**
   * Result of an embedding request.
   */
  export type EmbeddingResult = {
    /** The embedding vector. */
    embedding: number[];
    /** Identifier of the model that produced the vector. */
    model: string;
  };
  71. /**
  72. * Generation result with optional logprobs
  73. */
  74. export type GenerateResult = {
  75. text: string;
  76. model: string;
  77. logprobs?: TokenLogProb[];
  78. done: boolean;
  79. };
  /**
   * Rerank result for a single document.
   */
  export type RerankDocumentResult = {
    /** File identifier of the document (mirrors `RerankDocument.file`). */
    file: string;
    /** Relevance score; higher means more relevant (per the `LLM.rerank` contract). */
    score: number;
    /** Index of the document; presumably its position in the input list — confirm against the producer. */
    index: number;
  };
  /**
   * Result of reranking a batch of documents.
   */
  export type RerankResult = {
    /** One entry per reranked document. */
    results: RerankDocumentResult[];
    /** Identifier of the model that produced the scores. */
    model: string;
  };
  /**
   * Availability information for a model.
   */
  export type ModelInfo = {
    /** Model name. */
    name: string;
    /** Whether the model exists / is available. */
    exists: boolean;
    /** Path of the model, when known. */
    path?: string;
  };
  /**
   * Options for an embedding request.
   */
  export type EmbedOptions = {
    /** Model to use instead of the default. */
    model?: string;
    /** Marks the text as a search query; presumably selects query vs. document formatting — confirm in the embed implementation. */
    isQuery?: boolean;
    /** Optional title accompanying the text. */
    title?: string;
  };
  /**
   * Options for a text generation request.
   */
  export type GenerateOptions = {
    /** Model to use instead of the default. */
    model?: string;
    /** Upper bound on the number of generated tokens. */
    maxTokens?: number;
    /** Sampling temperature. */
    temperature?: number;
  };
  /**
   * Options for a rerank request.
   */
  export type RerankOptions = {
    /** Model to use instead of the default. */
    model?: string;
  };
  /**
   * Options for creating an LLM session.
   */
  export type LLMSessionOptions = {
    /** Max session duration in ms (default: 10 minutes) */
    maxDuration?: number;
    /** External abort signal */
    signal?: AbortSignal;
    /** Debug name for logging */
    name?: string;
  };
  /**
   * Session interface for scoped LLM access with lifecycle guarantees.
   */
  export interface ILLMSession {
    /** Embed one text; the result type allows `null`. */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /** Embed several texts; the result holds one entry (possibly `null`) per element. */
    embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;
    /** Expand a query into a list of typed sub-queries (`Queryable`). */
    expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]>;
    /** Rerank documents against a query. */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /** Whether this session is still valid (not released or aborted) */
    readonly isValid: boolean;
    /** Abort signal for this session (aborts on release or maxDuration) */
    readonly signal: AbortSignal;
  }
  /**
   * Supported query types, one per search backend: 'lex', 'vec' and 'hyde'.
   */
  export type QueryType = 'lex' | 'vec' | 'hyde';
  /**
   * A single query together with the backend type it targets.
   */
  export type Queryable = {
    /** Backend this query is meant for. */
    type: QueryType;
    /** Query text. */
    text: string;
  };
  /**
   * A document submitted for reranking.
   */
  export type RerankDocument = {
    /** File identifier of the document. */
    file: string;
    /** Document text to score against the query. */
    text: string;
    /** Optional document title. */
    title?: string;
  };
  168. // =============================================================================
  169. // Model Configuration
  170. // =============================================================================
  // HuggingFace model URIs for node-llama-cpp
  // Format: hf:<user>/<repo>/<file>
  // Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf)
  // Note: the environment variable is read once, when this module is evaluated.
  const DEFAULT_EMBED_MODEL = process.env.QMD_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
  const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  // Previous generation default, kept for reference:
  // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
  const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
  // Alternative generation models for query expansion:
  // LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
  // Use these as base for fine-tuning with configs/sft_lfm2.yaml
  export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
  export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
  // Public aliases of the module-private defaults above.
  export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
  export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
  export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;
  // Local model cache directory: <home>/.cache/qmd/models
  const MODEL_CACHE_DIR = join(homedir(), ".cache", "qmd", "models");
  export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
  /**
   * Outcome of pulling one model with `pullModels`.
   */
  export type PullResult = {
    /** Model URI exactly as passed in. */
    model: string;
    /** Local path returned by `resolveModelFile`. */
    path: string;
    /** Size of the file at `path` in bytes, or 0 when the path does not exist. */
    sizeBytes: number;
    /** True when previously cached files were deleted before resolving. */
    refreshed: boolean;
  };
  /** Repository and file parts of a `hf:<user>/<repo>/<file>` model URI. */
  type HfRef = {
    /** `<user>/<repo>` */
    repo: string;
    /** File path inside the repository (may contain slashes). */
    file: string;
  };

  /**
   * Parse a HuggingFace model URI of the form `hf:<user>/<repo>/<file>`.
   *
   * @param model Model URI to parse.
   * @returns The repo/file reference, or `null` when the string does not start
   *   with `hf:`, has fewer than three path segments, or contains an empty
   *   segment (for example `hf:user/repo/` or `hf:user//file`), since those
   *   cannot form a valid download URL.
   */
  function parseHfUri(model: string): HfRef | null {
    const scheme = "hf:";
    if (!model.startsWith(scheme)) return null;

    const segments = model.slice(scheme.length).split("/");
    if (segments.length < 3) return null;
    // Reject empty segments so callers never build URLs with empty path parts.
    if (segments.some((segment) => segment.length === 0)) return null;

    return {
      repo: segments.slice(0, 2).join("/"),
      file: segments.slice(2).join("/"),
    };
  }
  /**
   * Fetch the ETag of a HuggingFace file with a HEAD request against
   * `https://huggingface.co/<repo>/resolve/main/<file>`.
   *
   * @param ref Repository/file reference.
   * @returns The `etag` response header, or `null` when the request throws,
   *   the response is not OK, or the header is missing/empty.
   */
  async function getRemoteEtag(ref: HfRef): Promise<string | null> {
    const url = `https://huggingface.co/${ref.repo}/resolve/main/${ref.file}`;
    let response: Response;
    try {
      response = await fetch(url, { method: "HEAD" });
    } catch {
      // Network failure: treat the ETag as unknown.
      return null;
    }
    if (!response.ok) {
      return null;
    }
    return response.headers.get("etag") || null;
  }
  /**
   * Make sure each model is present in the local cache, re-downloading it when
   * it is stale or when a refresh is requested.
   *
   * For `hf:` URIs the remote ETag is compared with a `<filename>.etag` sidecar
   * stored next to the model. Cached files are deleted (so `resolveModelFile`
   * downloads a fresh copy) when `options.refresh` is set, when the remote ETag
   * is known and differs from the stored one, or when nothing is cached yet.
   * If the remote ETag cannot be determined (e.g. offline), the existing cache
   * is kept rather than purged. For other URIs, cached files are deleted only
   * when `options.refresh` is set.
   *
   * @param models Model URIs to pull.
   * @param options.refresh Force deletion of cached copies before resolving.
   * @param options.cacheDir Cache directory; defaults to the module cache dir.
   * @returns One `PullResult` per input model, in input order.
   */
  export async function pullModels(
    models: string[],
    options: { refresh?: boolean; cacheDir?: string } = {}
  ): Promise<PullResult[]> {
    const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
    if (!existsSync(cacheDir)) {
      mkdirSync(cacheDir, { recursive: true });
    }

    const removeCached = (paths: string[]): void => {
      for (const candidate of paths) {
        if (existsSync(candidate)) unlinkSync(candidate);
      }
    };

    const results: PullResult[] = [];
    for (const model of models) {
      let refreshed = false;
      const hfRef = parseHfUri(model);
      const filename = model.split("/").pop();
      const etagPath = filename ? join(cacheDir, `${filename}.etag`) : null;

      // Cached model files whose name contains the model file name. The ".etag"
      // sidecar also contains that name, so exclude it: it is not a model file
      // and must not make an empty cache look populated.
      const cached = filename
        ? readdirSync(cacheDir, { withFileTypes: true })
            .filter(
              (entry) =>
                entry.isFile() &&
                entry.name.includes(filename) &&
                !entry.name.endsWith(".etag")
            )
            .map((entry) => join(cacheDir, entry.name))
        : [];

      let remoteEtag: string | null = null;
      if (hfRef && etagPath) {
        remoteEtag = await getRemoteEtag(hfRef);
        const localEtag = existsSync(etagPath)
          ? readFileSync(etagPath, "utf-8").trim()
          : null;
        // Only a known, different remote ETag proves staleness; an unknown one
        // (network failure) must not destroy a usable cached model.
        const isStale = remoteEtag !== null && remoteEtag !== localEtag;
        const shouldRefresh = options.refresh || isStale || cached.length === 0;
        if (shouldRefresh) {
          removeCached(cached);
          if (existsSync(etagPath)) unlinkSync(etagPath);
          refreshed = cached.length > 0;
        }
      } else if (options.refresh && filename) {
        removeCached(cached);
        refreshed = cached.length > 0;
      }

      const path = await resolveModelFile(model, cacheDir);
      const sizeBytes = existsSync(path) ? statSync(path).size : 0;

      if (hfRef && etagPath) {
        // Reuse the ETag fetched above; query again only if that attempt failed.
        const etagToStore = remoteEtag ?? (await getRemoteEtag(hfRef));
        if (etagToStore) {
          writeFileSync(etagPath, etagToStore + "\n", "utf-8");
        }
      }

      results.push({ model, path, sizeBytes, refreshed });
    }
    return results;
  }
  272. // =============================================================================
  273. // LLM Interface
  274. // =============================================================================
  /**
   * Abstract LLM interface - implement this for different backends.
   */
  export interface LLM {
    /**
     * Get embeddings for text. The result type allows `null`.
     */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
    /**
     * Generate text completion. The result type allows `null`.
     */
    generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
    /**
     * Check if a model exists/is available
     */
    modelExists(model: string): Promise<ModelInfo>;
    /**
     * Expand a search query into multiple variations for different backends.
     * Returns a list of Queryable objects.
     */
    expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise<Queryable[]>;
    /**
     * Rerank documents by relevance to a query
     * Returns list of documents with relevance scores (higher = more relevant)
     */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
    /**
     * Dispose of resources
     */
    dispose(): Promise<void>;
  }
  306. // =============================================================================
  307. // node-llama-cpp Implementation
  308. // =============================================================================
  /**
   * Configuration for the node-llama-cpp backed `LlamaCpp` implementation.
   * Model URIs and the cache directory fall back to the module defaults when
   * omitted or empty.
   */
  export type LlamaCppConfig = {
    /** Embedding model URI. */
    embedModel?: string;
    /** Generation (query expansion) model URI. */
    generateModel?: string;
    /** Reranker model URI. */
    rerankModel?: string;
    /** Directory where model files are cached. */
    modelCacheDir?: string;
    /**
     * Context size used for query expansion generation contexts.
     * Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
     * Must be a positive integer; an invalid value throws during construction.
     */
    expandContextSize?: number;
    /**
     * Inactivity timeout in ms before unloading contexts (default: 5 minutes, 0 to disable).
     *
     * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
     * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
     * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
     */
    inactivityTimeoutMs?: number;
    /**
     * Whether to dispose models on inactivity (default: false).
     *
     * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
     * memory reclaim.
     */
    disposeModelsOnInactivity?: boolean;
  };
  // Defaults used by the node-llama-cpp implementation (the LlamaCpp class).
  // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
  const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
  const DEFAULT_EXPAND_CONTEXT_SIZE = 2048;

  /**
   * Resolve the context size used for query expansion.
   *
   * Precedence: explicit config value, then the QMD_EXPAND_CONTEXT_SIZE
   * environment variable, then the default (2048).
   *
   * @param configValue Value from `LlamaCppConfig.expandContextSize`, if any.
   * @returns A positive integer context size.
   * @throws Error when `configValue` is given but is not a positive integer.
   *
   * An invalid environment value never throws: a warning is written to stderr
   * and the default is used. The environment value must consist of decimal
   * digits only, so inputs such as "12.5", "4096abc" or "1e3" are rejected
   * instead of being silently truncated by `parseInt`.
   */
  function resolveExpandContextSize(configValue?: number): number {
    if (configValue !== undefined) {
      if (!Number.isInteger(configValue) || configValue <= 0) {
        throw new Error(`Invalid expandContextSize: ${configValue}. Must be a positive integer.`);
      }
      return configValue;
    }

    const envValue = process.env.QMD_EXPAND_CONTEXT_SIZE?.trim();
    if (!envValue) return DEFAULT_EXPAND_CONTEXT_SIZE;

    // Strict decimal parse: parseInt would accept any numeric prefix.
    const parsed = /^\d+$/.test(envValue) ? Number(envValue) : Number.NaN;
    if (!Number.isSafeInteger(parsed) || parsed <= 0) {
      process.stderr.write(
        `QMD Warning: invalid QMD_EXPAND_CONTEXT_SIZE="${envValue}", using default ${DEFAULT_EXPAND_CONTEXT_SIZE}.\n`
      );
      return DEFAULT_EXPAND_CONTEXT_SIZE;
    }
    return parsed;
  }
  /**
   * LLM implementation using node-llama-cpp.
   */
  export class LlamaCpp implements LLM {
    // True when the CI environment variable is set to a non-empty value.
    private readonly _ciMode = !!process.env.CI;
    // Lazily created llama runtime, models and context pools (null/empty until first use).
    private llama: Llama | null = null;
    private embedModel: LlamaModel | null = null;
    private embedContexts: LlamaEmbeddingContext[] = [];
    private generateModel: LlamaModel | null = null;
    private rerankModel: LlamaModel | null = null;
    private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
    // Resolved configuration (see the constructor for defaults).
    private embedModelUri: string;
    private generateModelUri: string;
    private rerankModelUri: string;
    private modelCacheDir: string;
    private expandContextSize: number;
    // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
    private embedModelLoadPromise: Promise<LlamaModel> | null = null;
    private generateModelLoadPromise: Promise<LlamaModel> | null = null;
    private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
    // Inactivity timer for auto-unloading models
    private inactivityTimer: ReturnType<typeof setTimeout> | null = null;
    private inactivityTimeoutMs: number;
    private disposeModelsOnInactivity: boolean;
    // Track disposal state to prevent double-dispose
    private disposed = false;
  /**
   * Store the configuration, substituting defaults for anything missing.
   *
   * Model URIs and the cache directory fall back when omitted or empty;
   * the inactivity timeout and model-disposal flag fall back only when
   * null/undefined, so an explicit 0 / false is honoured.
   *
   * @throws Error when `config.expandContextSize` is not a positive integer.
   */
  constructor(config: LlamaCppConfig = {}) {
    const {
      embedModel,
      generateModel,
      rerankModel,
      modelCacheDir,
      expandContextSize,
      inactivityTimeoutMs,
      disposeModelsOnInactivity,
    } = config;

    this.embedModelUri = embedModel || DEFAULT_EMBED_MODEL;
    this.generateModelUri = generateModel || DEFAULT_GENERATE_MODEL;
    this.rerankModelUri = rerankModel || DEFAULT_RERANK_MODEL;
    this.modelCacheDir = modelCacheDir || MODEL_CACHE_DIR;
    this.expandContextSize = resolveExpandContextSize(expandContextSize);
    this.inactivityTimeoutMs = inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
    this.disposeModelsOnInactivity = disposeModelsOnInactivity ?? false;
  }
  /**
   * Reset the inactivity timer. Called after each model operation.
   *
   * A timer is armed only when the timeout is enabled (> 0) and there is
   * something an idle sweep could free: loaded embed/rerank contexts or, when
   * `disposeModelsOnInactivity` is enabled, any loaded model. Without the
   * second condition a model loaded with no pooled context (for example the
   * generation model) would never be reclaimed despite the opt-in.
   *
   * When the timer fires, idle resources are unloaded unless `canUnloadLLM`
   * reports that unloading is not allowed, in which case the timer is re-armed.
   */
  private touchActivity(): void {
    // Clear existing timer
    if (this.inactivityTimer) {
      clearTimeout(this.inactivityTimer);
      this.inactivityTimer = null;
    }

    // Timeout disabled (0, negative or NaN): never arm a timer.
    if (!(this.inactivityTimeoutMs > 0)) return;

    const hasLoadedModels =
      this.embedModel !== null || this.generateModel !== null || this.rerankModel !== null;
    const hasReclaimableResources =
      this.hasLoadedContexts() || (this.disposeModelsOnInactivity && hasLoadedModels);
    if (!hasReclaimableResources) return;

    this.inactivityTimer = setTimeout(() => {
      // canUnloadLLM is defined later in this file. The typeof guard tolerates
      // it not being available when the timer fires.
      if (typeof canUnloadLLM === 'function' && !canUnloadLLM()) {
        // Active sessions/operations - reschedule timer
        this.touchActivity();
        return;
      }
      this.unloadIdleResources().catch(err => {
        console.error("Error unloading idle resources:", err);
      });
    }, this.inactivityTimeoutMs);
    // Don't keep process alive just for this timer
    this.inactivityTimer.unref();
  }
  /**
   * Tell whether either context pool (embedding or rerank) currently holds at
   * least one context, i.e. whether an idle unload would free anything.
   */
  private hasLoadedContexts(): boolean {
    const pools = [this.embedContexts, this.rerankContexts];
    return pools.some((pool) => pool.length > 0);
  }
  /**
   * Unload idle resources but keep the instance alive for future use.
   *
   * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
   * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
   * Models are disposed as well only when `disposeModelsOnInactivity` is enabled.
   *
   * The pools and model fields are detached before any `dispose()` is awaited,
   * so a concurrent caller never receives a context or model that is being
   * torn down. Every detached resource gets a disposal attempt; if any fail,
   * the first error is rethrown after all attempts have finished.
   *
   * No-op when the instance has already been disposed.
   */
  async unloadIdleResources(): Promise<void> {
    // Don't unload if already disposed
    if (this.disposed) {
      return;
    }

    // Clear timer
    if (this.inactivityTimer) {
      clearTimeout(this.inactivityTimer);
      this.inactivityTimer = null;
    }

    // Detach first, dispose afterwards. Contexts go before models.
    const doomed: { dispose(): Promise<void> }[] = [
      ...this.embedContexts,
      ...this.rerankContexts,
    ];
    this.embedContexts = [];
    this.rerankContexts = [];

    // Optionally dispose models too (opt-in)
    if (this.disposeModelsOnInactivity) {
      for (const model of [this.embedModel, this.generateModel, this.rerankModel]) {
        if (model) doomed.push(model);
      }
      this.embedModel = null;
      this.generateModel = null;
      this.rerankModel = null;
      // Reset load promises so models can be reloaded later
      this.embedModelLoadPromise = null;
      this.generateModelLoadPromise = null;
      this.rerankModelLoadPromise = null;
    }

    let failed = false;
    let firstError: unknown;
    for (const resource of doomed) {
      try {
        await resource.dispose();
      } catch (err) {
        // Keep going so one failure does not leak the remaining resources.
        if (!failed) {
          failed = true;
          firstError = err;
        }
      }
    }
    if (failed) {
      throw firstError;
    }
    // Note: We keep llama instance alive - it's lightweight
  }
  /**
   * Create the model cache directory (including missing parents) when it is
   * not there yet.
   */
  private ensureModelCacheDir(): void {
    const dir = this.modelCacheDir;
    if (existsSync(dir)) {
      return;
    }
    mkdirSync(dir, { recursive: true });
  }
  // In-flight getLlama() call, shared so concurrent first callers don't each
  // create their own llama instance.
  private llamaInitPromise: Promise<Llama> | null = null;

  /**
   * Initialize the llama instance (lazy).
   *
   * The instance is created on first use with `build: "autoAttempt"` and
   * error-level logging, then cached. Concurrent callers share one in-flight
   * initialization, mirroring the guards used by the model loaders. A warning
   * is written to stderr when no GPU acceleration is available.
   *
   * @returns The cached or newly created llama instance.
   */
  private async ensureLlama(): Promise<Llama> {
    if (this.llama) {
      return this.llama;
    }
    if (this.llamaInitPromise) {
      return await this.llamaInitPromise;
    }

    this.llamaInitPromise = (async () => {
      const llama = await getLlama({
        // attempt to build
        build: "autoAttempt",
        logLevel: LlamaLogLevel.error
      });
      if (llama.gpu === false) {
        process.stderr.write(
          "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
        );
      }
      this.llama = llama;
      return llama;
    })();

    try {
      return await this.llamaInitPromise;
    } finally {
      // Keep the resolved instance cached; clear only the in-flight promise
      // (this also allows a retry after a failed initialization).
      this.llamaInitPromise = null;
    }
  }
  /**
   * Turn a model URI into a local file path via `resolveModelFile`, using this
   * instance's cache directory (created first if missing).
   *
   * @param modelUri Model URI to resolve (HF URIs are handled by resolveModelFile,
   *   which downloads into the cache dir).
   * @returns Local path of the model file.
   */
  private async resolveModel(modelUri: string): Promise<string> {
    this.ensureModelCacheDir();
    const localPath = await resolveModelFile(modelUri, this.modelCacheDir);
    return localPath;
  }
  /**
   * Return the embedding model, loading it on first use.
   *
   * Concurrent callers share a single in-flight load. After the load settles,
   * the in-flight promise is cleared while the loaded model stays cached.
   */
  private async ensureEmbedModel(): Promise<LlamaModel> {
    if (this.embedModel) return this.embedModel;

    const inFlight = this.embedModelLoadPromise;
    if (inFlight) return await inFlight;

    const load = async (): Promise<LlamaModel> => {
      const llama = await this.ensureLlama();
      const modelPath = await this.resolveModel(this.embedModelUri);
      const loaded = await llama.loadModel({ modelPath });
      this.embedModel = loaded;
      // Loading counts as activity: restart the inactivity countdown.
      this.touchActivity();
      return loaded;
    };

    const pending = load();
    this.embedModelLoadPromise = pending;
    try {
      return await pending;
    } finally {
      // Only the in-flight marker is cleared; this.embedModel keeps the result.
      this.embedModelLoadPromise = null;
    }
  }
  /**
   * Decide how many parallel contexts to create.
   *
   * GPU: budget 25% of the free VRAM, divided by the per-context estimate,
   * clamped to 1..8; if the VRAM query fails, use 2.
   * CPU: one context per 4 math cores (defaulting to 4 cores when the count is
   * unavailable), clamped to 1..4, so each context can keep at least 4 threads.
   *
   * @param perContextMB Estimated memory per context, in MB.
   * @returns Number of contexts to create (at least 1).
   */
  private async computeParallelism(perContextMB: number): Promise<number> {
    const llama = await this.ensureLlama();

    if (!llama.gpu) {
      // CPU: split cores across contexts. At least 4 threads per context.
      const mathCores = llama.cpuMathCores || 4;
      const byCores = Math.floor(mathCores / 4);
      return Math.max(1, Math.min(4, byCores));
    }

    try {
      const { free } = await llama.getVramState();
      const freeMB = free / (1024 * 1024);
      const byVram = Math.floor((freeMB * 0.25) / perContextMB);
      return Math.max(1, Math.min(8, byVram));
    } catch {
      return 2;
    }
  }
  /**
   * Work out the thread count for each of `parallelism` contexts.
   *
   * On GPU this returns 0, meaning "let the library decide". On CPU the math
   * cores (4 when the count is unavailable) are divided evenly, with a floor
   * of one thread per context.
   */
  private async threadsPerContext(parallelism: number): Promise<number> {
    const llama = await this.ensureLlama();
    if (llama.gpu) {
      return 0; // GPU: let the library decide
    }
    const mathCores = llama.cpuMathCores || 4;
    const evenShare = Math.floor(mathCores / parallelism);
    return Math.max(1, evenShare);
  }
  // In-flight pool creation, shared by concurrent callers of ensureEmbedContexts.
  private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;

  /**
   * Return the pool of embedding contexts, creating it on first use.
   *
   * The pool size comes from computeParallelism (with a 150 MB per-context
   * estimate). Creation stops at the first failure: if at least one context
   * already exists the smaller pool is kept, otherwise an error is thrown.
   * Concurrent callers share one in-flight creation.
   */
  private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
    if (this.embedContexts.length > 0) {
      this.touchActivity();
      return this.embedContexts;
    }

    const inFlight = this.embedContextsCreatePromise;
    if (inFlight) return await inFlight;

    const createPool = async (): Promise<LlamaEmbeddingContext[]> => {
      const model = await this.ensureEmbedModel();
      // Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
      const poolSize = await this.computeParallelism(150);
      const threads = await this.threadsPerContext(poolSize);
      const contextOptions = threads > 0 ? { threads } : {};

      let created = 0;
      while (created < poolSize) {
        try {
          this.embedContexts.push(await model.createEmbeddingContext({ ...contextOptions }));
        } catch {
          if (this.embedContexts.length === 0) {
            throw new Error("Failed to create any embedding context");
          }
          break;
        }
        created += 1;
      }

      this.touchActivity();
      return this.embedContexts;
    };

    const pending = createPool();
    this.embedContextsCreatePromise = pending;
    try {
      return await pending;
    } finally {
      this.embedContextsCreatePromise = null;
    }
  }
  /**
   * Return one embedding context for single-text calls: the first entry of
   * the pool, which is created on demand.
   */
  private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
    const [first] = await this.ensureEmbedContexts();
    return first!;
  }
  /**
   * Return the generation model, loading it on first use. Generation contexts
   * are not pooled here; they are created fresh per call by the callers.
   *
   * Concurrent callers share a single in-flight load.
   *
   * @throws Error("Generate model not loaded") if the model is unexpectedly
   *   absent after loading.
   */
  private async ensureGenerateModel(): Promise<LlamaModel> {
    if (this.generateModel) {
      this.touchActivity();
      return this.generateModel;
    }

    if (this.generateModelLoadPromise) {
      return await this.generateModelLoadPromise;
    }

    const load = async (): Promise<LlamaModel> => {
      const llama = await this.ensureLlama();
      const modelPath = await this.resolveModel(this.generateModelUri);
      const loaded = await llama.loadModel({ modelPath });
      this.generateModel = loaded;
      return loaded;
    };

    this.generateModelLoadPromise = load();
    try {
      await this.generateModelLoadPromise;
    } finally {
      this.generateModelLoadPromise = null;
    }

    this.touchActivity();
    if (!this.generateModel) {
      throw new Error("Generate model not loaded");
    }
    return this.generateModel;
  }
  /**
   * Return the rerank model, loading it on first use.
   *
   * Concurrent callers share a single in-flight load. After the load settles,
   * the in-flight promise is cleared while the loaded model stays cached.
   */
  private async ensureRerankModel(): Promise<LlamaModel> {
    if (this.rerankModel) return this.rerankModel;

    const inFlight = this.rerankModelLoadPromise;
    if (inFlight) return await inFlight;

    const load = async (): Promise<LlamaModel> => {
      const llama = await this.ensureLlama();
      const modelPath = await this.resolveModel(this.rerankModelUri);
      const loaded = await llama.loadModel({ modelPath });
      this.rerankModel = loaded;
      // Loading counts as activity: restart the inactivity countdown.
      this.touchActivity();
      return loaded;
    };

    const pending = load();
    this.rerankModelLoadPromise = pending;
    try {
      return await pending;
    } finally {
      this.rerankModelLoadPromise = null;
    }
  }
  /**
   * Context size for rerank contexts.
   *
   * Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.).
   * A previous default of 2048 was too small for longer documents (e.g. session
   * transcripts, CJK text, or large markdown files): callers hit "input lengths
   * exceed context size" errors even after truncation because the overhead
   * estimate was insufficient. 4096 comfortably fits the largest real-world
   * chunks while staying well below the 40 960-token auto size.
   * Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom;
   * values that do not parse to a positive number fall back to 4096.
   */
  private static readonly RERANK_CONTEXT_SIZE: number = (() => {
    const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
    return Number.isFinite(v) && v > 0 ? v : 4096;
  })();

  // In-flight pool creation, shared by concurrent callers of ensureRerankContexts
  // so the pool is never created twice (which would allocate duplicate VRAM).
  private rerankContextsCreatePromise: Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> | null = null;

  /**
   * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
   * Each context has its own sequence, so they can evaluate independently.
   *
   * Tuning choices:
   * - contextSize: RERANK_CONTEXT_SIZE (4096 unless overridden) instead of the
   *   model's much larger auto size
   * - flashAttention: requested to reduce VRAM per context; if the first
   *   context cannot be created with it, one context is created without it
   * - pool size: computeParallelism with a 1000 MB per-context budget, capped at 4
   *   NOTE(review): that budget was measured (~960 MB) at contextSize 2048 —
   *   re-measure for the current default.
   *
   * Creation stops at the first failure once at least one context exists.
   *
   * @throws Error("Failed to create any rerank context") when not even one
   *   context can be created.
   */
  private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
    // Check the in-flight creation first so callers never get a half-built pool.
    if (this.rerankContextsCreatePromise) {
      return await this.rerankContextsCreatePromise;
    }
    if (this.rerankContexts.length > 0) {
      this.touchActivity();
      return this.rerankContexts;
    }

    this.rerankContextsCreatePromise = (async () => {
      const model = await this.ensureRerankModel();
      const n = Math.min(await this.computeParallelism(1000), 4);
      const threads = await this.threadsPerContext(n);
      for (let i = 0; i < n; i++) {
        try {
          this.rerankContexts.push(await model.createRankingContext({
            contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
            flashAttention: true,
            ...(threads > 0 ? { threads } : {}),
          } as any));
        } catch {
          if (this.rerankContexts.length === 0) {
            // Flash attention might not be supported — retry without it
            try {
              this.rerankContexts.push(await model.createRankingContext({
                contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
                ...(threads > 0 ? { threads } : {}),
              }));
            } catch {
              throw new Error("Failed to create any rerank context");
            }
          }
          break;
        }
      }
      this.touchActivity();
      return this.rerankContexts;
    })();

    try {
      return await this.rerankContextsCreatePromise;
    } finally {
      this.rerankContextsCreatePromise = null;
    }
  }
  716. // ==========================================================================
  717. // Tokenization
  718. // ==========================================================================
  /**
   * Tokenize text with the embedding model's tokenizer.
   *
   * @param text Text to tokenize.
   * @returns Tokenizer tokens (opaque type from node-llama-cpp).
   * @throws Error("Embed model not loaded") if the embed model is still
   *   missing after the embed context has been ensured.
   */
  async tokenize(text: string): Promise<readonly LlamaToken[]> {
    // Ensuring the context is what makes sure the model is loaded.
    await this.ensureEmbedContext();
    const model = this.embedModel;
    if (model) {
      return model.tokenize(text);
    }
    throw new Error("Embed model not loaded");
  }
  /**
   * Count the tokens in `text` according to the embedding model's tokenizer.
   *
   * @param text Text to measure.
   * @returns Number of tokens produced by {@link tokenize}.
   */
  async countTokens(text: string): Promise<number> {
    return (await this.tokenize(text)).length;
  }
  /**
   * Convert tokens back to text with the embedding model's tokenizer.
   *
   * @param tokens Tokens previously produced by the embedding model.
   * @returns The decoded text.
   * @throws Error("Embed model not loaded") if the embed model is still
   *   missing after the embed context has been ensured.
   */
  async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
    await this.ensureEmbedContext();
    const model = this.embedModel;
    if (model) {
      return model.detokenize(tokens);
    }
    throw new Error("Embed model not loaded");
  }
  747. // ==========================================================================
  748. // Core API methods
  749. // ==========================================================================
  /**
   * Truncate text to fit within the embedding model's context window.
   *
   * Uses the model's own tokenizer for accurate token counting, then
   * detokenizes back to text if truncation is needed. A 4-token margin is
   * reserved for BOS/EOS overhead, and that same margin decides whether
   * truncation is needed at all: a text whose token count lands inside the
   * margin is truncated too, instead of being passed through.
   *
   * When no embed model is loaded, or the model reports a non-positive
   * training context size, the text is returned untouched.
   *
   * @param text Text about to be embedded.
   * @returns The (possibly truncated) text and whether truncation occurred.
   */
  private async truncateToContextSize(text: string): Promise<{ text: string; truncated: boolean }> {
    const model = this.embedModel;
    if (!model) return { text, truncated: false };

    const maxTokens = model.trainContextSize;
    if (maxTokens <= 0) return { text, truncated: false };

    // Leave a small margin (4 tokens) for BOS/EOS overhead, but never go below 1.
    const safeLimit = Math.max(1, maxTokens - 4);

    const tokens = model.tokenize(text);
    if (tokens.length <= safeLimit) return { text, truncated: false };

    const truncatedText = model.detokenize(tokens.slice(0, safeLimit));
    return { text: truncatedText, truncated: true };
  }
  /**
   * Embed a single text.
   *
   * The text is first truncated to the embedding context window (a warning is
   * printed when that happens). Any failure is logged and reported as `null`
   * rather than thrown.
   *
   * @param text Text to embed.
   * @param options Accepted for interface compatibility; not read here.
   * @returns The embedding and the embed model URI, or `null` on error.
   */
  async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
    // Ping activity up front so the models stay alive for this operation.
    this.touchActivity();
    try {
      const context = await this.ensureEmbedContext();
      // Guard: over-long input is cut down to prevent a GGML crash.
      const fitted = await this.truncateToContextSize(text);
      if (fitted.truncated) {
        console.warn(`⚠ Text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
      }
      const embedding = await context.getEmbeddingFor(fitted.text);
      const vector = Array.from(embedding.vector);
      return { embedding: vector, model: this.embedModelUri };
    } catch (error) {
      console.error("Embedding error:", error);
      return null;
    }
  }
  /**
   * Batch embed multiple texts.
   *
   * With a single embed context the texts are embedded one after another.
   * With several contexts the texts are split into contiguous chunks, one per
   * context, and the chunks are processed in parallel; results are returned
   * in input order.
   *
   * The result always has exactly one entry per input text: a text that fails
   * yields `null` at its position, and if the batch as a whole fails (including
   * the case where no embed context is available) every entry is `null`.
   *
   * @param texts Texts to embed.
   * @returns One embedding result (or `null`) per input text, in order.
   * @throws Error when LLM operations are disabled in CI mode.
   */
  async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
    if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();
    if (texts.length === 0) return [];

    try {
      const contexts = await this.ensureEmbedContexts();
      const n = contexts.length;
      if (n === 0) {
        // Without this, the chunking below would silently return [] and break
        // the one-result-per-input contract. Caught and reported below.
        throw new Error("No embedding contexts available");
      }

      // Embed one chunk sequentially on one context; per-text failures become null.
      const embedSequentially = async (
        ctx: (typeof contexts)[number],
        chunk: readonly string[]
      ): Promise<(EmbeddingResult | null)[]> => {
        const results: (EmbeddingResult | null)[] = [];
        for (const text of chunk) {
          try {
            const { text: safeText, truncated } = await this.truncateToContextSize(text);
            if (truncated) {
              console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`);
            }
            const embedding = await ctx.getEmbeddingFor(safeText);
            // Keep the models alive while a long batch is progressing.
            this.touchActivity();
            results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
          } catch (err) {
            console.error("Embedding error for text:", err);
            results.push(null);
          }
        }
        return results;
      };

      if (n === 1) {
        // Single context: sequential (no point splitting)
        return await embedSequentially(contexts[0]!, texts);
      }

      // Multiple contexts: split texts across contexts for parallel evaluation
      const chunkSize = Math.ceil(texts.length / n);
      const chunkResults = await Promise.all(
        contexts.map((ctx, i) =>
          embedSequentially(ctx, texts.slice(i * chunkSize, (i + 1) * chunkSize))
        )
      );
      return chunkResults.flat();
    } catch (error) {
      console.error("Batch embedding error:", error);
      return texts.map(() => null);
    }
  }
  /**
   * Generate text for a prompt with the generate model.
   *
   * A fresh context, sequence and chat session are created for every call and
   * the context is disposed afterwards, including when setting up the
   * sequence/session or prompting fails.
   *
   * @param prompt Prompt text sent to the chat session.
   * @param options `maxTokens` (default 150) and `temperature` (default 0.7).
   * @returns The generated text, the generate model URI and `done: true`.
   * @throws Error when LLM operations are disabled in CI mode; errors from
   *   model loading, context creation or prompting propagate to the caller.
   */
  async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
    if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();

    // Ensure model is loaded
    await this.ensureGenerateModel();

    // Create fresh context -> sequence -> session for each call
    const context = await this.generateModel!.createContext();
    try {
      // Inside the try so the context is released even if these steps throw.
      const sequence = context.getSequence();
      const session = new LlamaChatSession({ contextSequence: sequence });

      const maxTokens = options.maxTokens ?? 150;
      // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
      // DO NOT use greedy decoding (temp=0) - causes repetition loops
      const temperature = options.temperature ?? 0.7;

      let result = "";
      await session.prompt(prompt, {
        maxTokens,
        temperature,
        topK: 20,
        topP: 0.8,
        onTextChunk: (text) => {
          result += text;
        },
      });

      return {
        text: result,
        model: this.generateModelUri,
        done: true,
      };
    } finally {
      // Dispose context (which disposes dependent sequences/sessions per lifecycle rules)
      await context.dispose();
    }
  }
  /**
   * Report whether a model is available.
   *
   * URIs starting with "hf:" (HuggingFace) are assumed to exist without any
   * check. Anything else is treated as a local path and checked on disk.
   *
   * @param modelUri HuggingFace URI or local file path.
   * @returns The model name, whether it exists and, for local files that
   *   exist, the path.
   */
  async modelExists(modelUri: string): Promise<ModelInfo> {
    const isHuggingFaceUri = modelUri.startsWith("hf:");
    if (isHuggingFaceUri) {
      return { name: modelUri, exists: true };
    }

    if (existsSync(modelUri)) {
      return { name: modelUri, exists: true, path: modelUri };
    }
    return { name: modelUri, exists: false, path: undefined };
  }
  900. // ==========================================================================
  901. // High-level abstractions
  902. // ==========================================================================
  /**
   * Expand a search query into typed sub-queries ("lex", "vec", "hyde") using
   * the generate model constrained by a line-oriented grammar.
   *
   * Generated lines are kept only when they have a known type and share at
   * least one term with the original query (terms are the lower-cased ASCII
   * alphanumeric words of the query; if the query has none, every line is
   * kept). If nothing usable remains, a fixed fallback built from the
   * original query is returned. If prompting fails, the error is logged and a
   * smaller fallback is returned instead of throwing.
   *
   * The generation context is disposed in every case, including when setting
   * up the sequence or session throws.
   *
   * @param query The user's search query.
   * @param options `includeLexical` (default true) drops "lex" entries when
   *   false; `intent` is appended to the prompt when set; `context` is
   *   accepted but not read by this method.
   * @returns The expanded queries, never empty unless `includeLexical` is
   *   false and only lexical entries would remain in a fallback.
   * @throws Error when LLM operations are disabled in CI mode; errors from
   *   model loading, grammar creation or context creation propagate.
   */
  async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
    if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();

    const llama = await this.ensureLlama();
    await this.ensureGenerateModel();

    const includeLexical = options.includeLexical ?? true;

    // Each output line must look like `<type>: <content>\n`.
    const grammarText = [
      "",
      "root ::= line+",
      'line ::= type ": " content "\\n"',
      'type ::= "lex" | "vec" | "hyde"',
      "content ::= [^\\n]+",
      "",
    ].join("\n");
    const grammar = await llama.createGrammar({ grammar: grammarText });

    const intent = options.intent;
    const prompt = intent
      ? `/no_think Expand this search query: ${query}\nQuery intent: ${intent}`
      : `/no_think Expand this search query: ${query}`;

    // Create a bounded context for expansion to prevent large default VRAM allocations.
    const genContext = await this.generateModel!.createContext({
      contextSize: this.expandContextSize,
    });

    try {
      // Inside the outer try so the context is released even if these throw.
      const sequence = genContext.getSequence();
      const session = new LlamaChatSession({ contextSequence: sequence });

      try {
        // Qwen3 recommended settings for non-thinking mode:
        // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
        // DO NOT use greedy decoding (temp=0) - causes infinite loops
        const result = await session.prompt(prompt, {
          grammar,
          maxTokens: 600,
          temperature: 0.7,
          topK: 20,
          topP: 0.8,
          repeatPenalty: {
            lastTokens: 64,
            presencePenalty: 0.5,
          },
        });

        const lines = result.trim().split("\n");
        const queryLower = query.toLowerCase();
        const queryTerms = queryLower.replace(/[^a-z0-9\s]/g, " ").split(/\s+/).filter(Boolean);

        // A generated line is relevant when it mentions any query term.
        const hasQueryTerm = (text: string): boolean => {
          const lower = text.toLowerCase();
          if (queryTerms.length === 0) return true;
          return queryTerms.some(term => lower.includes(term));
        };

        const queryables: Queryable[] = lines.map(line => {
          const colonIdx = line.indexOf(":");
          if (colonIdx === -1) return null;
          const type = line.slice(0, colonIdx).trim();
          if (type !== 'lex' && type !== 'vec' && type !== 'hyde') return null;
          const text = line.slice(colonIdx + 1).trim();
          if (!hasQueryTerm(text)) return null;
          return { type: type as QueryType, text };
        }).filter((q): q is Queryable => q !== null);

        // Filter out lex entries if not requested
        const filtered = includeLexical ? queryables : queryables.filter(q => q.type !== 'lex');
        if (filtered.length > 0) return filtered;

        const fallback: Queryable[] = [
          { type: 'hyde', text: `Information about ${query}` },
          { type: 'lex', text: query },
          { type: 'vec', text: query },
        ];
        return includeLexical ? fallback : fallback.filter(q => q.type !== 'lex');
      } catch (error) {
        console.error("Structured query expansion failed:", error);
        // Fallback to original query
        const fallback: Queryable[] = [{ type: 'vec', text: query }];
        if (includeLexical) fallback.unshift({ type: 'lex', text: query });
        return fallback;
      }
    } finally {
      await genContext.dispose();
    }
  }
  /**
   * Token allowance reserved for the Qwen3 reranker chat template (system
   * prompt, tags, separators). Measured at ~350 tokens on real queries; 512
   * is used as a safe upper bound so the truncation budget never lets a
   * document slip past the context limit.
   */
  private static readonly RERANK_TEMPLATE_OVERHEAD = 512;

  /** Target number of unique documents handed to each rerank context. */
  private static readonly RERANK_TARGET_DOCS_PER_CONTEXT = 10;

  /**
   * Score documents against a query with the rerank model.
   *
   * Steps:
   * 1. Truncate each document to the token budget
   *    `RERANK_CONTEXT_SIZE - RERANK_TEMPLATE_OVERHEAD - queryTokens`
   *    (never less than 1 token).
   * 2. Deduplicate identical effective texts so each is scored once.
   * 3. Split the unique texts across the available rerank contexts and score
   *    the chunks in parallel.
   * 4. Sort by score (descending) and expand back to one result per input
   *    document; documents sharing a text share its score.
   *
   * @param query Query to rank against.
   * @param documents Documents to score; `index` in each result refers to the
   *   position in this array.
   * @param options Accepted for interface compatibility; not read here.
   * @returns Results ordered by descending score plus the rerank model URI.
   *   An empty `documents` array yields empty results without loading models.
   * @throws Error when LLM operations are disabled in CI mode, or when no
   *   rerank context is available.
   */
  async rerank(
    query: string,
    documents: RerankDocument[],
    options: RerankOptions = {}
  ): Promise<RerankResult> {
    if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();

    // Nothing to score: do not load models or allocate contexts for it.
    if (documents.length === 0) {
      return { results: [], model: this.rerankModelUri };
    }

    const contexts = await this.ensureRerankContexts();
    if (contexts.length === 0) {
      // Without a context the chunking below would yield undefined scores.
      throw new Error("Failed to create any rerank context");
    }
    const model = await this.ensureRerankModel();

    // Truncate documents that would exceed the rerank context size.
    // Budget = contextSize - template overhead - query tokens.
    // Clamped to at least 1: a non-positive budget (very long query or a
    // small QMD_RERANK_CONTEXT_SIZE) would make slice(0, budget) count from
    // the end and keep almost the whole document.
    const queryTokens = model.tokenize(query).length;
    const maxDocTokens = Math.max(
      1,
      LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens
    );

    // Original text -> effective (possibly truncated) text, so repeated
    // documents are tokenized only once.
    const truncationCache = new Map<string, string>();
    const truncatedDocs = documents.map((doc) => {
      const cached = truncationCache.get(doc.text);
      if (cached !== undefined) {
        return cached === doc.text ? doc : { ...doc, text: cached };
      }
      const tokens = model.tokenize(doc.text);
      const truncatedText = tokens.length <= maxDocTokens
        ? doc.text
        : model.detokenize(tokens.slice(0, maxDocTokens));
      truncationCache.set(doc.text, truncatedText);
      if (truncatedText === doc.text) return doc;
      return { ...doc, text: truncatedText };
    });

    // Deduplicate identical effective texts before scoring.
    // This avoids redundant work for repeated chunks and fixes collisions where
    // multiple docs map to the same chunk text.
    const textToDocs = new Map<string, { file: string; index: number }[]>();
    truncatedDocs.forEach((doc, index) => {
      const existing = textToDocs.get(doc.text);
      if (existing) {
        existing.push({ file: doc.file, index });
      } else {
        textToDocs.set(doc.text, [{ file: doc.file, index }]);
      }
    });

    // Extract just the text for ranking
    const texts = Array.from(textToDocs.keys());

    // Split documents across contexts for parallel evaluation.
    // Each context has its own sequence with a lock, so parallelism comes
    // from multiple contexts evaluating different chunks simultaneously.
    const activeContextCount = Math.max(
      1,
      Math.min(
        contexts.length,
        Math.ceil(texts.length / LlamaCpp.RERANK_TARGET_DOCS_PER_CONTEXT)
      )
    );
    const activeContexts = contexts.slice(0, activeContextCount);
    const chunkSize = Math.ceil(texts.length / activeContexts.length);
    // Empty chunks can only occur at the tail, so chunk i still pairs with context i.
    const chunks = Array.from({ length: activeContexts.length }, (_, i) =>
      texts.slice(i * chunkSize, (i + 1) * chunkSize)
    ).filter(chunk => chunk.length > 0);

    const allScores = await Promise.all(
      chunks.map((chunk, i) => activeContexts[i]!.rankAll(query, chunk))
    );

    // Reassemble scores in original order and sort
    const flatScores = allScores.flat();
    const ranked = texts
      .map((text, i) => ({ document: text, score: flatScores[i]! }))
      .sort((a, b) => b.score - a.score);

    // Map back to our result format.
    const results: RerankDocumentResult[] = [];
    for (const item of ranked) {
      const docInfos = textToDocs.get(item.document) ?? [];
      for (const docInfo of docInfos) {
        results.push({
          file: docInfo.file,
          score: item.score,
          index: docInfo.index,
        });
      }
    }

    return {
      results,
      model: this.rerankModelUri,
    };
  }
  /**
   * Collect device/GPU information for status display.
   * Initializes llama if that has not happened yet.
   *
   * @returns The GPU backend (or `false`), whether GPU offloading is
   *   supported, GPU device names, VRAM totals when a GPU is present and the
   *   query succeeds, and the CPU math core count.
   */
  async getDeviceInfo(): Promise<{
    gpu: string | false;
    gpuOffloading: boolean;
    gpuDevices: string[];
    vram?: { total: number; used: number; free: number };
    cpuCores: number;
  }> {
    const llama = await this.ensureLlama();

    // VRAM is only queried when a GPU backend is active; failures mean "unknown".
    const readVram = async (): Promise<{ total: number; used: number; free: number } | undefined> => {
      if (!llama.gpu) return undefined;
      try {
        const state = await llama.getVramState();
        return { total: state.total, used: state.used, free: state.free };
      } catch {
        /* no vram info */
        return undefined;
      }
    };

    const gpuDevices = await llama.getGpuDeviceNames();
    const vram = await readVram();

    return {
      gpu: llama.gpu,
      gpuOffloading: llama.supportsGpuOffloading,
      gpuDevices,
      vram,
      cpuCores: llama.cpuMathCores,
    };
  }
  /**
   * Release the llama runtime and drop every cached model, context and
   * in-flight load promise. Safe to call more than once; only the first call
   * does any work.
   *
   * `llama.dispose()` is raced against a 1 second timeout because it can hang
   * indefinitely. The timeout timer is cleared as soon as the race settles so
   * it does not keep the event loop alive after a fast dispose.
   */
  async dispose(): Promise<void> {
    // Prevent double-dispose
    if (this.disposed) {
      return;
    }
    this.disposed = true;

    // Clear inactivity timer
    if (this.inactivityTimer) {
      clearTimeout(this.inactivityTimer);
      this.inactivityTimer = null;
    }

    // Disposing llama cascades to models and contexts automatically
    // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
    // Note: llama.dispose() can hang indefinitely, so we use a timeout
    if (this.llama) {
      let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
      const timeoutPromise = new Promise<void>((resolve) => {
        timeoutHandle = setTimeout(resolve, 1000);
      });
      const disposePromise = this.llama.dispose();
      try {
        await Promise.race([disposePromise, timeoutPromise]);
      } finally {
        // Do not leave a pending 1 s timer behind once the race is decided.
        clearTimeout(timeoutHandle);
      }
    }

    // Clear references
    this.embedContexts = [];
    this.rerankContexts = [];
    this.embedModel = null;
    this.generateModel = null;
    this.rerankModel = null;
    this.llama = null;

    // Clear any in-flight load/create promises
    this.embedModelLoadPromise = null;
    this.embedContextsCreatePromise = null;
    this.generateModelLoadPromise = null;
    this.rerankModelLoadPromise = null;
  }
  1127. }
  1128. // =============================================================================
  1129. // Session Management Layer
  1130. // =============================================================================
  /**
   * Reference-counting bookkeeper for LLM sessions bound to one LlamaCpp
   * instance.
   *
   * Tracks two counters: how many sessions are currently held and how many
   * operations are currently running. Idle unload is considered safe only
   * when both are zero. Neither counter can drop below zero.
   */
  class LLMSessionManager {
    private llm: LlamaCpp;
    private _activeSessionCount = 0;
    private _inFlightOperations = 0;

    /** @param llm The LlamaCpp instance this manager hands out to sessions. */
    constructor(llm: LlamaCpp) {
      this.llm = llm;
    }

    /** Number of sessions acquired and not yet released. */
    get activeSessionCount(): number {
      return this._activeSessionCount;
    }

    /** Number of operations started and not yet ended. */
    get inFlightOperations(): number {
      return this._inFlightOperations;
    }

    /**
     * Whether idle unload is safe.
     * @returns true only when there are no active sessions and no in-flight operations.
     */
    canUnload(): boolean {
      const busy = this._activeSessionCount !== 0 || this._inFlightOperations !== 0;
      return !busy;
    }

    /** Register one more active session. */
    acquire(): void {
      this._activeSessionCount += 1;
    }

    /** Unregister one active session; a no-op when the count is already zero. */
    release(): void {
      if (this._activeSessionCount > 0) {
        this._activeSessionCount -= 1;
      }
    }

    /** Register one more in-flight operation. */
    operationStart(): void {
      this._inFlightOperations += 1;
    }

    /** Unregister one in-flight operation; a no-op when the count is already zero. */
    operationEnd(): void {
      if (this._inFlightOperations > 0) {
        this._inFlightOperations -= 1;
      }
    }

    /** @returns The LlamaCpp instance passed to the constructor. */
    getLlamaCpp(): LlamaCpp {
      return this.llm;
    }
  }
  /**
   * Raised when an operation is attempted on a session that has already been
   * released or aborted.
   */
  export class SessionReleasedError extends Error {
    /** @param message Error text; defaults to a generic released/aborted notice. */
    constructor(message: string = "LLM session has been released or aborted") {
      super(message);
      // Set explicitly so the name survives in logs and `error.name` checks.
      this.name = "SessionReleasedError";
    }
  }
  /**
   * Scoped LLM session with automatic lifecycle management.
   *
   * Wraps LlamaCpp methods with operation tracking and abort handling:
   * - constructing a session acquires a lease on the manager; `release()`
   *   gives it back exactly once;
   * - an optional external AbortSignal and an optional max-duration timer
   *   both abort the session's own signal;
   * - every wrapped call is counted as an in-flight operation on the manager
   *   and is refused with SessionReleasedError once the session is released
   *   or aborted.
   */
  class LLMSession implements ILLMSession {
    private manager: LLMSessionManager;
    private released = false;
    private abortController: AbortController;
    private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
    private name: string;
    /** Removes the listener registered on the caller-supplied signal, if any. */
    private detachExternalSignal: (() => void) | null = null;

    /**
     * @param manager Manager that tracks this session's lease and operations.
     * @param options `name` (default "unnamed"), `signal` to link an external
     *   abort signal, and `maxDuration` in ms (default 10 minutes; values
     *   <= 0 disable the timer).
     */
    constructor(manager: LLMSessionManager, options: LLMSessionOptions = {}) {
      this.manager = manager;
      this.name = options.name || "unnamed";
      this.abortController = new AbortController();

      // Link external abort signal if provided
      const externalSignal = options.signal;
      if (externalSignal) {
        if (externalSignal.aborted) {
          this.abortController.abort(externalSignal.reason);
        } else {
          const onExternalAbort = (): void => {
            this.abortController.abort(externalSignal.reason);
          };
          externalSignal.addEventListener("abort", onExternalAbort, { once: true });
          // Remembered so release() can detach it; otherwise a long-lived
          // signal would keep every finished session reachable.
          this.detachExternalSignal = () => {
            externalSignal.removeEventListener("abort", onExternalAbort);
          };
        }
      }

      // Set up max duration timer
      const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
      if (maxDuration > 0) {
        this.maxDurationTimer = setTimeout(() => {
          this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
        }, maxDuration);
        this.maxDurationTimer.unref(); // Don't keep process alive
      }

      // Acquire session lease
      this.manager.acquire();
    }

    /** True while the session is neither released nor aborted. */
    get isValid(): boolean {
      return !this.released && !this.abortController.signal.aborted;
    }

    /** The session's own abort signal. */
    get signal(): AbortSignal {
      return this.abortController.signal;
    }

    /**
     * Release the session and decrement ref count.
     * Called automatically by withLLMSession when the callback completes.
     * Calling it again is a no-op.
     */
    release(): void {
      if (this.released) return;
      this.released = true;

      if (this.maxDurationTimer) {
        clearTimeout(this.maxDurationTimer);
        this.maxDurationTimer = null;
      }

      if (this.detachExternalSignal) {
        this.detachExternalSignal();
        this.detachExternalSignal = null;
      }

      this.abortController.abort(new Error("Session released"));
      this.manager.release();
    }

    /**
     * Wrap an operation with tracking and abort checking.
     *
     * @throws SessionReleasedError with the default message when the session
     *   was released, or with the abort reason's message (falling back to
     *   "Session aborted") when it was aborted.
     */
    private async withOperation<T>(fn: () => Promise<T>): Promise<T> {
      if (this.released) {
        throw new SessionReleasedError();
      }
      const { signal } = this.abortController;
      if (signal.aborted) {
        throw new SessionReleasedError(signal.reason?.message || "Session aborted");
      }

      this.manager.operationStart();
      try {
        return await fn();
      } finally {
        this.manager.operationEnd();
      }
    }

    /** Tracked wrapper around LlamaCpp.embed. */
    async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
      return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
    }

    /** Tracked wrapper around LlamaCpp.embedBatch. */
    async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
      return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
    }

    /** Tracked wrapper around LlamaCpp.expandQuery; options are forwarded as given. */
    async expandQuery(
      query: string,
      options?: { context?: string; includeLexical?: boolean; intent?: string }
    ): Promise<Queryable[]> {
      return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
    }

    /** Tracked wrapper around LlamaCpp.rerank. */
    async rerank(
      query: string,
      documents: RerankDocument[],
      options?: RerankOptions
    ): Promise<RerankResult> {
      return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
    }
  }
  // Session manager bound to the default LlamaCpp instance; created on first use.
  let defaultSessionManager: LLMSessionManager | null = null;

  /**
   * Return the session manager for the default LlamaCpp instance.
   *
   * The cached manager is reused as long as it still wraps the current
   * default instance; if the default instance has changed (or no manager
   * exists yet) a new manager is created and cached.
   */
  function getSessionManager(): LLMSessionManager {
    const llm = getDefaultLlamaCpp();
    const current = defaultSessionManager;
    if (current && current.getLlamaCpp() === llm) {
      return current;
    }
    const fresh = new LLMSessionManager(llm);
    defaultSessionManager = fresh;
    return fresh;
  }
  /**
   * Run `fn` inside a scoped LLM session on the default LlamaCpp instance.
   *
   * The session is created before `fn` starts and released when `fn` settles,
   * whether it resolves or rejects. While it is held, the session counts as
   * active on the default session manager.
   *
   * @param fn Callback that receives the session and performs LLM work.
   * @param options Session options (name, abort signal, max duration).
   * @returns Whatever `fn` resolves to.
   *
   * @example
   * ```typescript
   * await withLLMSession(async (session) => {
   *   const expanded = await session.expandQuery(query);
   *   const embeddings = await session.embedBatch(texts);
   *   const reranked = await session.rerank(query, docs);
   *   return reranked;
   * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
   * ```
   */
  export async function withLLMSession<T>(
    fn: (session: ILLMSession) => Promise<T>,
    options?: LLMSessionOptions
  ): Promise<T> {
    const session = new LLMSession(getSessionManager(), options);
    try {
      const outcome = await fn(session);
      return outcome;
    } finally {
      session.release();
    }
  }
  /**
   * Run `fn` inside a scoped LLM session bound to the given LlamaCpp instance.
   *
   * Unlike withLLMSession, this does not touch the global singleton: a fresh
   * session manager is created for `llm` on every call. The session is
   * released when `fn` settles, whether it resolves or rejects.
   *
   * @param llm The LlamaCpp instance the session should operate on.
   * @param fn Callback that receives the session and performs LLM work.
   * @param options Session options (name, abort signal, max duration).
   * @returns Whatever `fn` resolves to.
   */
  export async function withLLMSessionForLlm<T>(
    llm: LlamaCpp,
    fn: (session: ILLMSession) => Promise<T>,
    options?: LLMSessionOptions
  ): Promise<T> {
    const session = new LLMSession(new LLMSessionManager(llm), options);
    try {
      const outcome = await fn(session);
      return outcome;
    } finally {
      session.release();
    }
  }
  /**
   * Report whether idle unload is safe for the default instance.
   *
   * @returns true when no default session manager exists yet, otherwise the
   *   manager's own verdict (no active sessions and no in-flight operations).
   */
  export function canUnloadLLM(): boolean {
    return defaultSessionManager ? defaultSessionManager.canUnload() : true;
  }
  1338. // =============================================================================
  1339. // Singleton for default LlamaCpp instance
  1340. // =============================================================================
  // Process-wide default instance; stays null until first requested or set.
  let defaultLlamaCpp: LlamaCpp | null = null;

  /**
   * Return the default LlamaCpp instance, creating it on first use.
   *
   * On creation, a non-empty QMD_EMBED_MODEL environment variable is passed
   * as the `embedModel` option; otherwise the instance is built with no
   * options.
   */
  export function getDefaultLlamaCpp(): LlamaCpp {
    if (defaultLlamaCpp) {
      return defaultLlamaCpp;
    }
    const embedModel = process.env.QMD_EMBED_MODEL;
    const created = new LlamaCpp(embedModel ? { embedModel } : {});
    defaultLlamaCpp = created;
    return created;
  }
  /**
   * Replace the default LlamaCpp instance (useful for testing).
   *
   * The previous default, if any, is only dropped from this module's
   * reference; it is not disposed here.
   *
   * @param llm The instance to use as default, or `null` to clear it.
   */
  export function setDefaultLlamaCpp(llm: LlamaCpp | null): void {
    defaultLlamaCpp = llm;
  }
  /**
   * Dispose the default LlamaCpp instance, if one exists, and clear it.
   * Call this before process exit to prevent NAPI crashes.
   *
   * Does nothing when no default instance has been created or set.
   */
  export async function disposeDefaultLlamaCpp(): Promise<void> {
    const instance = defaultLlamaCpp;
    if (!instance) {
      return;
    }
    await instance.dispose();
    defaultLlamaCpp = null;
  }