llm.ts 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404
  1. /**
  2. * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
  3. *
  4. * Provides embeddings, text generation, and reranking using local GGUF models.
  5. */
  6. import {
  7. getLlama,
  8. getLlamaGpuTypes,
  9. resolveModelFile,
  10. LlamaChatSession,
  11. LlamaLogLevel,
  12. type Llama,
  13. type LlamaModel,
  14. type LlamaEmbeddingContext,
  15. type Token as LlamaToken,
  16. } from "node-llama-cpp";
  17. import { homedir } from "os";
  18. import { join } from "path";
  19. import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync } from "fs";
  20. // =============================================================================
  21. // Embedding Formatting Functions
  22. // =============================================================================
  /**
   * Wrap a search query in the task-prefixed form fed to the embedding model.
   *
   * @param query Raw query text; inserted verbatim after the prefix.
   * @returns The string `task: search result | query: <query>`.
   */
  export function formatQueryForEmbedding(query: string): string {
    const prefix = "task: search result | query: ";
    return prefix + query;
  }
  /**
   * Wrap a document in the `title: ... | text: ...` form fed to the embedding model.
   *
   * @param text Document body; inserted verbatim.
   * @param title Optional title. A missing or empty title is rendered as `none`.
   * @returns The string `title: <title-or-none> | text: <text>`.
   */
  export function formatDocForEmbedding(text: string, title?: string): string {
    // Truthiness check on purpose: "" is treated the same as undefined.
    const titleField = title ? title : "none";
    return "title: " + titleField + " | text: " + text;
  }
  37. // =============================================================================
  38. // Types
  39. // =============================================================================
  /**
   * A token paired with its log probability.
   */
  export type TokenLogProb = {
    /** Token text. */
    token: string;
    /** Log probability associated with the token. */
    logprob: number;
  };
  /**
   * Result of embedding one piece of text.
   */
  export type EmbeddingResult = {
    /** Embedding vector as a plain number array. */
    embedding: Array<number>;
    /** Identifier of the model that produced the vector (LlamaCpp reports its embed model URI). */
    model: string;
  };
  /**
   * Result of a text generation call, optionally carrying per-token log probabilities.
   */
  export type GenerateResult = {
    /** Generated text. */
    text: string;
    /** Identifier of the model that produced the text. */
    model: string;
    /** Per-token log probabilities, when the backend supplies them. */
    logprobs?: Array<TokenLogProb>;
    /** Completion flag; NOTE(review): exact semantics are set by the producer — confirm there. */
    done: boolean;
  };
  /**
   * Rerank outcome for a single document.
   */
  export type RerankDocumentResult = {
    /** File identifier of the document (mirrors RerankDocument.file). */
    file: string;
    /** Relevance score; per the LLM.rerank contract, higher means more relevant. */
    score: number;
    /** Document index; presumably the position in the input list — TODO confirm against the producer. */
    index: number;
  };
  /**
   * Result of reranking a batch of documents.
   */
  export type RerankResult = {
    /** One entry per scored document. */
    results: Array<RerankDocumentResult>;
    /** Identifier of the model that produced the scores. */
    model: string;
  };
  /**
   * Availability information about a model, as returned by LLM.modelExists.
   */
  export type ModelInfo = {
    /** Model name or identifier that was queried. */
    name: string;
    /** Whether the model is available. */
    exists: boolean;
    /** Path of the model, when one is known. */
    path?: string;
  };
  /**
   * Options accepted by embed calls.
   *
   * NOTE(review): the LlamaCpp.embed implementation visible in this file does not read
   * any of these fields — confirm whether callers are expected to pre-format the text.
   */
  export type EmbedOptions = {
    /** Model override. */
    model?: string;
    /** Marks the text as a search query rather than a document. */
    isQuery?: boolean;
    /** Title accompanying a document. */
    title?: string;
  };
  /**
   * Options accepted by text generation calls.
   */
  export type GenerateOptions = {
    /** Model override. */
    model?: string;
    /** Upper bound on the number of generated tokens. */
    maxTokens?: number;
    /** Sampling temperature. */
    temperature?: number;
  };
  /**
   * Options accepted by rerank calls.
   */
  export type RerankOptions = {
    /** Model override. */
    model?: string;
  };
  /**
   * Options controlling the lifetime and labelling of an LLM session.
   */
  export type LLMSessionOptions = {
    /** Maximum session duration in milliseconds (default: 10 minutes). */
    maxDuration?: number;
    /** Abort signal supplied by the caller. */
    signal?: AbortSignal;
    /** Name used to identify the session in debug logging. */
    name?: string;
  };
  /**
   * Scoped handle onto the LLM with lifecycle guarantees: it exposes the embedding,
   * query-expansion and rerank operations plus the session's validity state.
   */
  export interface ILLMSession {
    /** Embed a single text; resolves to null when embedding fails. */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;

    /** Embed several texts; each entry is a result or null. */
    embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]>;

    /** Expand a query into typed variations for the search backends. */
    expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]>;

    /** Score documents by relevance to the query. */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;

    /** True while the session has been neither released nor aborted. */
    readonly isValid: boolean;

    /** Signal for this session; it aborts on release or when maxDuration elapses. */
    readonly signal: AbortSignal;
  }
  /**
   * Query kinds understood by the search backends.
   */
  export type QueryType =
    | 'lex'
    | 'vec'
    | 'hyde';
  /**
   * One query string together with the backend kind it targets.
   */
  export type Queryable = {
    /** Backend kind this query is meant for. */
    type: QueryType;
    /** Query text. */
    text: string;
  };
  /**
   * A candidate document handed to the reranker.
   */
  export type RerankDocument = {
    /** File identifier of the document. */
    file: string;
    /** Text to be scored against the query. */
    text: string;
    /** Optional document title. */
    title?: string;
  };
  151. // =============================================================================
  152. // Model Configuration
  153. // =============================================================================
  // Model URIs in the HuggingFace form understood by node-llama-cpp: hf:<user>/<repo>/<file>

  // Embedding model.
  const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
  export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;

  // Reranking model.
  const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;

  // Generation (query expansion) model. Disabled alternative kept for reference:
  // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
  const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
  export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;

  // Alternative generation models for query expansion:
  // LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference.
  // Use these as a base for fine-tuning with configs/sft_lfm2.yaml
  export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
  export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";

  // Directory where downloaded models are cached: ~/.cache/qmd/models
  const MODEL_CACHE_DIR = join(homedir(), ".cache", "qmd", "models");
  export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
  /**
   * Outcome of pulling one model with pullModels.
   */
  export type PullResult = {
    /** Model URI or path exactly as passed in. */
    model: string;
    /** Local path the model resolved to. */
    path: string;
    /** Size of the file at `path` in bytes, or 0 when the file does not exist. */
    sizeBytes: number;
    /** True when previously cached files were removed before resolving. */
    refreshed: boolean;
  };
  /**
   * Repository and file parts of an `hf:<user>/<repo>/<file>` model URI.
   */
  type HfRef = {
    /** `<user>/<repo>` */
    repo: string;
    /** File path inside the repository; may contain further `/` separators. */
    file: string;
  };

  /**
   * Split an `hf:<user>/<repo>/<file>` URI into its repository and file parts.
   *
   * @param model Model URI or path.
   * @returns The parsed reference, or null when `model` is not an `hf:` URI, has fewer
   *          than three path segments, or contains an empty segment (for example
   *          `hf:user//file.gguf` or a trailing slash). Empty segments used to yield a
   *          reference with an empty repo or file, which then built a bogus download URL.
   */
  function parseHfUri(model: string): HfRef | null {
    const scheme = "hf:";
    if (!model.startsWith(scheme)) return null;

    const segments = model.slice(scheme.length).split("/");
    if (segments.length < 3) return null;
    if (segments.some((segment) => segment.length === 0)) return null;

    const [user, repoName, ...fileParts] = segments;
    return { repo: `${user}/${repoName}`, file: fileParts.join("/") };
  }
  /**
   * Look up the ETag HuggingFace currently reports for a model file.
   *
   * Issues a HEAD request against the file's `resolve/main` URL.
   *
   * @param ref Repository and file to look up.
   * @returns The `etag` header value, or null when the request fails, the response is
   *          not OK, or the header is missing or empty.
   */
  async function getRemoteEtag(ref: HfRef): Promise<string | null> {
    const url = "https://huggingface.co/" + ref.repo + "/resolve/main/" + ref.file;
    try {
      const response = await fetch(url, { method: "HEAD" });
      return response.ok ? response.headers.get("etag") || null : null;
    } catch {
      // Best effort: any network failure is reported as "unknown".
      return null;
    }
  }
  /**
   * Make sure each model is present in the local cache, re-downloading stale copies.
   *
   * For `hf:` URIs the remote ETag is compared with a `<filename>.etag` sidecar file in
   * the cache directory. Cached copies are removed before resolving when a refresh is
   * requested, when the remote ETag is known and differs from the recorded one, or when
   * nothing is cached yet. For other model references, cached copies are removed only
   * when `options.refresh` is set.
   *
   * Differences from the previous implementation:
   * - An unknown remote ETag (failed HEAD request, e.g. offline) no longer deletes a
   *   cached model that could not have been re-downloaded anyway.
   * - `.etag` sidecar files are no longer counted as cached model files.
   * - The ETag read before the download is the one recorded afterwards. The second HEAD
   *   request is gone, and so is the chance of recording an ETag newer than the file.
   *
   * @param models Model URIs or paths to resolve.
   * @param options.refresh Force removal of cached copies before resolving.
   * @param options.cacheDir Cache directory; defaults to MODEL_CACHE_DIR.
   * @returns One PullResult per input model, in input order.
   */
  export async function pullModels(
    models: string[],
    options: { refresh?: boolean; cacheDir?: string } = {}
  ): Promise<PullResult[]> {
    const cacheDir = options.cacheDir || MODEL_CACHE_DIR;
    if (!existsSync(cacheDir)) {
      mkdirSync(cacheDir, { recursive: true });
    }

    const results: PullResult[] = [];
    for (const model of models) {
      let refreshed = false;
      const hfRef = parseHfUri(model);
      const filename = model.split("/").pop();

      // Cached files are matched by substring because the cached name may carry a prefix.
      // The ".etag" sidecar contains the filename too, so it is excluded explicitly.
      const entries = readdirSync(cacheDir, { withFileTypes: true });
      const cached = filename
        ? entries
            .filter(
              (entry) =>
                entry.isFile() && entry.name.includes(filename) && !entry.name.endsWith(".etag")
            )
            .map((entry) => join(cacheDir, entry.name))
        : [];

      let remoteEtag: string | null = null;
      if (hfRef && filename) {
        const etagPath = join(cacheDir, `${filename}.etag`);
        remoteEtag = await getRemoteEtag(hfRef);
        const localEtag = existsSync(etagPath)
          ? readFileSync(etagPath, "utf-8").trim()
          : null;

        // Only a known remote ETag can prove the cached copy stale.
        const isStale = remoteEtag !== null && remoteEtag !== localEtag;
        const shouldRefresh = options.refresh || isStale || cached.length === 0;
        if (shouldRefresh) {
          for (const candidate of cached) {
            if (existsSync(candidate)) unlinkSync(candidate);
          }
          if (existsSync(etagPath)) unlinkSync(etagPath);
          refreshed = cached.length > 0;
        }
      } else if (options.refresh && filename) {
        for (const candidate of cached) {
          if (existsSync(candidate)) unlinkSync(candidate);
          refreshed = true;
        }
      }

      const path = await resolveModelFile(model, cacheDir);
      const sizeBytes = existsSync(path) ? statSync(path).size : 0;

      // Record the ETag observed before the download.
      if (hfRef && filename && remoteEtag) {
        writeFileSync(join(cacheDir, `${filename}.etag`), remoteEtag + "\n", "utf-8");
      }

      results.push({ model, path, sizeBytes, refreshed });
    }
    return results;
  }
  254. // =============================================================================
  255. // LLM Interface
  256. // =============================================================================
  /**
   * Backend-agnostic LLM contract. Implement it to plug in a different backend.
   */
  export interface LLM {
    /** Compute an embedding for `text`; the result may be null. */
    embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;

    /** Produce a text completion for `prompt`; the result may be null. */
    generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;

    /** Report whether `model` exists or is available. */
    modelExists(model: string): Promise<ModelInfo>;

    /**
     * Expand a search query into several variations aimed at different backends.
     * Resolves to a list of Queryable objects.
     */
    expandQuery(query: string, options?: { context?: string; includeLexical?: boolean }): Promise<Queryable[]>;

    /**
     * Rank documents by relevance to `query`.
     * Resolves to the documents with relevance scores (higher = more relevant).
     */
    rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;

    /** Release the resources held by this instance. */
    dispose(): Promise<void>;
  }
  288. // =============================================================================
  289. // node-llama-cpp Implementation
  290. // =============================================================================
  /**
   * Construction options for LlamaCpp. Every field is optional.
   */
  export type LlamaCppConfig = {
    /** Embedding model URI or path (default: DEFAULT_EMBED_MODEL). */
    embedModel?: string;
    /** Generation model URI or path (default: DEFAULT_GENERATE_MODEL). */
    generateModel?: string;
    /** Rerank model URI or path (default: DEFAULT_RERANK_MODEL). */
    rerankModel?: string;
    /** Directory where models are cached (default: MODEL_CACHE_DIR). */
    modelCacheDir?: string;
    /**
     * Inactivity timeout in ms before unloading contexts (default: 5 minutes, see
     * DEFAULT_INACTIVITY_TIMEOUT_MS; 0 to disable).
     *
     * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
     * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
     * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
     */
    inactivityTimeoutMs?: number;
    /**
     * Whether to dispose models on inactivity (default: false).
     *
     * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
     * memory reclaim.
     */
    disposeModelsOnInactivity?: boolean;
  };
  // Default inactivity timeout: 5 minutes (keep models warm during typical search sessions)
  const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;

  /**
   * LLM implementation using node-llama-cpp
   */
  317. export class LlamaCpp implements LLM {
  318. private llama: Llama | null = null;
  319. private embedModel: LlamaModel | null = null;
  320. private embedContexts: LlamaEmbeddingContext[] = [];
  321. private generateModel: LlamaModel | null = null;
  322. private rerankModel: LlamaModel | null = null;
  323. private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
  324. private embedModelUri: string;
  325. private generateModelUri: string;
  326. private rerankModelUri: string;
  327. private modelCacheDir: string;
  328. // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
  329. private embedModelLoadPromise: Promise<LlamaModel> | null = null;
  330. private generateModelLoadPromise: Promise<LlamaModel> | null = null;
  331. private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
  332. // Inactivity timer for auto-unloading models
  333. private inactivityTimer: ReturnType<typeof setTimeout> | null = null;
  334. private inactivityTimeoutMs: number;
  335. private disposeModelsOnInactivity: boolean;
  336. // Track disposal state to prevent double-dispose
  337. private disposed = false;
  /**
   * Record the configuration. No model is loaded here; loading happens lazily on first use.
   *
   * @param config Optional overrides; omitted fields fall back to the module defaults.
   */
  constructor(config: LlamaCppConfig = {}) {
    const {
      embedModel,
      generateModel,
      rerankModel,
      modelCacheDir,
      inactivityTimeoutMs,
      disposeModelsOnInactivity,
    } = config;

    // String settings use ||, so an empty string also falls back to the default.
    this.embedModelUri = embedModel || DEFAULT_EMBED_MODEL;
    this.generateModelUri = generateModel || DEFAULT_GENERATE_MODEL;
    this.rerankModelUri = rerankModel || DEFAULT_RERANK_MODEL;
    this.modelCacheDir = modelCacheDir || MODEL_CACHE_DIR;

    // These use ??, so an explicit 0 / false is respected.
    this.inactivityTimeoutMs = inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
    this.disposeModelsOnInactivity = disposeModelsOnInactivity ?? false;
  }
  /**
   * Restart the inactivity countdown.
   *
   * Any pending timer is cancelled. A new one is armed only when the timeout is positive
   * and at least one context is loaded. When it fires, idle resources are unloaded unless
   * `canUnloadLLM()` reports that unloading is not allowed, in which case the countdown
   * simply starts over.
   */
  private touchActivity(): void {
    if (this.inactivityTimer) {
      clearTimeout(this.inactivityTimer);
      this.inactivityTimer = null;
    }

    // Nothing to arm when the timeout is disabled or there is nothing to unload.
    if (!(this.inactivityTimeoutMs > 0) || !this.hasLoadedContexts()) {
      return;
    }

    const onIdle = (): void => {
      // canUnloadLLM is expected to be defined later in this file (session manager check).
      // The typeof guard tolerates it being unavailable.
      const unloadBlocked = typeof canUnloadLLM === 'function' && !canUnloadLLM();
      if (unloadBlocked) {
        this.touchActivity();
        return;
      }
      this.unloadIdleResources().catch(err => {
        console.error("Error unloading idle resources:", err);
      });
    };

    const timer = setTimeout(onIdle, this.inactivityTimeoutMs);
    this.inactivityTimer = timer;
    // The timer alone must not keep the process alive.
    timer.unref();
  }
  /**
   * Tell whether any embedding or rerank context is currently held, i.e. whether there is
   * anything for the inactivity timer to unload.
   */
  private hasLoadedContexts(): boolean {
    const embedCount = this.embedContexts.length;
    const rerankCount = this.rerankContexts.length;
    return embedCount > 0 || rerankCount > 0;
  }
  /**
   * Unload idle resources but keep the instance alive for future use.
   *
   * Disposes every embedding and rerank context. Models are disposed too only when
   * `disposeModelsOnInactivity` is set. The llama instance itself is always kept.
   * Does nothing once the instance has been disposed.
   *
   * Resources are detached from the instance before any disposal is awaited, so a
   * concurrent caller never receives a context or model that is mid-disposal; it creates
   * fresh ones instead. Every resource gets a disposal attempt even if an earlier one
   * fails.
   *
   * The in-flight load promises are deliberately left alone. The ensure*Model methods
   * clear them in their own `finally` blocks, and clearing them here while a load is
   * running would let a second, duplicate load start.
   *
   * @throws The first disposal error encountered, after all disposals were attempted.
   */
  async unloadIdleResources(): Promise<void> {
    if (this.disposed) {
      return;
    }

    if (this.inactivityTimer) {
      clearTimeout(this.inactivityTimer);
      this.inactivityTimer = null;
    }

    let failed = false;
    let firstError: unknown;
    const remember = (err: unknown): void => {
      if (!failed) {
        failed = true;
        firstError = err;
      }
    };

    // Contexts first: detach the pools, then dispose what they held.
    const idleContexts = [...this.embedContexts, ...this.rerankContexts];
    this.embedContexts = [];
    this.rerankContexts = [];
    for (const ctx of idleContexts) {
      try {
        await ctx.dispose();
      } catch (err) {
        remember(err);
      }
    }

    // Models are opt-in.
    if (this.disposeModelsOnInactivity) {
      const idleModels = [this.embedModel, this.generateModel, this.rerankModel];
      this.embedModel = null;
      this.generateModel = null;
      this.rerankModel = null;
      for (const model of idleModels) {
        if (!model) continue;
        try {
          await model.dispose();
        } catch (err) {
          remember(err);
        }
      }
    }

    if (failed) {
      throw firstError;
    }
  }
  /**
   * Create the model cache directory (including missing parents) unless it already exists.
   */
  private ensureModelCacheDir(): void {
    const dir = this.modelCacheDir;
    if (existsSync(dir)) {
      return;
    }
    mkdirSync(dir, { recursive: true });
  }
  /**
   * Initialize the llama instance (lazy) and return it.
   *
   * The available GPU backends are probed and the first of CUDA, Metal, Vulkan is used.
   * If that backend fails to initialize, or none is reported, the instance runs on CPU.
   * Warnings about a CPU fallback are written to stderr.
   *
   * Concurrent callers share one in-flight initialization. Without the guard, two callers
   * arriving before the first `getLlama` resolved (for example the embed and rerank
   * models loading in parallel) would each create their own instance.
   *
   * @returns The shared Llama instance.
   */
  private llamaInitPromise: Promise<Llama> | null = null;
  private async ensureLlama(): Promise<Llama> {
    if (this.llama) {
      return this.llama;
    }
    if (this.llamaInitPromise) {
      return await this.llamaInitPromise;
    }

    this.llamaInitPromise = (async () => {
      // Detect available GPU types and use the best one.
      // We can't rely on gpu:"auto" — it returns false even when CUDA is available
      // (likely a binary/build config issue in node-llama-cpp).
      // @ts-expect-error node-llama-cpp API compat
      const gpuTypes = await getLlamaGpuTypes();
      // Prefer CUDA > Metal > Vulkan > CPU
      const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));

      let llama: Llama;
      if (preferred) {
        try {
          llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
        } catch {
          llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
          process.stderr.write(
            `QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`
          );
        }
      } else {
        llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
      }

      if (!llama.gpu) {
        process.stderr.write(
          "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
        );
      }

      this.llama = llama;
      return llama;
    })();

    try {
      return await this.llamaInitPromise;
    } finally {
      // Keep the resolved instance cached; clear only the in-flight promise.
      this.llamaInitPromise = null;
    }
  }
  /**
   * Turn a model URI into a local file path, downloading into the cache dir when needed.
   *
   * @param modelUri Model URI or path; HF URIs are handled by resolveModelFile.
   * @returns Local path of the model file.
   */
  private async resolveModel(modelUri: string): Promise<string> {
    this.ensureModelCacheDir();
    const localPath = await resolveModelFile(modelUri, this.modelCacheDir);
    return localPath;
  }
  /**
   * Return the embedding model, loading it on first use.
   *
   * Callers that arrive while a load is running await that same load instead of
   * starting another one.
   */
  private async ensureEmbedModel(): Promise<LlamaModel> {
    const loaded = this.embedModel;
    if (loaded) {
      return loaded;
    }

    const inFlight = this.embedModelLoadPromise;
    if (inFlight) {
      return await inFlight;
    }

    const load = async (): Promise<LlamaModel> => {
      const llama = await this.ensureLlama();
      const modelPath = await this.resolveModel(this.embedModelUri);
      const model = await llama.loadModel({ modelPath });
      this.embedModel = model;
      // Loading counts as activity.
      this.touchActivity();
      return model;
    };

    const pending = load();
    this.embedModelLoadPromise = pending;
    try {
      return await pending;
    } finally {
      // The model itself stays cached; only the in-flight marker is cleared.
      this.embedModelLoadPromise = null;
    }
  }
  /**
   * Decide how many parallel contexts to create.
   *
   * GPU: 25% of the free VRAM divided by `perContextMB`, clamped to 1..8. If the VRAM
   * state cannot be read, 2 is used.
   * CPU: one context per 4 math cores (core count defaults to 4 when unreported),
   * clamped to 1..4.
   *
   * @param perContextMB Estimated memory footprint of one context, in MB.
   * @returns Number of contexts to create (at least 1).
   */
  private async computeParallelism(perContextMB: number): Promise<number> {
    const llama = await this.ensureLlama();

    if (!llama.gpu) {
      const mathCores = llama.cpuMathCores || 4;
      const byCores = Math.floor(mathCores / 4);
      return Math.max(1, Math.min(4, byCores));
    }

    try {
      const { free } = await llama.getVramState();
      const freeMB = free / (1024 * 1024);
      const byVram = Math.floor((freeMB * 0.25) / perContextMB);
      return Math.max(1, Math.min(8, byVram));
    } catch {
      return 2;
    }
  }
  /**
   * Work out the thread count for each of `parallelism` contexts.
   *
   * @param parallelism Number of contexts that will run side by side.
   * @returns 0 on GPU (the library picks), otherwise the math cores divided evenly
   *          across the contexts, at least 1. The core count defaults to 4 when unreported.
   */
  private async threadsPerContext(parallelism: number): Promise<number> {
    const llama = await this.ensureLlama();
    if (llama.gpu) {
      return 0;
    }
    const mathCores = llama.cpuMathCores || 4;
    const share = Math.floor(mathCores / parallelism);
    return Math.max(1, share);
  }
  /**
   * Return the pool of embedding contexts, creating it on first use.
   *
   * The pool size comes from computeParallelism (sized at 150 MB per context). Creation
   * stops at the first failure and keeps the contexts created so far.
   *
   * Concurrent callers share one in-flight creation. The pool is built in a local array
   * and published only when complete. It used to be filled in place, so a caller arriving
   * after the first context was pushed skipped the guard and saw a partial pool.
   *
   * @returns The pool (at least one context).
   * @throws Error("Failed to create any embedding context") when not even one context
   *         could be created. NOTE(review): the underlying error is not propagated,
   *         as before.
   */
  private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;
  private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
    if (this.embedContexts.length > 0) {
      this.touchActivity();
      return this.embedContexts;
    }
    if (this.embedContextsCreatePromise) {
      return await this.embedContextsCreatePromise;
    }

    this.embedContextsCreatePromise = (async () => {
      const model = await this.ensureEmbedModel();
      // Embed contexts are ~143 MB each according to the original sizing note; 150 leaves headroom.
      const n = await this.computeParallelism(150);
      const threads = await this.threadsPerContext(n);

      const created: LlamaEmbeddingContext[] = [];
      for (let i = 0; i < n; i++) {
        try {
          created.push(await model.createEmbeddingContext({
            ...(threads > 0 ? { threads } : {}),
          }));
        } catch {
          if (created.length === 0) throw new Error("Failed to create any embedding context");
          // Keep the contexts created so far.
          break;
        }
      }

      // Publish the complete pool in one step.
      this.embedContexts = created;
      this.touchActivity();
      return created;
    })();

    try {
      return await this.embedContextsCreatePromise;
    } finally {
      this.embedContextsCreatePromise = null;
    }
  }
  /**
   * Return one embedding context for single-text calls: the first entry of the pool.
   */
  private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
    const [first] = await this.ensureEmbedContexts();
    return first!;
  }
  /**
   * Return the generation model, loading it on first use.
   *
   * A caller that arrives while a load is running gets the result of that load directly.
   * Otherwise the activity timer is touched before the model is returned.
   *
   * @throws Error("Generate model not loaded") if the model is still unset after loading.
   */
  private async ensureGenerateModel(): Promise<LlamaModel> {
    if (!this.generateModel) {
      const inFlight = this.generateModelLoadPromise;
      if (inFlight) {
        return await inFlight;
      }

      const load = async (): Promise<LlamaModel> => {
        const llama = await this.ensureLlama();
        const modelPath = await this.resolveModel(this.generateModelUri);
        const model = await llama.loadModel({ modelPath });
        this.generateModel = model;
        return model;
      };

      const pending = load();
      this.generateModelLoadPromise = pending;
      try {
        await pending;
      } finally {
        this.generateModelLoadPromise = null;
      }
    }

    this.touchActivity();
    if (!this.generateModel) {
      throw new Error("Generate model not loaded");
    }
    return this.generateModel;
  }
  /**
   * Return the rerank model, loading it on first use.
   *
   * Callers that arrive while a load is running await that same load instead of
   * starting another one.
   */
  private async ensureRerankModel(): Promise<LlamaModel> {
    const loaded = this.rerankModel;
    if (loaded) {
      return loaded;
    }

    const inFlight = this.rerankModelLoadPromise;
    if (inFlight) {
      return await inFlight;
    }

    const load = async (): Promise<LlamaModel> => {
      const llama = await this.ensureLlama();
      const modelPath = await this.resolveModel(this.rerankModelUri);
      const model = await llama.loadModel({ modelPath });
      this.rerankModel = model;
      // Loading counts as activity.
      this.touchActivity();
      return model;
    };

    const pending = load();
    this.rerankModelLoadPromise = pending;
    try {
      return await pending;
    } finally {
      this.rerankModelLoadPromise = null;
    }
  }
  /**
   * Return the pool of rerank contexts, creating it on first use. Several contexts are
   * created so that ranking can run in parallel.
   *
   * Tuning choices (figures are the original author's measurements, not verifiable here):
   * - contextSize 2048: the Qwen3 reranker template adds ~200 tokens of overhead and chunks
   *   are at most ~800 tokens, so 800 + 200 + query is about 1100 tokens. 2048 leaves a
   *   safety margin and is far below the automatic size (40960).
   * - flashAttention: lower VRAM use per context. The first context is retried without it
   *   in case it is unsupported.
   * - Pool sizing assumes ~1000 MB per context.
   *
   * Creation stops at the first failure and keeps the contexts created so far. If the
   * first context only works without flash attention, the pool is left at that single
   * context.
   *
   * Concurrent callers share one in-flight creation, mirroring ensureEmbedContexts. The
   * pool is published only when complete, so overlapping rerank calls neither see a
   * partial pool nor allocate duplicate contexts.
   *
   * @returns The pool (at least one context).
   * @throws Error("Failed to create any rerank context") when not even one context could
   *         be created.
   */
  private static readonly RERANK_CONTEXT_SIZE = 2048;
  private rerankContextsCreatePromise: Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> | null = null;
  private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
    if (this.rerankContexts.length > 0) {
      this.touchActivity();
      return this.rerankContexts;
    }
    if (this.rerankContextsCreatePromise) {
      return await this.rerankContextsCreatePromise;
    }

    this.rerankContextsCreatePromise = (async () => {
      const model = await this.ensureRerankModel();
      // ~960 MB per context with flash attention at contextSize 2048 (original sizing note)
      const n = await this.computeParallelism(1000);
      const threads = await this.threadsPerContext(n);

      const created: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
      for (let i = 0; i < n; i++) {
        try {
          created.push(await model.createRankingContext({
            contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
            flashAttention: true,
            ...(threads > 0 ? { threads } : {}),
          } as any)); // cast kept from the original: the options type rejects this object
        } catch {
          if (created.length === 0) {
            // Flash attention might not be supported — retry without it
            try {
              created.push(await model.createRankingContext({
                contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
                ...(threads > 0 ? { threads } : {}),
              }));
            } catch {
              throw new Error("Failed to create any rerank context");
            }
          }
          break;
        }
      }

      // Publish the complete pool in one step.
      this.rerankContexts = created;
      this.touchActivity();
      return created;
    })();

    try {
      return await this.rerankContextsCreatePromise;
    } finally {
      this.rerankContextsCreatePromise = null;
    }
  }
  679. // ==========================================================================
  680. // Tokenization
  681. // ==========================================================================
  /**
   * Tokenize text using the embedding model's tokenizer.
   *
   * Only the model is loaded. Earlier this went through ensureEmbedContext, which
   * created the whole pool of embedding contexts just to reach the tokenizer.
   *
   * @param text Text to tokenize.
   * @returns Tokenizer tokens (opaque type from node-llama-cpp).
   */
  async tokenize(text: string): Promise<readonly LlamaToken[]> {
    const model = await this.ensureEmbedModel();
    // Still counts as activity, as it did when this went through the context pool.
    this.touchActivity();
    return model.tokenize(text);
  }
  /**
   * Count how many tokens the embedding model's tokenizer produces for `text`.
   *
   * @param text Text to measure.
   * @returns Number of tokens.
   */
  async countTokens(text: string): Promise<number> {
    const { length } = await this.tokenize(text);
    return length;
  }
  /**
   * Turn tokens back into text using the embedding model's tokenizer.
   *
   * Only the model is loaded. Earlier this went through ensureEmbedContext, which
   * created the whole pool of embedding contexts just to reach the tokenizer.
   *
   * @param tokens Tokens previously produced by the embedding model's tokenizer.
   * @returns The decoded text.
   */
  async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
    const model = await this.ensureEmbedModel();
    // Still counts as activity, as it did when this went through the context pool.
    this.touchActivity();
    return model.detokenize(tokens);
  }
  710. // ==========================================================================
  711. // Core API methods
  712. // ==========================================================================
  /**
   * Embed a single text.
   *
   * Failures are logged and reported as `null` rather than thrown.
   *
   * @param text - Text to embed.
   * @param options - Accepted for interface compatibility; not read by this method.
   * @returns The embedding vector plus the embed model URI, or `null` on any error.
   */
  async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
    // Mark the instance as active before doing any work.
    this.touchActivity();

    try {
      const ctx = await this.ensureEmbedContext();
      const { vector } = await ctx.getEmbeddingFor(text);
      return { embedding: Array.from(vector), model: this.embedModelUri };
    } catch (err) {
      console.error("Embedding error:", err);
      return null;
    }
  }
  /**
   * Embed many texts, returning one entry per input in input order.
   *
   * With a single embed context the texts are processed one after another.
   * With several contexts the texts are cut into contiguous slices, one per
   * context, and the slices are processed concurrently via `Promise.all`;
   * within a slice the texts are still embedded sequentially.
   *
   * @param texts - Texts to embed.
   * @returns For each text, its embedding result or `null` if that text failed.
   *          If setup itself fails, every entry is `null`.
   */
  async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
    // Mark the instance as active before doing any work.
    this.touchActivity();
    if (texts.length === 0) return [];

    try {
      const contexts = await this.ensureEmbedContexts();
      const contextCount = contexts.length;

      // Embeds `items` one by one on `ctx`; a failing item yields null
      // instead of aborting the rest of the slice.
      const embedSequentially = async (
        ctx: (typeof contexts)[number],
        items: string[]
      ): Promise<(EmbeddingResult | null)[]> => {
        const out: (EmbeddingResult | null)[] = [];
        for (const item of items) {
          try {
            const embedding = await ctx.getEmbeddingFor(item);
            this.touchActivity();
            out.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
          } catch (err) {
            console.error("Embedding error for text:", err);
            out.push(null);
          }
        }
        return out;
      };

      if (contextCount === 1) {
        // Only one context: nothing to split.
        return await embedSequentially(contexts[0]!, texts);
      }

      // Several contexts: hand each one a contiguous slice of the input.
      const perContext = Math.ceil(texts.length / contextCount);
      const slices = Array.from({ length: contextCount }, (_, i) =>
        texts.slice(i * perContext, (i + 1) * perContext)
      );
      const perSlice = await Promise.all(
        slices.map((slice, i) => embedSequentially(contexts[i]!, slice))
      );
      return perSlice.flat();
    } catch (error) {
      console.error("Batch embedding error:", error);
      return texts.map(() => null);
    }
  }
  /**
   * Generate text for `prompt` with the generation model.
   *
   * A fresh context, sequence and chat session are created for every call
   * and the context is disposed afterwards, whether the call succeeds or
   * throws.
   *
   * @param prompt - Prompt passed to the chat session.
   * @param options - `maxTokens` (default 150) and `temperature` (default 0.7).
   * @returns The generated text, the generate model URI and `done: true`.
   * @throws Whatever model loading, context creation or prompting throws.
   */
  async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();

    // Ensure model is loaded
    await this.ensureGenerateModel();

    const context = await this.generateModel!.createContext();
    try {
      // Sequence and session are created inside the try so the context is
      // still disposed if either of them throws.
      const sequence = context.getSequence();
      const session = new LlamaChatSession({ contextSequence: sequence });

      const maxTokens = options.maxTokens ?? 150;
      // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
      // DO NOT use greedy decoding (temp=0) - causes repetition loops
      const temperature = options.temperature ?? 0.7;

      let result = "";
      await session.prompt(prompt, {
        maxTokens,
        temperature,
        topK: 20,
        topP: 0.8,
        onTextChunk: (text) => {
          result += text;
        },
      });

      return {
        text: result,
        model: this.generateModelUri,
        done: true,
      };
    } finally {
      // Dispose context (which disposes dependent sequences/sessions per lifecycle rules)
      await context.dispose();
    }
  }
  /**
   * Report whether a model URI can be resolved.
   *
   * URIs starting with `hf:` are assumed to exist without any check; any
   * other value is treated as a local path and checked with `existsSync`.
   *
   * @param modelUri - `hf:` URI or local file path.
   * @returns `name` (the URI as given), `exists`, and for local paths `path`
   *          (the URI when the file exists, otherwise `undefined`).
   */
  async modelExists(modelUri: string): Promise<ModelInfo> {
    const isHuggingFaceUri = modelUri.startsWith("hf:");
    if (isHuggingFaceUri) {
      return { name: modelUri, exists: true };
    }

    if (existsSync(modelUri)) {
      return { name: modelUri, exists: true, path: modelUri };
    }
    return { name: modelUri, exists: false, path: undefined };
  }
  830. // ==========================================================================
  831. // High-level abstractions
  832. // ==========================================================================
  /**
   * Ask the generation model for alternative formulations of a search query.
   *
   * Output is constrained by a grammar to lines of the form `type: text`
   * where `type` is `lex`, `vec` or `hyde`. Lines with an unknown type, or
   * whose text shares no term with the original query, are dropped. If
   * nothing usable remains, or generation fails, a fallback built from the
   * original query is returned instead.
   *
   * A fresh context is created per call and always disposed afterwards.
   *
   * @param query - Original search query.
   * @param options - `includeLexical` (default `true`) controls whether `lex`
   *   entries are kept. `context` is accepted but currently not used.
   * @returns At least one queryable derived from the query.
   * @throws Whatever llama/model loading, grammar creation or context creation
   *   throws; failures after the context exists are logged and turned into
   *   the fallback.
   */
  async expandQuery(query: string, options: { context?: string, includeLexical?: boolean } = {}): Promise<Queryable[]> {
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();

    const llama = await this.ensureLlama();
    await this.ensureGenerateModel();

    const includeLexical = options.includeLexical ?? true;

    const grammar = await llama.createGrammar({
      grammar: `
        root ::= line+
        line ::= type ": " content "\\n"
        type ::= "lex" | "vec" | "hyde"
        content ::= [^\\n]+
      `
    });

    const prompt = `/no_think Expand this search query: ${query}`;

    // Create fresh context for each call
    const genContext = await this.generateModel!.createContext();

    try {
      // Sequence and session are created inside the try so the context is
      // disposed (and the fallback returned) if either of them throws.
      const sequence = genContext.getSequence();
      const session = new LlamaChatSession({ contextSequence: sequence });

      // Qwen3 recommended settings for non-thinking mode:
      // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
      // DO NOT use greedy decoding (temp=0) - causes infinite loops
      const result = await session.prompt(prompt, {
        grammar,
        maxTokens: 600,
        temperature: 0.7,
        topK: 20,
        topP: 0.8,
        repeatPenalty: {
          lastTokens: 64,
          presencePenalty: 0.5,
        },
      });

      const lines = result.trim().split("\n");
      const queryLower = query.toLowerCase();
      const queryTerms = queryLower.replace(/[^a-z0-9\s]/g, " ").split(/\s+/).filter(Boolean);

      // True when `text` contains at least one query term; a query with no
      // extractable terms accepts everything.
      const hasQueryTerm = (text: string): boolean => {
        const lower = text.toLowerCase();
        if (queryTerms.length === 0) return true;
        return queryTerms.some(term => lower.includes(term));
      };

      const queryables: Queryable[] = lines.map(line => {
        const colonIdx = line.indexOf(":");
        if (colonIdx === -1) return null;
        const type = line.slice(0, colonIdx).trim();
        if (type !== 'lex' && type !== 'vec' && type !== 'hyde') return null;
        const text = line.slice(colonIdx + 1).trim();
        if (!hasQueryTerm(text)) return null;
        return { type: type as QueryType, text };
      }).filter((q): q is Queryable => q !== null);

      // Filter out lex entries if not requested
      const filtered = includeLexical ? queryables : queryables.filter(q => q.type !== 'lex');
      if (filtered.length > 0) return filtered;

      // Model produced nothing usable: synthesize entries from the query itself.
      const fallback: Queryable[] = [
        { type: 'hyde', text: `Information about ${query}` },
        { type: 'lex', text: query },
        { type: 'vec', text: query },
      ];
      return includeLexical ? fallback : fallback.filter(q => q.type !== 'lex');
    } catch (error) {
      console.error("Structured query expansion failed:", error);
      // Fallback to original query
      const fallback: Queryable[] = [{ type: 'vec', text: query }];
      if (includeLexical) fallback.unshift({ type: 'lex', text: query });
      return fallback;
    } finally {
      await genContext.dispose();
    }
  }
  /**
   * Score `documents` against `query` with the rerank model and return them
   * ordered from highest to lowest score.
   *
   * Documents are cut into contiguous slices, one per rerank context, and
   * the slices are scored concurrently. Each result keeps the `file` and the
   * original position (`index`) of the document it was computed for, so
   * documents with identical text are reported separately and correctly.
   *
   * @param query - Query to rank against.
   * @param documents - Candidate documents; only `text` is sent to the model.
   * @param options - Accepted for interface compatibility; not read by this method.
   * @returns One result per input document, sorted by descending score, plus
   *   the rerank model URI. An empty input yields an empty result list
   *   without touching the model.
   * @throws Whatever context creation or `rankAll` throws.
   */
  async rerank(
    query: string,
    documents: RerankDocument[],
    options: RerankOptions = {}
  ): Promise<RerankResult> {
    // Ping activity at start to keep models alive during this operation
    this.touchActivity();

    // Nothing to rank: avoid loading the model just to return an empty list.
    if (documents.length === 0) {
      return { results: [], model: this.rerankModelUri };
    }

    const contexts = await this.ensureRerankContexts();

    // Extract just the text for ranking
    const texts = documents.map((doc) => doc.text);

    // Split documents across contexts for parallel evaluation.
    // Each context has its own sequence with a lock, so parallelism comes
    // from multiple contexts evaluating different chunks simultaneously.
    const n = contexts.length;
    const chunkSize = Math.ceil(texts.length / n);
    const chunks = Array.from({ length: n }, (_, i) =>
      texts.slice(i * chunkSize, (i + 1) * chunkSize)
    ).filter(chunk => chunk.length > 0);

    const allScores = await Promise.all(
      chunks.map((chunk, i) => contexts[i]!.rankAll(query, chunk))
    );

    // Chunks are contiguous and in input order, so the flattened scores line
    // up with `documents` position by position.
    const flatScores = allScores.flat();

    // Carry the original index through the sort instead of looking documents
    // up by text afterwards: a text-keyed lookup collapses duplicate texts
    // onto a single file/index.
    const results: RerankDocumentResult[] = documents
      .map((doc, index) => ({
        file: doc.file,
        score: flatScores[index]!,
        index,
      }))
      .sort((a, b) => b.score - a.score);

    return {
      results,
      model: this.rerankModelUri,
    };
  }
  /**
   * Collect device/GPU information for status display.
   *
   * Obtains the llama instance through `ensureLlama()`. VRAM figures are
   * only queried when `llama.gpu` is truthy, and are omitted if that query
   * throws.
   *
   * @returns GPU type (or `false`), whether GPU offloading is supported, GPU
   *   device names, optional VRAM totals, and the CPU math core count.
   */
  async getDeviceInfo(): Promise<{
    gpu: string | false;
    gpuOffloading: boolean;
    gpuDevices: string[];
    vram?: { total: number; used: number; free: number };
    cpuCores: number;
  }> {
    const llama = await this.ensureLlama();
    const gpuDevices = await llama.getGpuDeviceNames();

    // Best-effort VRAM lookup: any failure simply means "no VRAM info".
    const readVram = async (): Promise<{ total: number; used: number; free: number } | undefined> => {
      try {
        const { total, used, free } = await llama.getVramState();
        return { total, used, free };
      } catch {
        return undefined;
      }
    };

    const vram = llama.gpu ? await readVram() : undefined;

    return {
      gpu: llama.gpu,
      gpuOffloading: llama.supportsGpuOffloading,
      gpuDevices,
      vram,
      cpuCores: llama.cpuMathCores,
    };
  }
  /**
   * Release the llama instance and drop every cached model, context and
   * in-flight load promise held by this object.
   *
   * Safe to call more than once; only the first call does any work.
   * `llama.dispose()` is raced against a 1 second timeout because it can
   * hang. The timeout timer is cleared as soon as the race settles so it
   * does not keep the event loop alive after a fast dispose, and cached
   * references are cleared even if `llama.dispose()` rejects.
   *
   * @throws Whatever `llama.dispose()` rejects with, if it rejects before the timeout.
   */
  async dispose(): Promise<void> {
    // Prevent double-dispose
    if (this.disposed) {
      return;
    }
    this.disposed = true;

    // Clear inactivity timer
    if (this.inactivityTimer) {
      clearTimeout(this.inactivityTimer);
      this.inactivityTimer = null;
    }

    try {
      // Disposing llama cascades to models and contexts automatically
      // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
      // Note: llama.dispose() can hang indefinitely, so we use a timeout
      if (this.llama) {
        const disposePromise = this.llama.dispose();
        let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
        const timeoutPromise = new Promise<void>((resolve) => {
          timeoutHandle = setTimeout(resolve, 1000);
        });
        try {
          await Promise.race([disposePromise, timeoutPromise]);
        } finally {
          // Without this the pending timer holds the process open for up to
          // a second after dispose has already finished.
          clearTimeout(timeoutHandle);
        }
      }
    } finally {
      // Clear references
      this.embedContexts = [];
      this.rerankContexts = [];
      this.embedModel = null;
      this.generateModel = null;
      this.rerankModel = null;
      this.llama = null;

      // Clear any in-flight load/create promises
      this.embedModelLoadPromise = null;
      this.embedContextsCreatePromise = null;
      this.generateModelLoadPromise = null;
      this.rerankModelLoadPromise = null;
    }
  }
  1009. }
  1010. // =============================================================================
  1011. // Session Management Layer
  1012. // =============================================================================
  /**
   * Reference-counting bookkeeping for LLM sessions on one LlamaCpp instance.
   *
   * Tracks two counters: sessions currently held and operations currently
   * running. `canUnload()` reports whether both are zero.
   */
  class LLMSessionManager {
    private llm: LlamaCpp;
    private _activeSessionCount = 0;
    private _inFlightOperations = 0;

    /** @param llm - The LlamaCpp instance this manager is bound to. */
    constructor(llm: LlamaCpp) {
      this.llm = llm;
    }

    /** Number of sessions acquired and not yet released. */
    get activeSessionCount(): number {
      return this._activeSessionCount;
    }

    /** Number of operations started and not yet ended. */
    get inFlightOperations(): number {
      return this._inFlightOperations;
    }

    /**
     * Whether nothing is currently using the LLM.
     *
     * @returns `true` only when there are no active sessions and no
     *   in-flight operations.
     */
    canUnload(): boolean {
      const inUse = this._activeSessionCount !== 0 || this._inFlightOperations !== 0;
      return !inUse;
    }

    /** Register one more active session. */
    acquire(): void {
      this._activeSessionCount += 1;
    }

    /** Unregister one active session; the count never drops below zero. */
    release(): void {
      if (this._activeSessionCount > 0) {
        this._activeSessionCount -= 1;
      }
    }

    /** Register one more in-flight operation. */
    operationStart(): void {
      this._inFlightOperations += 1;
    }

    /** Unregister one in-flight operation; the count never drops below zero. */
    operationEnd(): void {
      if (this._inFlightOperations > 0) {
        this._inFlightOperations -= 1;
      }
    }

    /** The LlamaCpp instance this manager was constructed with. */
    getLlamaCpp(): LlamaCpp {
      return this.llm;
    }
  }
  /**
   * Raised when a session is used after it has been released or aborted.
   *
   * The error's `name` is always "SessionReleasedError"; the message
   * defaults to a generic description when none is supplied.
   */
  export class SessionReleasedError extends Error {
    /** @param message - Optional explanation; a generic one is used when omitted. */
    constructor(message: string = "LLM session has been released or aborted") {
      super(message);
      this.name = "SessionReleasedError";
    }
  }
  /**
   * Scoped LLM session.
   *
   * Constructing a session acquires a lease on the manager; `release()`
   * gives it back. Every LLM call made through the session is bracketed by
   * `operationStart()` / `operationEnd()` on the manager and is refused once
   * the session has been released or aborted.
   *
   * A session aborts itself when the caller-supplied signal aborts, when the
   * maximum duration elapses, or when it is released.
   */
  class LLMSession implements ILLMSession {
    private manager: LLMSessionManager;
    private released = false;
    private abortController: AbortController;
    private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
    private name: string;
    /** Caller-supplied signal this session is currently subscribed to, if any. */
    private externalSignal: AbortSignal | null = null;
    /** Forwards an abort of the caller-supplied signal to this session. */
    private readonly onExternalAbort = (): void => {
      this.abortController.abort(this.externalSignal?.reason);
    };

    /**
     * @param manager - Manager whose counters this session updates.
     * @param options - `name` (default "unnamed"), `signal` (external abort
     *   signal to follow) and `maxDuration` in ms (default 10 minutes; a
     *   value of 0 or less disables the timer).
     */
    constructor(manager: LLMSessionManager, options: LLMSessionOptions = {}) {
      this.manager = manager;
      this.name = options.name || "unnamed";
      this.abortController = new AbortController();

      // Link external abort signal if provided
      const external = options.signal;
      if (external) {
        if (external.aborted) {
          this.abortController.abort(external.reason);
        } else {
          // Keep a handle on the signal so release() can unsubscribe;
          // otherwise a long-lived signal accumulates one listener (and one
          // retained session) per session ever created with it.
          this.externalSignal = external;
          external.addEventListener("abort", this.onExternalAbort, { once: true });
        }
      }

      // Set up max duration timer
      const maxDuration = options.maxDuration ?? 10 * 60 * 1000; // Default 10 minutes
      if (maxDuration > 0) {
        this.maxDurationTimer = setTimeout(() => {
          this.abortController.abort(new Error(`Session "${this.name}" exceeded max duration of ${maxDuration}ms`));
        }, maxDuration);
        this.maxDurationTimer.unref(); // Don't keep process alive
      }

      // Acquire session lease
      this.manager.acquire();
    }

    /** `true` while the session is neither released nor aborted. */
    get isValid(): boolean {
      return !this.released && !this.abortController.signal.aborted;
    }

    /** Signal that aborts when this session is aborted or released. */
    get signal(): AbortSignal {
      return this.abortController.signal;
    }

    /**
     * Release the session and decrement ref count.
     * Called automatically by withLLMSession when the callback completes.
     * Calling it again is a no-op.
     */
    release(): void {
      if (this.released) return;
      this.released = true;

      if (this.maxDurationTimer) {
        clearTimeout(this.maxDurationTimer);
        this.maxDurationTimer = null;
      }

      // Stop following the caller's signal so it no longer references us.
      if (this.externalSignal) {
        this.externalSignal.removeEventListener("abort", this.onExternalAbort);
        this.externalSignal = null;
      }

      this.abortController.abort(new Error("Session released"));
      this.manager.release();
    }

    /**
     * Run `fn` as a tracked operation.
     *
     * @throws SessionReleasedError if the session was released (generic
     *   message) or aborted (message taken from the abort reason when it has
     *   one, otherwise "Session aborted"). `fn` is not invoked in that case.
     */
    private async withOperation<T>(fn: () => Promise<T>): Promise<T> {
      if (this.released) {
        throw new SessionReleasedError();
      }
      const signal = this.abortController.signal;
      if (signal.aborted) {
        // Surface why the session was aborted (external signal, max duration).
        throw new SessionReleasedError(signal.reason?.message || "Session aborted");
      }

      this.manager.operationStart();
      try {
        return await fn();
      } finally {
        this.manager.operationEnd();
      }
    }

    /** Tracked wrapper around `LlamaCpp.embed`. */
    async embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null> {
      return this.withOperation(() => this.manager.getLlamaCpp().embed(text, options));
    }

    /** Tracked wrapper around `LlamaCpp.embedBatch`. */
    async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
      return this.withOperation(() => this.manager.getLlamaCpp().embedBatch(texts));
    }

    /** Tracked wrapper around `LlamaCpp.expandQuery`. */
    async expandQuery(
      query: string,
      options?: { context?: string; includeLexical?: boolean }
    ): Promise<Queryable[]> {
      return this.withOperation(() => this.manager.getLlamaCpp().expandQuery(query, options));
    }

    /** Tracked wrapper around `LlamaCpp.rerank`. */
    async rerank(
      query: string,
      documents: RerankDocument[],
      options?: RerankOptions
    ): Promise<RerankResult> {
      return this.withOperation(() => this.manager.getLlamaCpp().rerank(query, documents, options));
    }
  }
  // Cached session manager bound to the current default LlamaCpp instance.
  let defaultSessionManager: LLMSessionManager | null = null;

  /**
   * Return the session manager that belongs to the default LlamaCpp instance.
   *
   * The cached manager is reused as long as it wraps the instance currently
   * returned by `getDefaultLlamaCpp()`; otherwise a new manager is created
   * for that instance and cached.
   */
  function getSessionManager(): LLMSessionManager {
    const current = getDefaultLlamaCpp();
    const cached = defaultSessionManager;
    if (cached && cached.getLlamaCpp() === current) {
      return cached;
    }

    const fresh = new LLMSessionManager(current);
    defaultSessionManager = fresh;
    return fresh;
  }
  /**
   * Run `fn` inside a scoped LLM session.
   *
   * A session is created on the default session manager before `fn` is
   * called and is released when `fn` settles, whether it resolves or throws.
   *
   * @param fn - Callback receiving the session; its result is returned.
   * @param options - Session options (e.g. `maxDuration`, `name`, `signal`).
   * @returns Whatever `fn` resolves to.
   * @throws Whatever `fn` throws or rejects with.
   *
   * @example
   * ```typescript
   * await withLLMSession(async (session) => {
   *   const expanded = await session.expandQuery(query);
   *   const embeddings = await session.embedBatch(texts);
   *   const reranked = await session.rerank(query, docs);
   *   return reranked;
   * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
   * ```
   */
  export async function withLLMSession<T>(
    fn: (session: ILLMSession) => Promise<T>,
    options?: LLMSessionOptions
  ): Promise<T> {
    const session = new LLMSession(getSessionManager(), options);

    try {
      const outcome = await fn(session);
      return outcome;
    } finally {
      // Always hand the lease back, even when fn failed.
      session.release();
    }
  }
  /**
   * Whether the default LLM may be unloaded right now.
   *
   * @returns `true` when no session manager has been created yet, otherwise
   *   the manager's own `canUnload()` answer.
   */
  export function canUnloadLLM(): boolean {
    return defaultSessionManager ? defaultSessionManager.canUnload() : true;
  }
  1203. // =============================================================================
  1204. // Singleton for default LlamaCpp instance
  1205. // =============================================================================
  // Lazily created process-wide LlamaCpp instance.
  let defaultLlamaCpp: LlamaCpp | null = null;

  /**
   * Return the default LlamaCpp instance, constructing it on first use.
   */
  export function getDefaultLlamaCpp(): LlamaCpp {
    const existing = defaultLlamaCpp;
    if (existing) {
      return existing;
    }

    const created = new LlamaCpp();
    defaultLlamaCpp = created;
    return created;
  }
  /**
   * Replace the default LlamaCpp instance (useful for testing).
   *
   * The previous instance, if any, is only dereferenced here; it is not
   * disposed by this function.
   *
   * @param llm - Instance to use from now on, or `null` to clear the default.
   */
  export function setDefaultLlamaCpp(llm: LlamaCpp | null): void {
    defaultLlamaCpp = llm;
  }
  /**
   * Dispose the default LlamaCpp instance, if one exists, and clear it.
   * Call this before process exit to prevent NAPI crashes.
   *
   * Does nothing when no default instance has been created or set.
   */
  export async function disposeDefaultLlamaCpp(): Promise<void> {
    const instance = defaultLlamaCpp;
    if (!instance) {
      return;
    }

    await instance.dispose();
    defaultLlamaCpp = null;
  }