llm.ts 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824
  1. /**
  2. * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
  3. *
  4. * Provides embeddings, text generation, and reranking using local GGUF models.
  5. */
  6. import {
  7. getLlama,
  8. resolveModelFile,
  9. LlamaChatSession,
  10. LlamaLogLevel,
  11. type Llama,
  12. type LlamaModel,
  13. type LlamaEmbeddingContext,
  14. type Token as LlamaToken,
  15. } from "node-llama-cpp";
  16. import { homedir } from "os";
  17. import { join } from "path";
  18. import { existsSync, mkdirSync } from "fs";
  19. // =============================================================================
  20. // Embedding Formatting Functions
  21. // =============================================================================
  22. /**
  23. * Format a query for embedding.
  24. * Uses nomic-style task prefix format for embeddinggemma.
  25. */
  26. export function formatQueryForEmbedding(query: string): string {
  27. return `task: search result | query: ${query}`;
  28. }
  29. /**
  30. * Format a document for embedding.
  31. * Uses nomic-style format with title and text fields.
  32. */
  33. export function formatDocForEmbedding(text: string, title?: string): string {
  34. return `title: ${title || "none"} | text: ${text}`;
  35. }
  36. // =============================================================================
  37. // Types
  38. // =============================================================================
  39. /**
  40. * Token with log probability
  41. */
  42. export type TokenLogProb = {
  43. token: string;
  44. logprob: number;
  45. };
  46. /**
  47. * Embedding result
  48. */
  49. export type EmbeddingResult = {
  50. embedding: number[];
  51. model: string;
  52. };
  53. /**
  54. * Generation result with optional logprobs
  55. */
  56. export type GenerateResult = {
  57. text: string;
  58. model: string;
  59. logprobs?: TokenLogProb[];
  60. done: boolean;
  61. };
  62. /**
  63. * Rerank result for a single document
  64. */
  65. export type RerankDocumentResult = {
  66. file: string;
  67. score: number;
  68. index: number;
  69. };
  70. /**
  71. * Batch rerank result
  72. */
  73. export type RerankResult = {
  74. results: RerankDocumentResult[];
  75. model: string;
  76. };
  77. /**
  78. * Model info
  79. */
  80. export type ModelInfo = {
  81. name: string;
  82. exists: boolean;
  83. path?: string;
  84. };
  85. /**
  86. * Options for embedding
  87. */
  88. export type EmbedOptions = {
  89. model?: string;
  90. isQuery?: boolean;
  91. title?: string;
  92. };
  93. /**
  94. * Options for text generation
  95. */
  96. export type GenerateOptions = {
  97. model?: string;
  98. maxTokens?: number;
  99. temperature?: number;
  100. };
  101. /**
  102. * Options for reranking
  103. */
  104. export type RerankOptions = {
  105. model?: string;
  106. };
  107. /**
  108. * Supported query types for different search backends
  109. */
  110. export type QueryType = 'lex' | 'vec' | 'hyde';
  111. /**
  112. * A single query and its target backend type
  113. */
  114. export type Queryable = {
  115. type: QueryType;
  116. text: string;
  117. };
  118. /**
  119. * Document to rerank
  120. */
  121. export type RerankDocument = {
  122. file: string;
  123. text: string;
  124. title?: string;
  125. };
  126. // =============================================================================
  127. // Model Configuration
  128. // =============================================================================
  129. // HuggingFace model URIs for node-llama-cpp
  130. // Format: hf:<user>/<repo>/<file>
  131. const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
  132. const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  133. // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
  134. const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-1.7B-GGUF/Qwen3-1.7b-q8_0.gguf";
  135. // Local model cache directory
  136. const MODEL_CACHE_DIR = join(homedir(), ".cache", "qmd", "models");
  137. // =============================================================================
  138. // LLM Interface
  139. // =============================================================================
  140. /**
  141. * Abstract LLM interface - implement this for different backends
  142. */
  143. export interface LLM {
  144. /**
  145. * Get embeddings for text
  146. */
  147. embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
  148. /**
  149. * Generate text completion
  150. */
  151. generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
  152. /**
  153. * Check if a model exists/is available
  154. */
  155. modelExists(model: string): Promise<ModelInfo>;
  156. /**
  157. * Expand a search query into multiple variations for different backends.
  158. * Returns a list of Queryable objects.
  159. */
  160. expandQuery(query: string, options?: { context?: string, includeLexical?: boolean }): Promise<Queryable[]>;
  161. /**
  162. * Rerank documents by relevance to a query
  163. * Returns list of documents with relevance scores (higher = more relevant)
  164. */
  165. rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
  166. /**
  167. * Dispose of resources
  168. */
  169. dispose(): Promise<void>;
  170. }
  171. // =============================================================================
  172. // node-llama-cpp Implementation
  173. // =============================================================================
  174. export type LlamaCppConfig = {
  175. embedModel?: string;
  176. generateModel?: string;
  177. rerankModel?: string;
  178. modelCacheDir?: string;
  179. /**
  180. * Inactivity timeout in ms before unloading contexts (default: 2 minutes, 0 to disable).
  181. *
  182. * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
  183. * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
  184. * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
  185. */
  186. inactivityTimeoutMs?: number;
  187. /**
  188. * Whether to dispose models on inactivity (default: false).
  189. *
  190. * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
  191. * memory reclaim.
  192. */
  193. disposeModelsOnInactivity?: boolean;
  194. };
  195. /**
  196. * LLM implementation using node-llama-cpp
  197. */
  198. // Default inactivity timeout: 2 minutes
  199. const DEFAULT_INACTIVITY_TIMEOUT_MS = 2 * 60 * 1000;
  200. export class LlamaCpp implements LLM {
  201. private llama: Llama | null = null;
  202. private embedModel: LlamaModel | null = null;
  203. private embedContext: LlamaEmbeddingContext | null = null;
  204. private generateModel: LlamaModel | null = null;
  205. private rerankModel: LlamaModel | null = null;
  206. private rerankContext: Awaited<ReturnType<LlamaModel["createRankingContext"]>> | null = null;
  207. private embedModelUri: string;
  208. private generateModelUri: string;
  209. private rerankModelUri: string;
  210. private modelCacheDir: string;
  211. // Ensure we don't load the same model concurrently (which can allocate duplicate VRAM).
  212. private embedModelLoadPromise: Promise<LlamaModel> | null = null;
  213. private generateModelLoadPromise: Promise<LlamaModel> | null = null;
  214. private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
  215. // Inactivity timer for auto-unloading models
  216. private inactivityTimer: ReturnType<typeof setTimeout> | null = null;
  217. private inactivityTimeoutMs: number;
  218. private disposeModelsOnInactivity: boolean;
  219. // Track disposal state to prevent double-dispose
  220. private disposed = false;
  221. constructor(config: LlamaCppConfig = {}) {
  222. this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
  223. this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
  224. this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
  225. this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
  226. this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
  227. this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
  228. }
  229. /**
  230. * Reset the inactivity timer. Called after each model operation.
  231. * When timer fires, models are unloaded to free memory.
  232. */
  233. private touchActivity(): void {
  234. // Clear existing timer
  235. if (this.inactivityTimer) {
  236. clearTimeout(this.inactivityTimer);
  237. this.inactivityTimer = null;
  238. }
  239. // Only set timer if we have disposable contexts and timeout is enabled
  240. if (this.inactivityTimeoutMs > 0 && this.hasLoadedContexts()) {
  241. this.inactivityTimer = setTimeout(() => {
  242. this.unloadIdleResources().catch(err => {
  243. console.error("Error unloading idle resources:", err);
  244. });
  245. }, this.inactivityTimeoutMs);
  246. // Don't keep process alive just for this timer
  247. this.inactivityTimer.unref();
  248. }
  249. }
  250. /**
  251. * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
  252. */
  253. private hasLoadedContexts(): boolean {
  254. return !!(this.embedContext || this.rerankContext);
  255. }
  256. /**
  257. * Unload idle resources but keep the instance alive for future use.
  258. *
  259. * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
  260. * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
  261. */
  262. async unloadIdleResources(): Promise<void> {
  263. // Don't unload if already disposed
  264. if (this.disposed) {
  265. return;
  266. }
  267. // Clear timer
  268. if (this.inactivityTimer) {
  269. clearTimeout(this.inactivityTimer);
  270. this.inactivityTimer = null;
  271. }
  272. // Dispose contexts first
  273. if (this.embedContext) {
  274. await this.embedContext.dispose();
  275. this.embedContext = null;
  276. }
  277. if (this.rerankContext) {
  278. await this.rerankContext.dispose();
  279. this.rerankContext = null;
  280. }
  281. // Optionally dispose models too (opt-in)
  282. if (this.disposeModelsOnInactivity) {
  283. if (this.embedModel) {
  284. await this.embedModel.dispose();
  285. this.embedModel = null;
  286. }
  287. if (this.generateModel) {
  288. await this.generateModel.dispose();
  289. this.generateModel = null;
  290. }
  291. if (this.rerankModel) {
  292. await this.rerankModel.dispose();
  293. this.rerankModel = null;
  294. }
  295. // Reset load promises so models can be reloaded later
  296. this.embedModelLoadPromise = null;
  297. this.generateModelLoadPromise = null;
  298. this.rerankModelLoadPromise = null;
  299. }
  300. // Note: We keep llama instance alive - it's lightweight
  301. }
  302. /**
  303. * Ensure model cache directory exists
  304. */
  305. private ensureModelCacheDir(): void {
  306. if (!existsSync(this.modelCacheDir)) {
  307. mkdirSync(this.modelCacheDir, { recursive: true });
  308. }
  309. }
  310. /**
  311. * Initialize the llama instance (lazy)
  312. */
  313. private async ensureLlama(): Promise<Llama> {
  314. if (!this.llama) {
  315. this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
  316. }
  317. return this.llama;
  318. }
  319. /**
  320. * Resolve a model URI to a local path, downloading if needed
  321. */
  322. private async resolveModel(modelUri: string): Promise<string> {
  323. this.ensureModelCacheDir();
  324. // resolveModelFile handles HF URIs and downloads to the cache dir
  325. return await resolveModelFile(modelUri, this.modelCacheDir);
  326. }
  327. /**
  328. * Load embedding model (lazy)
  329. */
  330. private async ensureEmbedModel(): Promise<LlamaModel> {
  331. if (this.embedModel) {
  332. return this.embedModel;
  333. }
  334. if (this.embedModelLoadPromise) {
  335. return await this.embedModelLoadPromise;
  336. }
  337. this.embedModelLoadPromise = (async () => {
  338. const llama = await this.ensureLlama();
  339. const modelPath = await this.resolveModel(this.embedModelUri);
  340. const model = await llama.loadModel({ modelPath });
  341. this.embedModel = model;
  342. return model;
  343. })();
  344. try {
  345. return await this.embedModelLoadPromise;
  346. } finally {
  347. // Keep the resolved model cached; clear only the in-flight promise.
  348. this.embedModelLoadPromise = null;
  349. }
  350. }
  351. /**
  352. * Load embedding context (lazy). Context can be disposed and recreated without reloading the model.
  353. */
  354. private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
  355. if (!this.embedContext) {
  356. const model = await this.ensureEmbedModel();
  357. this.embedContext = await model.createEmbeddingContext();
  358. }
  359. this.touchActivity();
  360. return this.embedContext;
  361. }
  362. /**
  363. * Load generation model (lazy) - context is created fresh per call
  364. */
  365. private async ensureGenerateModel(): Promise<LlamaModel> {
  366. if (!this.generateModel) {
  367. if (this.generateModelLoadPromise) {
  368. return await this.generateModelLoadPromise;
  369. }
  370. this.generateModelLoadPromise = (async () => {
  371. const llama = await this.ensureLlama();
  372. const modelPath = await this.resolveModel(this.generateModelUri);
  373. const model = await llama.loadModel({ modelPath });
  374. this.generateModel = model;
  375. return model;
  376. })();
  377. try {
  378. await this.generateModelLoadPromise;
  379. } finally {
  380. this.generateModelLoadPromise = null;
  381. }
  382. }
  383. this.touchActivity();
  384. if (!this.generateModel) {
  385. throw new Error("Generate model not loaded");
  386. }
  387. return this.generateModel;
  388. }
  389. /**
  390. * Load rerank model (lazy)
  391. */
  392. private async ensureRerankModel(): Promise<LlamaModel> {
  393. if (this.rerankModel) {
  394. return this.rerankModel;
  395. }
  396. if (this.rerankModelLoadPromise) {
  397. return await this.rerankModelLoadPromise;
  398. }
  399. this.rerankModelLoadPromise = (async () => {
  400. const llama = await this.ensureLlama();
  401. const modelPath = await this.resolveModel(this.rerankModelUri);
  402. const model = await llama.loadModel({ modelPath });
  403. this.rerankModel = model;
  404. return model;
  405. })();
  406. try {
  407. return await this.rerankModelLoadPromise;
  408. } finally {
  409. this.rerankModelLoadPromise = null;
  410. }
  411. }
  412. /**
  413. * Load rerank context (lazy). Context can be disposed and recreated without reloading the model.
  414. */
  415. private async ensureRerankContext(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>> {
  416. if (!this.rerankContext) {
  417. const model = await this.ensureRerankModel();
  418. this.rerankContext = await model.createRankingContext();
  419. }
  420. this.touchActivity();
  421. return this.rerankContext;
  422. }
  423. // ==========================================================================
  424. // Tokenization
  425. // ==========================================================================
  426. /**
  427. * Tokenize text using the embedding model's tokenizer
  428. * Returns tokenizer tokens (opaque type from node-llama-cpp)
  429. */
  430. async tokenize(text: string): Promise<readonly LlamaToken[]> {
  431. await this.ensureEmbedContext(); // Ensure model is loaded
  432. if (!this.embedModel) {
  433. throw new Error("Embed model not loaded");
  434. }
  435. return this.embedModel.tokenize(text);
  436. }
  437. /**
  438. * Count tokens in text using the embedding model's tokenizer
  439. */
  440. async countTokens(text: string): Promise<number> {
  441. const tokens = await this.tokenize(text);
  442. return tokens.length;
  443. }
  444. /**
  445. * Detokenize token IDs back to text
  446. */
  447. async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
  448. await this.ensureEmbedContext();
  449. if (!this.embedModel) {
  450. throw new Error("Embed model not loaded");
  451. }
  452. return this.embedModel.detokenize(tokens);
  453. }
  454. // ==========================================================================
  455. // Core API methods
  456. // ==========================================================================
  457. async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
  458. try {
  459. const context = await this.ensureEmbedContext();
  460. const embedding = await context.getEmbeddingFor(text);
  461. return {
  462. embedding: Array.from(embedding.vector),
  463. model: this.embedModelUri,
  464. };
  465. } catch (error) {
  466. console.error("Embedding error:", error);
  467. return null;
  468. }
  469. }
  470. /**
  471. * Batch embed multiple texts efficiently
  472. * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
  473. */
  474. async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
  475. if (texts.length === 0) return [];
  476. try {
  477. const context = await this.ensureEmbedContext();
  478. // node-llama-cpp handles batching internally when we make parallel requests
  479. const embeddings = await Promise.all(
  480. texts.map(async (text) => {
  481. try {
  482. const embedding = await context.getEmbeddingFor(text);
  483. return {
  484. embedding: Array.from(embedding.vector),
  485. model: this.embedModelUri,
  486. };
  487. } catch (err) {
  488. console.error("Embedding error for text:", err);
  489. return null;
  490. }
  491. })
  492. );
  493. return embeddings;
  494. } catch (error) {
  495. console.error("Batch embedding error:", error);
  496. return texts.map(() => null);
  497. }
  498. }
  499. async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
  500. // Ensure model is loaded
  501. await this.ensureGenerateModel();
  502. // Create fresh context -> sequence -> session for each call
  503. const context = await this.generateModel!.createContext();
  504. const sequence = context.getSequence();
  505. const session = new LlamaChatSession({ contextSequence: sequence });
  506. const maxTokens = options.maxTokens ?? 150;
  507. const temperature = options.temperature ?? 0;
  508. let result = "";
  509. try {
  510. await session.prompt(prompt, {
  511. maxTokens,
  512. temperature,
  513. onTextChunk: (text) => {
  514. result += text;
  515. },
  516. });
  517. return {
  518. text: result,
  519. model: this.generateModelUri,
  520. done: true,
  521. };
  522. } finally {
  523. // Dispose context (which disposes dependent sequences/sessions per lifecycle rules)
  524. await context.dispose();
  525. }
  526. }
  527. async modelExists(modelUri: string): Promise<ModelInfo> {
  528. // For HuggingFace URIs, we assume they exist
  529. // For local paths, check if file exists
  530. if (modelUri.startsWith("hf:")) {
  531. return { name: modelUri, exists: true };
  532. }
  533. const exists = existsSync(modelUri);
  534. return {
  535. name: modelUri,
  536. exists,
  537. path: exists ? modelUri : undefined,
  538. };
  539. }
  540. // ==========================================================================
  541. // High-level abstractions
  542. // ==========================================================================
  543. async expandQuery(query: string, options: { context?: string, includeLexical?: boolean } = {}): Promise<Queryable[]> {
  544. const llama = await this.ensureLlama();
  545. await this.ensureGenerateModel();
  546. const includeLexical = options.includeLexical ?? true;
  547. const context = options.context;
  548. const grammar = await llama.createGrammar({
  549. grammar: `
  550. root ::= line+
  551. line ::= type ": " content "\\n"
  552. type ::= "lex" | "vec" | "hyde"
  553. content ::= [^\\n]+
  554. `
  555. });
  556. const prompt = `You are a search query optimization expert. Your task is to improve retrieval by rewriting queries and generating hypothetical documents.
  557. Original Query: ${query}
  558. ${context ? `Additional Context, ONLY USE IF RELEVANT:\n\n<context>${context}</context>` : ""}
  559. ## Step 1: Query Analysis
  560. Identify entities, search intent, and missing context.
  561. ## Step 2: Generate Hypothetical Document
  562. Write a focused sentence passage that would answer the query. Include specific terminology and domain vocabulary.
  563. ## Step 3: Query Rewrites
  564. Generate 2-3 alternative search queries that resolve ambiguities. Use terminology from the hypothetical document.
  565. ## Step 4: Final Retrieval Text
  566. Output exactly 1-3 'lex' lines, 1-3 'vec' lines, and MAX ONE 'hyde' line.
  567. <format>
  568. lex: {single search term}
  569. vec: {single vector query}
  570. hyde: {complete hypothetical document passage from Step 2 on a SINGLE LINE}
  571. </format>
  572. <example>
  573. Example (FOR FORMAT ONLY - DO NOT COPY THIS CONTENT):
  574. lex: example keyword 1
  575. lex: example keyword 2
  576. vec: example semantic query
  577. hyde: This is an example of a hypothetical document passage that would answer the example query. It contains multiple sentences and relevant vocabulary.
  578. </example>
  579. <rules>
  580. - DO NOT repeat the same line.
  581. - Each 'lex:' line MUST be a different keyword variation based on the ORIGINAL QUERY.
  582. - Each 'vec:' line MUST be a different semantic variation based on the ORIGINAL QUERY.
  583. - The 'hyde:' line MUST be the full sentence passage from Step 2, but all on one line.
  584. - DO NOT use the example content above.
  585. ${!includeLexical ? "- Do NOT output any 'lex:' lines" : ""}
  586. </rules>
  587. Final Output:`;
  588. // Create fresh context for each call
  589. const genContext = await this.generateModel!.createContext();
  590. const sequence = genContext.getSequence();
  591. const session = new LlamaChatSession({ contextSequence: sequence });
  592. try {
  593. const result = await session.prompt(prompt, {
  594. grammar,
  595. maxTokens: 1000,
  596. temperature: 1,
  597. });
  598. const lines = result.trim().split("\n");
  599. const queryables: Queryable[] = lines.map(line => {
  600. const colonIdx = line.indexOf(":");
  601. if (colonIdx === -1) return null;
  602. const type = line.slice(0, colonIdx).trim();
  603. if (type !== 'lex' && type !== 'vec' && type !== 'hyde') return null;
  604. const text = line.slice(colonIdx + 1).trim();
  605. return { type: type as QueryType, text };
  606. }).filter((q): q is Queryable => q !== null);
  607. // Filter out lex entries if not requested
  608. if (!includeLexical) {
  609. return queryables.filter(q => q.type !== 'lex');
  610. }
  611. return queryables;
  612. } catch (error) {
  613. console.error("Structured query expansion failed:", error);
  614. // Fallback to original query
  615. const fallback: Queryable[] = [{ type: 'vec', text: query }];
  616. if (includeLexical) fallback.unshift({ type: 'lex', text: query });
  617. return fallback;
  618. } finally {
  619. await genContext.dispose();
  620. }
  621. }
  622. async rerank(
  623. query: string,
  624. documents: RerankDocument[],
  625. options: RerankOptions = {}
  626. ): Promise<RerankResult> {
  627. const context = await this.ensureRerankContext();
  628. // Build a map from document text to original indices (for lookup after sorting)
  629. const textToDoc = new Map<string, { file: string; index: number }>();
  630. documents.forEach((doc, index) => {
  631. textToDoc.set(doc.text, { file: doc.file, index });
  632. });
  633. // Extract just the text for ranking
  634. const texts = documents.map((doc) => doc.text);
  635. // Use the proper ranking API - returns [{document: string, score: number}] sorted by score
  636. const ranked = await context.rankAndSort(query, texts);
  637. // Map back to our result format using the text-to-doc map
  638. const results: RerankDocumentResult[] = ranked.map((item) => {
  639. const docInfo = textToDoc.get(item.document)!;
  640. return {
  641. file: docInfo.file,
  642. score: item.score,
  643. index: docInfo.index,
  644. };
  645. });
  646. return {
  647. results,
  648. model: this.rerankModelUri,
  649. };
  650. }
  651. async dispose(): Promise<void> {
  652. // Prevent double-dispose
  653. if (this.disposed) {
  654. return;
  655. }
  656. this.disposed = true;
  657. // Clear inactivity timer
  658. if (this.inactivityTimer) {
  659. clearTimeout(this.inactivityTimer);
  660. this.inactivityTimer = null;
  661. }
  662. // Disposing llama cascades to models and contexts automatically
  663. // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
  664. // Note: llama.dispose() can hang indefinitely, so we use a timeout
  665. if (this.llama) {
  666. const disposePromise = this.llama.dispose();
  667. const timeoutPromise = new Promise<void>((resolve) => setTimeout(resolve, 1000));
  668. await Promise.race([disposePromise, timeoutPromise]);
  669. }
  670. // Clear references
  671. this.embedContext = null;
  672. this.rerankContext = null;
  673. this.embedModel = null;
  674. this.generateModel = null;
  675. this.rerankModel = null;
  676. this.llama = null;
  677. // Clear any in-flight load promises
  678. this.embedModelLoadPromise = null;
  679. this.generateModelLoadPromise = null;
  680. this.rerankModelLoadPromise = null;
  681. }
  682. }
  683. // =============================================================================
  684. // Singleton for default LlamaCpp instance
  685. // =============================================================================
  686. let defaultLlamaCpp: LlamaCpp | null = null;
  687. /**
  688. * Get the default LlamaCpp instance (creates one if needed)
  689. */
  690. export function getDefaultLlamaCpp(): LlamaCpp {
  691. if (!defaultLlamaCpp) {
  692. defaultLlamaCpp = new LlamaCpp();
  693. }
  694. return defaultLlamaCpp;
  695. }
  696. /**
  697. * Set a custom default LlamaCpp instance (useful for testing)
  698. */
  699. export function setDefaultLlamaCpp(llm: LlamaCpp | null): void {
  700. defaultLlamaCpp = llm;
  701. }
  702. /**
  703. * Dispose the default LlamaCpp instance if it exists.
  704. * Call this before process exit to prevent NAPI crashes.
  705. */
  706. export async function disposeDefaultLlamaCpp(): Promise<void> {
  707. if (defaultLlamaCpp) {
  708. await defaultLlamaCpp.dispose();
  709. defaultLlamaCpp = null;
  710. }
  711. }