llm.test.ts 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869
  1. /**
  2. * llm.test.ts - Unit tests for the LLM abstraction layer (node-llama-cpp)
  3. *
  4. * Run with: bun test src/llm.test.ts
  5. *
  6. * These tests require the actual models to be downloaded. Run the embed or
  7. * rerank functions first to trigger model downloads.
  8. */
  9. import { describe, test, expect, beforeAll, afterAll, vi } from "vitest";
  10. import {
  11. LlamaCpp,
  12. getDefaultLlamaCpp,
  13. disposeDefaultLlamaCpp,
  14. withLLMSession,
  15. canUnloadLLM,
  16. SessionReleasedError,
  17. isLocalLlmDisabled,
  18. resolveLlamaGpuMode,
  19. type RerankDocument,
  20. type ILLMSession,
  21. } from "../src/llm.js";
  22. // =============================================================================
  23. // Singleton Tests (no model loading required)
  24. // =============================================================================
  25. describe("Default LlamaCpp Singleton", () => {
  26. // Test singleton behavior without resetting to avoid orphan instances
  27. test("getDefaultLlamaCpp returns same instance on subsequent calls", () => {
  28. const llm1 = getDefaultLlamaCpp();
  29. const llm2 = getDefaultLlamaCpp();
  30. expect(llm1).toBe(llm2);
  31. expect(llm1).toBeInstanceOf(LlamaCpp);
  32. });
  33. });
  34. // =============================================================================
  35. // Model Existence Tests
  36. // =============================================================================
  37. describe("LlamaCpp.modelExists", () => {
  38. test("returns exists:true for HuggingFace model URIs", async () => {
  39. const llm = getDefaultLlamaCpp();
  40. const result = await llm.modelExists("hf:org/repo/model.gguf");
  41. expect(result.exists).toBe(true);
  42. expect(result.name).toBe("hf:org/repo/model.gguf");
  43. });
  44. test("returns exists:false for non-existent local paths", async () => {
  45. const llm = getDefaultLlamaCpp();
  46. const result = await llm.modelExists("/nonexistent/path/model.gguf");
  47. expect(result.exists).toBe(false);
  48. expect(result.name).toBe("/nonexistent/path/model.gguf");
  49. });
  50. });
  51. describe("LlamaCpp expand context size config", () => {
  52. const defaultExpandContextSize = 2048;
  53. test("uses default expand context size when no config or env is set", () => {
  54. const prev = process.env.QMD_EXPAND_CONTEXT_SIZE;
  55. delete process.env.QMD_EXPAND_CONTEXT_SIZE;
  56. try {
  57. const llm = new LlamaCpp({}) as any;
  58. expect(llm.expandContextSize).toBe(defaultExpandContextSize);
  59. } finally {
  60. if (prev === undefined) delete process.env.QMD_EXPAND_CONTEXT_SIZE;
  61. else process.env.QMD_EXPAND_CONTEXT_SIZE = prev;
  62. }
  63. });
  64. test("uses QMD_EXPAND_CONTEXT_SIZE when set to a positive integer", () => {
  65. const prev = process.env.QMD_EXPAND_CONTEXT_SIZE;
  66. process.env.QMD_EXPAND_CONTEXT_SIZE = "3072";
  67. try {
  68. const llm = new LlamaCpp({}) as any;
  69. expect(llm.expandContextSize).toBe(3072);
  70. } finally {
  71. if (prev === undefined) delete process.env.QMD_EXPAND_CONTEXT_SIZE;
  72. else process.env.QMD_EXPAND_CONTEXT_SIZE = prev;
  73. }
  74. });
  75. test("config value overrides QMD_EXPAND_CONTEXT_SIZE", () => {
  76. const prev = process.env.QMD_EXPAND_CONTEXT_SIZE;
  77. process.env.QMD_EXPAND_CONTEXT_SIZE = "4096";
  78. try {
  79. const llm = new LlamaCpp({ expandContextSize: 1536 }) as any;
  80. expect(llm.expandContextSize).toBe(1536);
  81. } finally {
  82. if (prev === undefined) delete process.env.QMD_EXPAND_CONTEXT_SIZE;
  83. else process.env.QMD_EXPAND_CONTEXT_SIZE = prev;
  84. }
  85. });
  86. test("falls back to default and warns when QMD_EXPAND_CONTEXT_SIZE is invalid", () => {
  87. const prev = process.env.QMD_EXPAND_CONTEXT_SIZE;
  88. process.env.QMD_EXPAND_CONTEXT_SIZE = "bad";
  89. const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
  90. try {
  91. const llm = new LlamaCpp({}) as any;
  92. expect(llm.expandContextSize).toBe(defaultExpandContextSize);
  93. expect(stderrSpy).toHaveBeenCalled();
  94. expect(String(stderrSpy.mock.calls[0]?.[0] || "")).toContain("QMD_EXPAND_CONTEXT_SIZE");
  95. } finally {
  96. stderrSpy.mockRestore();
  97. if (prev === undefined) delete process.env.QMD_EXPAND_CONTEXT_SIZE;
  98. else process.env.QMD_EXPAND_CONTEXT_SIZE = prev;
  99. }
  100. });
  101. test("throws when config expandContextSize is invalid", () => {
  102. expect(() => new LlamaCpp({ expandContextSize: 0 })).toThrow(
  103. "Invalid expandContextSize: 0. Must be a positive integer."
  104. );
  105. });
  106. });
  107. describe("LlamaCpp model resolution (config > env > default)", () => {
  108. const HARDCODED_EMBED = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
  109. const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
  110. const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
  111. test("uses hardcoded default when no config or env is set", () => {
  112. const prev = process.env.QMD_EMBED_MODEL;
  113. delete process.env.QMD_EMBED_MODEL;
  114. try {
  115. const llm = new LlamaCpp({}) as any;
  116. expect(llm.embedModelUri).toBe(HARDCODED_EMBED);
  117. expect(llm.rerankModelUri).toBe(HARDCODED_RERANK);
  118. expect(llm.generateModelUri).toBe(HARDCODED_GENERATE);
  119. } finally {
  120. if (prev === undefined) delete process.env.QMD_EMBED_MODEL;
  121. else process.env.QMD_EMBED_MODEL = prev;
  122. }
  123. });
  124. test("env var overrides hardcoded default", () => {
  125. const prev = process.env.QMD_EMBED_MODEL;
  126. process.env.QMD_EMBED_MODEL = "hf:custom/embed-model.gguf";
  127. try {
  128. const llm = new LlamaCpp({}) as any;
  129. expect(llm.embedModelUri).toBe("hf:custom/embed-model.gguf");
  130. } finally {
  131. if (prev === undefined) delete process.env.QMD_EMBED_MODEL;
  132. else process.env.QMD_EMBED_MODEL = prev;
  133. }
  134. });
  135. test("config overrides env var", () => {
  136. const prev = process.env.QMD_EMBED_MODEL;
  137. process.env.QMD_EMBED_MODEL = "hf:env/model.gguf";
  138. try {
  139. const llm = new LlamaCpp({ embedModel: "hf:config/model.gguf" }) as any;
  140. expect(llm.embedModelUri).toBe("hf:config/model.gguf");
  141. } finally {
  142. if (prev === undefined) delete process.env.QMD_EMBED_MODEL;
  143. else process.env.QMD_EMBED_MODEL = prev;
  144. }
  145. });
  146. });
  147. // =============================================================================
  148. // QMD_DISABLE_LOCAL_LLM + remote-only auto-CPU (i-c28wngnd)
  149. // =============================================================================
  150. describe("isLocalLlmDisabled (QMD_DISABLE_LOCAL_LLM)", () => {
  151. test("returns false when env var is unset", () => {
  152. expect(isLocalLlmDisabled({})).toBe(false);
  153. expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: undefined })).toBe(false);
  154. });
  155. test("returns false when env var is empty / whitespace", () => {
  156. expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: "" })).toBe(false);
  157. expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: " " })).toBe(false);
  158. });
  159. test("returns true for canonical truthy values", () => {
  160. for (const v of ["1", "true", "yes", "on", "TRUE", "Yes", " 1 "]) {
  161. expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: v })).toBe(true);
  162. }
  163. });
  164. test("returns false for canonical falsy values", () => {
  165. for (const v of ["0", "false", "no", "off", "FALSE", "No"]) {
  166. expect(isLocalLlmDisabled({ QMD_DISABLE_LOCAL_LLM: v })).toBe(false);
  167. }
  168. });
  169. });
  170. describe("resolveLlamaGpuMode (QMD_LLAMA_GPU + QMD_EMBED_ENDPOINT)", () => {
  171. test("returns 'auto' for empty env (legacy local-only setup)", () => {
  172. expect(resolveLlamaGpuMode({})).toBe("auto");
  173. });
  174. test("explicit QMD_LLAMA_GPU=off|none|0|disabled forces CPU", () => {
  175. for (const v of ["off", "none", "false", "0", "disabled", "disable", "OFF", "None"]) {
  176. expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: v })).toBe("cpu");
  177. }
  178. });
  179. test("explicit QMD_LLAMA_GPU=auto|on|true preserves probe", () => {
  180. for (const v of ["auto", "on", "true", "Auto"]) {
  181. expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: v })).toBe("auto");
  182. }
  183. });
  184. test("auto-detect: QMD_EMBED_ENDPOINT set → CPU (skip Vulkan probe)", () => {
  185. expect(
  186. resolveLlamaGpuMode({ QMD_EMBED_ENDPOINT: "http://models:8082" }),
  187. ).toBe("cpu");
  188. });
  189. test("explicit QMD_LLAMA_GPU=auto OVERRIDES QMD_EMBED_ENDPOINT auto-CPU", () => {
  190. expect(
  191. resolveLlamaGpuMode({
  192. QMD_LLAMA_GPU: "auto",
  193. QMD_EMBED_ENDPOINT: "http://models:8082",
  194. }),
  195. ).toBe("auto");
  196. });
  197. test("empty QMD_EMBED_ENDPOINT does not trigger auto-CPU", () => {
  198. expect(resolveLlamaGpuMode({ QMD_EMBED_ENDPOINT: "" })).toBe("auto");
  199. expect(resolveLlamaGpuMode({ QMD_EMBED_ENDPOINT: " " })).toBe("auto");
  200. });
  201. test("unknown QMD_LLAMA_GPU values fall back to 'auto' (preserve legacy probe)", () => {
  202. expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: "vulkan" })).toBe("auto");
  203. expect(resolveLlamaGpuMode({ QMD_LLAMA_GPU: "cuda" })).toBe("auto");
  204. });
  205. });
  206. describe("LlamaCpp.ensureLlama() + QMD_DISABLE_LOCAL_LLM", () => {
  207. test("throws with actionable error when QMD_DISABLE_LOCAL_LLM=1", async () => {
  208. const prev = process.env.QMD_DISABLE_LOCAL_LLM;
  209. process.env.QMD_DISABLE_LOCAL_LLM = "1";
  210. try {
  211. const llm = new LlamaCpp({}) as any;
  212. await expect(llm.ensureLlama()).rejects.toThrow(/QMD_DISABLE_LOCAL_LLM/);
  213. await expect(llm.ensureLlama()).rejects.toThrow(/EmbeddingProvider/);
  214. } finally {
  215. if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_LLM;
  216. else process.env.QMD_DISABLE_LOCAL_LLM = prev;
  217. }
  218. });
  219. test("does not throw when QMD_DISABLE_LOCAL_LLM is unset (smoke)", () => {
  220. // We don't want to actually call getLlama() (slow / loads native), but
  221. // we verify the guard does NOT fire for an empty/unset env. The full
  222. // integration path is exercised by the gated CI suite below.
  223. const prev = process.env.QMD_DISABLE_LOCAL_LLM;
  224. delete process.env.QMD_DISABLE_LOCAL_LLM;
  225. try {
  226. expect(isLocalLlmDisabled(process.env)).toBe(false);
  227. } finally {
  228. if (prev !== undefined) process.env.QMD_DISABLE_LOCAL_LLM = prev;
  229. }
  230. });
  231. });
  232. describe("LlamaCpp rerank deduping", () => {
  233. test("deduplicates identical document texts before scoring", async () => {
  234. const llm = new LlamaCpp({}) as any;
  235. llm._ciMode = false; // allow unit test even in CI (mocked, no real models)
  236. const rankAll = vi.fn(async (_query: string, docs: string[]) =>
  237. docs.map((doc) => doc === "shared chunk" ? 0.9 : 0.2)
  238. );
  239. llm.touchActivity = vi.fn();
  240. llm.ensureRerankContexts = vi.fn().mockResolvedValue([{ rankAll }]);
  241. llm.ensureRerankModel = vi.fn().mockResolvedValue({
  242. tokenize: (text: string) => Array.from(text),
  243. detokenize: (tokens: string[]) => tokens.join(""),
  244. });
  245. const result = await llm.rerank("query", [
  246. { file: "a.md", text: "shared chunk" },
  247. { file: "b.md", text: "shared chunk" },
  248. { file: "c.md", text: "different chunk" },
  249. ]);
  250. expect(rankAll).toHaveBeenCalledTimes(1);
  251. expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]);
  252. expect(result.results).toHaveLength(3);
  253. const scoreByFile = new Map(result.results.map((item) => [item.file, item.score]));
  254. expect(scoreByFile.get("a.md")).toBe(0.9);
  255. expect(scoreByFile.get("b.md")).toBe(0.9);
  256. expect(scoreByFile.get("c.md")).toBe(0.2);
  257. });
  258. });
  259. // =============================================================================
  260. // Integration Tests (require actual models)
  261. // =============================================================================
  262. describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
  263. // Use the singleton to avoid multiple Metal contexts
  264. const llm = getDefaultLlamaCpp();
  265. afterAll(async () => {
  266. // Ensure native resources are released to avoid ggml-metal asserts on process exit.
  267. await disposeDefaultLlamaCpp();
  268. });
  describe("embed", () => {
    test("returns embedding with correct dimensions", async () => {
      const embedded = await llm.embed("Hello world");

      expect(embedded).not.toBeNull();
      const vector = embedded!.embedding;
      expect(vector).toBeInstanceOf(Array);
      expect(vector.length).toBeGreaterThan(0);
      // embeddinggemma outputs 768 dimensions
      expect(vector.length).toBe(768);
    });

    test("returns consistent embeddings for same input", async () => {
      const first = await llm.embed("test text");
      const second = await llm.embed("test text");

      expect(first).not.toBeNull();
      expect(second).not.toBeNull();

      // Same input twice: every component must agree to 5 decimal places.
      first!.embedding.forEach((value, idx) => {
        expect(value).toBeCloseTo(second!.embedding[idx]!, 5);
      });
    });

    test("returns different embeddings for different inputs", async () => {
      const cats = await llm.embed("cats are great");
      const databases = await llm.embed("database optimization");

      expect(cats).not.toBeNull();
      expect(databases).not.toBeNull();

      // Cosine similarity between the two vectors, accumulated in index order.
      let dot = 0;
      let sumSqA = 0;
      let sumSqB = 0;
      cats!.embedding.forEach((a, idx) => {
        const b = databases!.embedding[idx]!;
        dot += a * b;
        sumSqA += a ** 2;
        sumSqB += b ** 2;
      });
      const cosine = dot / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB));

      expect(cosine).toBeLessThan(0.95); // Should be meaningfully different
    });
  });
  describe("embedBatch", () => {
    test("returns embeddings for multiple texts", async () => {
      const texts = ["Hello world", "Test text", "Another document"];
      const results = await llm.embedBatch(texts);

      expect(results).toHaveLength(3);
      for (const result of results) {
        expect(result).not.toBeNull();
        expect(result!.embedding.length).toBe(768);
      }
    });

    test("returns same results as individual embed calls", async () => {
      const texts = ["cats are great", "dogs are awesome"];

      const batchResults = await llm.embedBatch(texts);
      const individualResults = await Promise.all(texts.map((t) => llm.embed(t)));

      // Both paths must agree component by component (to 5 decimal places).
      for (let i = 0; i < texts.length; i++) {
        expect(batchResults[i]).not.toBeNull();
        expect(individualResults[i]).not.toBeNull();
        for (let j = 0; j < batchResults[i]!.embedding.length; j++) {
          expect(batchResults[i]!.embedding[j]).toBeCloseTo(individualResults[i]!.embedding[j]!, 5);
        }
      }
    });

    test("handles empty array", async () => {
      const results = await llm.embedBatch([]);
      expect(results).toHaveLength(0);
    });

    test("batch is faster than sequential", async () => {
      const texts = Array(10).fill(null).map((_, i) => `Document number ${i} with content`);

      // Time batch
      const batchStart = Date.now();
      await llm.embedBatch(texts);
      const batchTime = Date.now() - batchStart;

      // Time sequential
      const seqStart = Date.now();
      for (const text of texts) {
        await llm.embed(text);
      }
      const seqTime = Date.now() - seqStart;

      console.log(`Batch: ${batchTime}ms, Sequential: ${seqTime}ms`);
      // Performance is machine/load dependent. We only assert batch isn't drastically worse.
      expect(batchTime).toBeLessThanOrEqual(seqTime * 3);
    });

    test("handles concurrent embedBatch calls on fresh instance without race condition", async () => {
      // Regression test for a race in embed-context creation: concurrent
      // embedBatch calls on a fresh instance each saw "no context yet" and
      // built their own, leaking resources and risking "Context is disposed"
      // errors. The fix guards creation with a shared promise.
      //
      // See: https://github.com/tobi/qmd/pull/54
      //
      // Verified here by counting createEmbeddingContext invocations.
      const freshLlm = new LlamaCpp({});
      try {
        let contextCreateCount = 0;

        // Wrap ensureEmbedModel so the model's createEmbeddingContext is
        // instrumented exactly once, the first time the model is handed out.
        const originalEnsureEmbedModel = (freshLlm as any).ensureEmbedModel.bind(freshLlm);
        let modelInstrumented = false;
        (freshLlm as any).ensureEmbedModel = async function() {
          const model = await originalEnsureEmbedModel();
          if (!modelInstrumented) {
            modelInstrumented = true;
            const originalCreate = model.createEmbeddingContext.bind(model);
            model.createEmbeddingContext = async function(...args: any[]) {
              contextCreateCount++;
              return originalCreate(...args);
            };
          }
          return model;
        };

        const texts = Array(10).fill(null).map((_, i) => `Document ${i}`);

        // Five embedBatch calls race on the fresh instance.
        const batches = await Promise.all([
          freshLlm.embedBatch(texts.slice(0, 2)),
          freshLlm.embedBatch(texts.slice(2, 4)),
          freshLlm.embedBatch(texts.slice(4, 6)),
          freshLlm.embedBatch(texts.slice(6, 8)),
          freshLlm.embedBatch(texts.slice(8, 10)),
        ]);

        const allResults = batches.flat();
        expect(allResults).toHaveLength(10);
        const successCount = allResults.filter((r) => r !== null).length;
        expect(successCount).toBe(10);

        // KEY ASSERTION: one set of contexts is created in total, not one set
        // per concurrent caller. Per the original author's note the size of
        // that set depends on available VRAM and is capped at 8, so the count
        // is only bounded, not pinned to an exact value.
        console.log(`Context creation count: ${contextCreateCount} (expected: ≤ 8, not 5× duplicated)`);
        expect(contextCreateCount).toBeGreaterThanOrEqual(1);
        expect(contextCreateCount).toBeLessThanOrEqual(8);
      } finally {
        // Dispose even when an assertion above fails, so the fresh instance
        // is not left undisposed for the rest of the run.
        await freshLlm.dispose();
      }
    }, 60000);
  });
  describe("rerank", () => {
    test("scores capital of France question correctly", async () => {
      const documents: RerankDocument[] = [
        { file: "butterflies.txt", text: "Butterflies indeed fly through the garden." },
        { file: "france.txt", text: "The capital of France is Paris." },
        { file: "canada.txt", text: "The capital of Canada is Ottawa." },
      ];

      const { results } = await llm.rerank("What is the capital of France?", documents);

      expect(results).toHaveLength(3);
      const [best, middle, worst] = results;
      // The France document should score highest
      expect(best!.file).toBe("france.txt");
      expect(best!.score).toBeGreaterThan(0.7);
      // Canada should be somewhat relevant (also about capitals)
      expect(middle!.file).toBe("canada.txt");
      // Butterflies should score lowest
      expect(worst!.file).toBe("butterflies.txt");
      expect(worst!.score).toBeLessThan(0.6);
    });

    test("scores authentication query correctly", async () => {
      const documents: RerankDocument[] = [
        { file: "weather.md", text: "The weather today is sunny with mild temperatures." },
        { file: "auth.md", text: "Authentication can be configured by setting the AUTH_SECRET environment variable." },
        { file: "pizza.md", text: "Our restaurant serves the best pizza in town." },
        { file: "jwt.md", text: "JWT authentication requires a secret key and expiration time." },
      ];

      const { results } = await llm.rerank("How do I configure authentication?", documents);

      expect(results).toHaveLength(4);
      const rankedFiles = results.map((r) => r.file);
      // Auth documents should score highest
      const topTwo = rankedFiles.slice(0, 2);
      expect(topTwo).toContain("auth.md");
      expect(topTwo).toContain("jwt.md");
      // Irrelevant documents should score lowest
      const bottomTwo = rankedFiles.slice(2);
      expect(bottomTwo).toContain("weather.md");
      expect(bottomTwo).toContain("pizza.md");
    });

    test("handles programming queries correctly", async () => {
      const documents: RerankDocument[] = [
        { file: "cooking.md", text: "To make a good pasta, boil water and add salt." },
        { file: "errors.md", text: "Use try-catch blocks to handle JavaScript errors gracefully." },
        { file: "python.md", text: "Python uses try-except for exception handling." },
      ];

      const { results } = await llm.rerank("How do I handle errors in JavaScript?", documents);

      // JavaScript errors doc should score highest
      expect(results[0]!.file).toBe("errors.md");
      expect(results[0]!.score).toBeGreaterThan(0.7);
      // The Python doc may land anywhere in between (same concept, other
      // language); cooking should be least relevant.
      expect(results[2]!.file).toBe("cooking.md");
    });

    test("handles empty document list", async () => {
      const { results } = await llm.rerank("test query", []);
      expect(results).toHaveLength(0);
    });

    test("handles single document", async () => {
      const onlyDoc = { file: "doc.md", text: "content" };
      const { results } = await llm.rerank("test", [onlyDoc]);

      expect(results).toHaveLength(1);
      expect(results[0]!.file).toBe("doc.md");
    });

    test("preserves original file paths", async () => {
      const documents: RerankDocument[] = [
        { file: "path/to/doc1.md", text: "content one" },
        { file: "another/path/doc2.md", text: "content two" },
      ];

      const { results } = await llm.rerank("query", documents);

      // Compare as a sorted list so the assertion is independent of ranking.
      const returnedFiles = results.map((r) => r.file);
      returnedFiles.sort();
      expect(returnedFiles).toEqual(["another/path/doc2.md", "path/to/doc1.md"]);
    });

    test("returns scores between 0 and 1", async () => {
      const documents: RerankDocument[] = [
        { file: "a.md", text: "The quick brown fox jumps over the lazy dog." },
        { file: "b.md", text: "Machine learning algorithms process data efficiently." },
        { file: "c.md", text: "React components use JSX syntax for rendering." },
      ];

      const { results } = await llm.rerank("Tell me about animals", documents);

      results.forEach(({ score }) => {
        expect(score).toBeGreaterThanOrEqual(0);
        expect(score).toBeLessThanOrEqual(1);
      });
    });

    test("batch reranks multiple documents efficiently", async () => {
      // Ten synthetic documents exercise the batch path.
      const documents: RerankDocument[] = Array.from({ length: 10 }, (_, i) => ({
        file: `doc${i}.md`,
        text: `Document number ${i} with some content about topic ${i % 3}`,
      }));

      const startedAt = Date.now();
      const { results } = await llm.rerank("topic 1", documents);
      const elapsed = Date.now() - startedAt;

      expect(results).toHaveLength(10);
      // Every document must come back with a score inside [0, 1].
      results.forEach(({ score }) => {
        expect(score).toBeGreaterThanOrEqual(0);
        expect(score).toBeLessThanOrEqual(1);
      });

      // Timing is logged for monitoring only; nothing is asserted on it.
      console.log(`Batch rerank of 10 docs took ${elapsed}ms`);
    });

    test("uses fewer active rerank contexts for small batches", async () => {
      // Indices of the fake contexts whose rankAll was invoked, in call order.
      const calls: number[] = [];

      const fakeModel = {
        tokenize: (text: string) => Array.from(text),
        detokenize: (tokens: string[]) => tokens.join(""),
      };
      const fakeContexts = [0, 1, 2, 3].map((idx) => ({
        rankAll: async (_query: string, docs: string[]) => {
          calls.push(idx);
          return docs.map(() => 0.5);
        },
      }));

      const freshLlm = new LlamaCpp({});
      (freshLlm as any).ensureRerankModel = async () => fakeModel;
      (freshLlm as any).ensureRerankContexts = async () => fakeContexts;

      const documents: RerankDocument[] = [];
      for (let i = 0; i < 20; i++) {
        documents.push({ file: `doc${i}.md`, text: `Document number ${i}` });
      }

      const { results } = await freshLlm.rerank("topic 1", documents);

      expect(results).toHaveLength(20);
      // Four contexts are offered, but 20 documents should only use two.
      expect(calls).toEqual([0, 1]);
    });

    test("truncates and reranks document exceeding 2048 token context size", async () => {
      // The reranker context is created with contextSize=2048. Documents that
      // exceed the token budget (contextSize - template overhead - query tokens)
      // should be silently truncated rather than crashing.
      const paragraph = "The quick brown fox jumps over the lazy dog near the riverbank. " +
        "Authentication tokens must be validated on every request to ensure security. " +
        "Database queries should use prepared statements to prevent SQL injection attacks. " +
        "The deployment pipeline includes linting, testing, building, and publishing stages. ";
      // ~320 chars per paragraph, repeat 40 times = ~12800 chars ≈ 3200 tokens
      const longText = paragraph.repeat(40);

      const documents: RerankDocument[] = [
        { file: "short-relevant.md", text: "Authentication can be configured by setting AUTH_SECRET." },
        { file: "long-doc.md", text: longText },
        { file: "short-irrelevant.md", text: "The weather is sunny today." },
      ];

      console.log(`Long doc length: ${longText.length} chars (~${Math.round(longText.length / 4)} tokens)`);
      const { results } = await llm.rerank("How do I configure authentication?", documents);

      // Should return all 3 documents without crashing
      expect(results).toHaveLength(3);

      // All scores should be valid numbers in [0, 1]
      results.forEach(({ score }) => {
        expect(score).toBeGreaterThanOrEqual(0);
        expect(score).toBeLessThanOrEqual(1);
        expect(Number.isNaN(score)).toBe(false);
      });

      // The ranking is only logged for manual inspection; no ordering is
      // asserted for this case.
      console.log("Rerank results for long doc test:");
      results.forEach(({ file, score }) => {
        console.log(` ${file}: ${score.toFixed(4)}`);
      });
    });
  });
  describe("expandQuery", () => {
    test("returns query expansions with correct types", async () => {
      const allowedTypes = ["lex", "vec", "hyde"];

      // The expansion is a list of entries typed lex, vec and/or hyde.
      const expansions = await llm.expandQuery("test query");

      expect(expansions.length).toBeGreaterThanOrEqual(1);
      // Every entry needs a recognised type and non-empty text.
      expansions.forEach((entry) => {
        expect(allowedTypes).toContain(entry.type);
        expect(entry.text.length).toBeGreaterThan(0);
      });
    }, 30000); // 30s timeout for model loading

    test("can exclude lexical queries", async () => {
      const expansions = await llm.expandQuery("authentication setup", { includeLexical: false });

      // With includeLexical off, no 'lex' entries may remain.
      const lexical = expansions.filter((entry) => entry.type === "lex");
      expect(lexical).toHaveLength(0);
    });
  });
  584. });
  585. // =============================================================================
  586. // Session Management Tests
  587. // =============================================================================
  588. describe.skipIf(!!process.env.CI)("LLM Session Management", () => {
  589. describe("withLLMSession", () => {
  590. test("session provides access to LLM operations", async () => {
  591. const result = await withLLMSession(async (session) => {
  592. expect(session.isValid).toBe(true);
  593. const embedding = await session.embed("test text");
  594. expect(embedding).not.toBeNull();
  595. expect(embedding!.embedding.length).toBe(768);
  596. return "success";
  597. });
  598. expect(result).toBe("success");
  599. });
  test("session is invalid after release", async () => {
    // Receive the session through the callback's return value instead of
    // assigning a nullable `let` from inside the closure. Closure assignments
    // are not tracked by TypeScript's control-flow analysis, so the old pattern
    // needed a non-null assertion to read the captured value.
    const releasedSession: ILLMSession = await withLLMSession(async (session) => {
      // Still valid while the callback is running.
      expect(session.isValid).toBe(true);
      return session;
    });

    // Session should be invalid after withLLMSession returns
    expect(releasedSession).toBeDefined();
    expect(releasedSession.isValid).toBe(false);
  });
  test("session prevents idle unload during operations", async () => {
    await withLLMSession(async (session) => {
      // An open session blocks unloading before any work is done...
      const unloadableBeforeWork = canUnloadLLM();
      expect(unloadableBeforeWork).toBe(false);

      // ...and keeps blocking it after an operation has completed.
      await session.embed("test");
      const unloadableAfterWork = canUnloadLLM();
      expect(unloadableAfterWork).toBe(false);
    });

    // Once the session has ended, unloading is allowed again.
    const unloadableAfterSession = canUnloadLLM();
    expect(unloadableAfterSession).toBe(true);
  });
  test("nested sessions increment ref count", async () => {
    await withLLMSession(async (outer) => {
      // One session open: unloading is blocked.
      expect(canUnloadLLM()).toBe(false);

      await withLLMSession(async (inner) => {
        // Two sessions open: still blocked, and both sessions are valid.
        expect(canUnloadLLM()).toBe(false);
        expect(inner.isValid).toBe(true);
        expect(outer.isValid).toBe(true);
      });

      // The inner session is gone, but the outer one keeps the LLM pinned.
      expect(canUnloadLLM()).toBe(false);
      expect(outer.isValid).toBe(true);
    });

    // Every session has been released, so unloading is permitted.
    expect(canUnloadLLM()).toBe(true);
  });
  test("session embedBatch works correctly", async () => {
    await withLLMSession(async (session) => {
      const embeddings = await session.embedBatch([
        "Hello world",
        "Test text",
        "Another document",
      ]);

      // One result per input text.
      expect(embeddings).toHaveLength(3);

      // Each result must be present and carry a 768-entry vector.
      embeddings.forEach((item) => {
        expect(item).not.toBeNull();
        expect(item!.embedding.length).toBe(768);
      });
    });
  });
  test("session rerank works correctly", async () => {
    await withLLMSession(async (session) => {
      // One document that answers the query and one unrelated document.
      const candidates: RerankDocument[] = [
        { file: "a.txt", text: "The capital of France is Paris." },
        { file: "b.txt", text: "Dogs are great pets." },
      ];

      const reranked = await session.rerank("What is the capital of France?", candidates);
      const ranked = reranked.results;
      expect(ranked).toHaveLength(2);

      // The relevant document must come first with a strictly higher score.
      const top = ranked[0]!;
      const runnerUp = ranked[1]!;
      expect(top.file).toBe("a.txt");
      expect(top.score).toBeGreaterThan(runnerUp.score);
    });
  });
  test("max duration aborts session after timeout", async () => {
    // Start a session whose callback deliberately outlives its 50ms budget.
    const overrunningSession = withLLMSession(async (session) => {
      // Wait longer than max duration
      await new Promise((resolve) => setTimeout(resolve, 150));
      // This operation should throw because session was aborted
      await session.embed("test");
    }, { maxDuration: 50 }); // 50ms max

    // Assert directly on the rejection instead of tracking a flag in a manual
    // try/catch: a missing or wrongly-typed rejection fails with a clear message.
    await expect(overrunningSession).rejects.toBeInstanceOf(SessionReleasedError);
  }, 5000);
  test("external abort signal propagates to session", async () => {
    const controller = new AbortController();
    let sawInvalidSession = false;

    const pending = withLLMSession(async (session) => {
      // Give the external abort (scheduled below) time to fire first.
      await new Promise((resolve) => setTimeout(resolve, 100));

      if (session.isValid) {
        return "should not reach";
      }

      // The abort reached the session: record it and bail out.
      sawInvalidSession = true;
      throw new SessionReleasedError("Session aborted");
    }, { signal: controller.signal });

    // Abort after 20ms, well before the callback wakes up.
    setTimeout(() => controller.abort(), 20);

    try {
      await pending;
    } catch (_ignored) {
      // Expected
    }

    expect(sawInvalidSession).toBe(true);
  }, 5000);
  test("session provides abort signal for monitoring", async () => {
    await withLLMSession(async (session) => {
      // A live session exposes an AbortSignal that has not fired yet.
      const isAbortSignal = session.signal instanceof AbortSignal;
      expect(session.signal).toBeInstanceOf(AbortSignal);
      expect(isAbortSignal && session.signal.aborted).toBe(false);
    });
  });
  test("returns value from callback", async () => {
    const expectedPayload = { status: "complete", count: 42 };

    // Whatever the callback returns should come back from withLLMSession.
    const actualPayload = await withLLMSession(async (session) => {
      await session.embed("test");
      return { status: "complete", count: 42 };
    });

    expect(actualPayload).toEqual(expectedPayload);
  });
  test("propagates errors from callback", async () => {
    const failure = new Error("Custom test error");

    // An error thrown inside the callback must reject the outer promise.
    const failingSession = withLLMSession(async () => {
      throw failure;
    });

    await expect(failingSession).rejects.toThrow("Custom test error");
  });
  720. });
  721. });