llm.test.ts 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853
  1. /**
  2. * llm.test.ts - Unit tests for the LLM abstraction layer (node-llama-cpp)
  3. *
  4. * Run with: bun test src/llm.test.ts
  5. *
  6. * These tests require the actual models to be downloaded. Run the embed or
  7. * rerank functions first to trigger model downloads.
  8. */
  9. import { describe, test, expect, beforeAll, afterAll, vi } from "vitest";
  10. import {
  11. LlamaCpp,
  12. getDefaultLlamaCpp,
  13. disposeDefaultLlamaCpp,
  14. resolveLlamaGpuMode,
  15. withLLMSession,
  16. canUnloadLLM,
  17. SessionReleasedError,
  18. type RerankDocument,
  19. type ILLMSession,
  20. } from "../src/llm.js";
  21. // =============================================================================
  22. // Singleton Tests (no model loading required)
  23. // =============================================================================
  describe("Default LlamaCpp Singleton", () => {
    // The shared instance is intentionally not reset here, so this test
    // cannot leave an orphaned LlamaCpp instance behind.
    test("getDefaultLlamaCpp returns same instance on subsequent calls", () => {
      const firstCall = getDefaultLlamaCpp();
      const secondCall = getDefaultLlamaCpp();

      // Identity (not structural) equality: both calls must yield one object.
      expect(firstCall).toBe(secondCall);
      expect(firstCall).toBeInstanceOf(LlamaCpp);
    });
  });
  33. // =============================================================================
  34. // Model Existence Tests
  35. // =============================================================================
  describe("LlamaCpp.modelExists", () => {
    // One row per scenario: the URI handed to modelExists() and the `exists`
    // flag expected back. The returned `name` must always echo the URI.
    const scenarios = [
      {
        title: "returns exists:true for HuggingFace model URIs",
        uri: "hf:org/repo/model.gguf",
        expectedExists: true,
      },
      {
        title: "returns exists:false for non-existent local paths",
        uri: "/nonexistent/path/model.gguf",
        expectedExists: false,
      },
    ];

    for (const { title, uri, expectedExists } of scenarios) {
      test(title, async () => {
        const info = await getDefaultLlamaCpp().modelExists(uri);

        expect(info.exists).toBe(expectedExists);
        expect(info.name).toBe(uri);
      });
    }
  });
  describe("QMD_LLAMA_GPU resolution", () => {
    test("uses auto when unset or blank", () => {
      for (const raw of [undefined, " "]) {
        expect(resolveLlamaGpuMode(raw)).toBe("auto");
      }
    });

    test("maps CPU disable values to false", () => {
      // Mixed case and surrounding whitespace are part of what is being checked.
      const disablingValues = ["false", "OFF", " none ", "disabled", "0"];
      for (const raw of disablingValues) {
        expect(resolveLlamaGpuMode(raw)).toBe(false);
      }
    });

    test("passes through supported GPU backends", () => {
      // [raw input, expected resolved backend]
      const backends: Array<[string, string]> = [
        ["metal", "metal"],
        ["VULKAN", "vulkan"],
        [" cuda ", "cuda"],
      ];
      for (const [raw, resolved] of backends) {
        expect(resolveLlamaGpuMode(raw)).toBe(resolved);
      }
    });

    test("warns and falls back to auto for unsupported values", () => {
      // Silence and capture stderr so the warning can be inspected.
      const writeSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
      try {
        const mode = resolveLlamaGpuMode("rocm");
        expect(mode).toBe("auto");

        expect(writeSpy).toHaveBeenCalled();
        const firstChunk = writeSpy.mock.calls[0]?.[0] || "";
        expect(String(firstChunk)).toContain("QMD_LLAMA_GPU");
      } finally {
        // Always un-mock stderr, even when an assertion above throws.
        writeSpy.mockRestore();
      }
    });
  });
  describe("LlamaCpp expand context size config", () => {
    const ENV_NAME = "QMD_EXPAND_CONTEXT_SIZE";
    const defaultExpandContextSize = 2048;

    /**
     * Runs `body` with QMD_EXPAND_CONTEXT_SIZE forced to `value`
     * (`undefined` removes the variable) and puts the previous value back
     * afterwards, even when `body` throws.
     */
    const withEnvValue = (value: string | undefined, body: () => void): void => {
      const saved = process.env[ENV_NAME];
      if (value === undefined) {
        delete process.env[ENV_NAME];
      } else {
        process.env[ENV_NAME] = value;
      }
      try {
        body();
      } finally {
        if (saved === undefined) {
          delete process.env[ENV_NAME];
        } else {
          process.env[ENV_NAME] = saved;
        }
      }
    };

    // Each test casts the instance to `any` to read `expandContextSize`
    // (presumably not exposed on the public type; confirm against src/llm).

    test("uses default expand context size when no config or env is set", () => {
      withEnvValue(undefined, () => {
        const instance = new LlamaCpp({}) as any;
        expect(instance.expandContextSize).toBe(defaultExpandContextSize);
      });
    });

    test("uses QMD_EXPAND_CONTEXT_SIZE when set to a positive integer", () => {
      withEnvValue("3072", () => {
        const instance = new LlamaCpp({}) as any;
        expect(instance.expandContextSize).toBe(3072);
      });
    });

    test("config value overrides QMD_EXPAND_CONTEXT_SIZE", () => {
      withEnvValue("4096", () => {
        const instance = new LlamaCpp({ expandContextSize: 1536 }) as any;
        expect(instance.expandContextSize).toBe(1536);
      });
    });

    test("falls back to default and warns when QMD_EXPAND_CONTEXT_SIZE is invalid", () => {
      withEnvValue("bad", () => {
        // Capture stderr so the warning text can be checked without noise.
        const writeSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
        try {
          const instance = new LlamaCpp({}) as any;
          expect(instance.expandContextSize).toBe(defaultExpandContextSize);

          expect(writeSpy).toHaveBeenCalled();
          const firstChunk = writeSpy.mock.calls[0]?.[0] || "";
          expect(String(firstChunk)).toContain("QMD_EXPAND_CONTEXT_SIZE");
        } finally {
          writeSpy.mockRestore();
        }
      });
    });

    test("throws when config expandContextSize is invalid", () => {
      const construct = () => new LlamaCpp({ expandContextSize: 0 });
      expect(construct).toThrow(
        "Invalid expandContextSize: 0. Must be a positive integer."
      );
    });
  });
  describe("LlamaCpp model resolution (config > env > default)", () => {
    const HARDCODED_EMBED = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
    const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
    const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";

    /**
     * Runs `body` with each listed environment variable forced to the given
     * value (`undefined` removes it), then restores every touched variable to
     * what it was before, even when `body` throws.
     */
    const withEnv = (values: Record<string, string | undefined>, body: () => void): void => {
      const saved = new Map<string, string | undefined>();
      for (const [key, value] of Object.entries(values)) {
        saved.set(key, process.env[key]);
        if (value === undefined) delete process.env[key];
        else process.env[key] = value;
      }
      try {
        body();
      } finally {
        for (const [key, previous] of saved) {
          if (previous === undefined) delete process.env[key];
          else process.env[key] = previous;
        }
      }
    };

    test("uses hardcoded default when no config or env is set", () => {
      // This test asserts all three defaults, so all three overrides must be
      // cleared or a developer's shell environment can make it fail.
      // NOTE(review): QMD_RERANK_MODEL and QMD_GENERATE_MODEL are assumed to
      // mirror QMD_EMBED_MODEL; confirm the names against src/llm. Clearing a
      // variable the implementation does not read is harmless.
      withEnv(
        {
          QMD_EMBED_MODEL: undefined,
          QMD_RERANK_MODEL: undefined,
          QMD_GENERATE_MODEL: undefined,
        },
        () => {
          const llm = new LlamaCpp({}) as any;
          expect(llm.embedModelUri).toBe(HARDCODED_EMBED);
          expect(llm.rerankModelUri).toBe(HARDCODED_RERANK);
          expect(llm.generateModelUri).toBe(HARDCODED_GENERATE);
        }
      );
    });

    test("env var overrides hardcoded default", () => {
      withEnv({ QMD_EMBED_MODEL: "hf:custom/embed-model.gguf" }, () => {
        const llm = new LlamaCpp({}) as any;
        expect(llm.embedModelUri).toBe("hf:custom/embed-model.gguf");
      });
    });

    test("config overrides env var", () => {
      withEnv({ QMD_EMBED_MODEL: "hf:env/model.gguf" }, () => {
        const llm = new LlamaCpp({ embedModel: "hf:config/model.gguf" }) as any;
        expect(llm.embedModelUri).toBe("hf:config/model.gguf");
      });
    });
  });
  describe("LlamaCpp embedding truncation", () => {
    test("truncates against the active embedding context limit, not the model train context", async () => {
      const instance = new LlamaCpp({}) as any;

      // Stub embedding context: echoes the text it was given so the test can
      // observe exactly what survived truncation.
      const getEmbeddingFor = vi.fn(async (text: string) => {
        return { vector: new Float32Array([0.25, 0.5]), text };
      });

      // Fake model with a one-token-per-character tokenizer. Its train context
      // (8192) is larger than the length the assertion below expects.
      instance.embedModel = {
        trainContextSize: 8192,
        tokenize: (text: string) => new Array(text.length).fill(1),
        detokenize: (tokens: readonly number[]) => "x".repeat(tokens.length),
      };
      instance.touchActivity = vi.fn();
      instance.ensureEmbedContext = vi.fn().mockResolvedValue({ getEmbeddingFor });

      const oversizedInput = "x".repeat(3000);
      const embedded = await instance.embed(oversizedInput);

      // 3000 one-character tokens in, 2044 out.
      expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044));
      expect(embedded).toEqual({
        embedding: [0.25, 0.5],
        model: instance.embedModelUri,
      });
    });
  });
  describe("LlamaCpp rerank deduping", () => {
    test("deduplicates identical document texts before scoring", async () => {
      const instance = new LlamaCpp({}) as any;
      instance._ciMode = false; // allow unit test even in CI (mocked, no real models)

      // Fake scorer: 0.9 for the duplicated text, 0.2 for anything else.
      const rankAll = vi.fn(async (_query: string, docs: string[]) => {
        return docs.map((doc) => (doc === "shared chunk" ? 0.9 : 0.2));
      });

      instance.touchActivity = vi.fn();
      instance.ensureRerankModel = vi.fn().mockResolvedValue({
        tokenize: (text: string) => Array.from(text),
        detokenize: (tokens: string[]) => tokens.join(""),
      });
      instance.ensureRerankContexts = vi.fn().mockResolvedValue([{ rankAll }]);

      const inputDocs = [
        { file: "a.md", text: "shared chunk" },
        { file: "b.md", text: "shared chunk" },
        { file: "c.md", text: "different chunk" },
      ];
      const outcome = await instance.rerank("query", inputDocs);

      // The scorer must see each distinct text once...
      expect(rankAll).toHaveBeenCalledTimes(1);
      expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]);

      // ...while every input file still receives a score.
      expect(outcome.results).toHaveLength(3);
      const scoreByFile = new Map<string, number>();
      for (const entry of outcome.results) {
        scoreByFile.set(entry.file, entry.score);
      }
      expect(scoreByFile.get("a.md")).toBe(0.9);
      expect(scoreByFile.get("b.md")).toBe(0.9);
      expect(scoreByFile.get("c.md")).toBe(0.2);
    });
  });
  describe("LlamaCpp.getDeviceInfo", () => {
    test("can skip build attempts for status probes", async () => {
      const vramState = { total: 1024, used: 256, free: 768 };

      // Stand-in for the llama runtime object that getDeviceInfo() reads from.
      const stubRuntime = {
        gpu: "metal",
        supportsGpuOffloading: true,
        cpuMathCores: 8,
        getGpuDeviceNames: vi.fn().mockResolvedValue(["Apple GPU"]),
        getVramState: vi.fn().mockResolvedValue(vramState),
      };

      const instance = new LlamaCpp({}) as any;
      instance.ensureLlama = vi.fn().mockResolvedValue(stubRuntime);

      const info = await instance.getDeviceInfo({ allowBuild: false });

      // allowBuild: false must be forwarded to ensureLlama as `false`.
      expect(instance.ensureLlama).toHaveBeenCalledWith(false);
      expect(info).toEqual({
        gpu: "metal",
        gpuOffloading: true,
        gpuDevices: ["Apple GPU"],
        vram: { total: 1024, used: 256, free: 768 },
        cpuCores: 8,
      });
    });
  });
  245. // =============================================================================
  246. // Integration Tests (require actual models)
  247. // =============================================================================
  248. describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
  249. // Use the singleton to avoid multiple Metal contexts
  250. const llm = getDefaultLlamaCpp();
  251. afterAll(async () => {
  252. // Ensure native resources are released to avoid ggml-metal asserts on process exit.
  253. await disposeDefaultLlamaCpp();
  254. });
  describe("embed", () => {
    test("returns embedding with correct dimensions", async () => {
      const embedded = await llm.embed("Hello world");
      expect(embedded).not.toBeNull();

      const vector = embedded!.embedding;
      expect(vector).toBeInstanceOf(Array);
      expect(vector.length).toBeGreaterThan(0);
      // embeddinggemma outputs 768 dimensions
      expect(vector.length).toBe(768);
    });

    test("returns consistent embeddings for same input", async () => {
      const firstRun = await llm.embed("test text");
      const secondRun = await llm.embed("test text");
      expect(firstRun).not.toBeNull();
      expect(secondRun).not.toBeNull();

      // Component-wise comparison to 5 decimal places.
      const repeated = secondRun!.embedding;
      firstRun!.embedding.forEach((component, index) => {
        expect(component).toBeCloseTo(repeated[index]!, 5);
      });
    });

    test("returns different embeddings for different inputs", async () => {
      const catsResult = await llm.embed("cats are great");
      const dbResult = await llm.embed("database optimization");
      expect(catsResult).not.toBeNull();
      expect(dbResult).not.toBeNull();

      // Cosine similarity between the two vectors.
      const left = catsResult!.embedding;
      const right = dbResult!.embedding;
      let dot = 0;
      let leftSquared = 0;
      let rightSquared = 0;
      left.forEach((a, index) => {
        const b = right[index]!;
        dot += a * b;
        leftSquared += a ** 2;
        rightSquared += b ** 2;
      });
      const cosine = dot / (Math.sqrt(leftSquared) * Math.sqrt(rightSquared));

      expect(cosine).toBeLessThan(0.95); // Should be meaningfully different
    });
  });
  describe("embedBatch", () => {
    test("returns embeddings for multiple texts", async () => {
      const texts = ["Hello world", "Test text", "Another document"];
      const results = await llm.embedBatch(texts);

      expect(results).toHaveLength(3);
      for (const result of results) {
        expect(result).not.toBeNull();
        expect(result!.embedding.length).toBe(768);
      }
    });

    test("returns same results as individual embed calls", async () => {
      const texts = ["cats are great", "dogs are awesome"];

      const batchResults = await llm.embedBatch(texts);
      const individualResults = await Promise.all(texts.map(t => llm.embed(t)));

      // Both paths must agree component-wise (to 5 decimal places).
      for (let i = 0; i < texts.length; i++) {
        expect(batchResults[i]).not.toBeNull();
        expect(individualResults[i]).not.toBeNull();
        for (let j = 0; j < batchResults[i]!.embedding.length; j++) {
          expect(batchResults[i]!.embedding[j]).toBeCloseTo(individualResults[i]!.embedding[j]!, 5);
        }
      }
    });

    test("handles empty array", async () => {
      const results = await llm.embedBatch([]);
      expect(results).toHaveLength(0);
    });

    test("batch is faster than sequential", async () => {
      const texts = Array(10).fill(null).map((_, i) => `Document number ${i} with content`);

      // Time batch
      const batchStart = Date.now();
      await llm.embedBatch(texts);
      const batchTime = Date.now() - batchStart;

      // Time sequential
      const seqStart = Date.now();
      for (const text of texts) {
        await llm.embed(text);
      }
      const seqTime = Date.now() - seqStart;

      console.log(`Batch: ${batchTime}ms, Sequential: ${seqTime}ms`);
      // Performance is machine/load dependent. We only assert batch isn't drastically worse.
      expect(batchTime).toBeLessThanOrEqual(seqTime * 3);
    });

    test("handles concurrent embedBatch calls on fresh instance without race condition", async () => {
      // Regression test for a race in which concurrent callers each saw no
      // embedding context yet and each created their own, leaking resources
      // and risking "Context is disposed" errors.
      //
      // See: https://github.com/tobi/qmd/pull/54
      //
      // The fix is a promise guard around context creation. It is verified
      // here by counting calls to the model's createEmbeddingContext.
      const freshLlm = new LlamaCpp({});
      try {
        let contextCreateCount = 0;

        // Wrap ensureEmbedModel so the model it returns is instrumented once.
        const originalEnsureEmbedModel = (freshLlm as any).ensureEmbedModel.bind(freshLlm);
        let modelInstrumented = false;
        (freshLlm as any).ensureEmbedModel = async function() {
          const model = await originalEnsureEmbedModel();
          if (!modelInstrumented) {
            modelInstrumented = true;
            const originalCreate = model.createEmbeddingContext.bind(model);
            model.createEmbeddingContext = async function(...args: any[]) {
              contextCreateCount++;
              return originalCreate(...args);
            };
          }
          return model;
        };

        const texts = Array(10).fill(null).map((_, i) => `Document ${i}`);

        // Five embedBatch calls start in parallel on an instance that has no
        // embedding context yet.
        const batches = await Promise.all([
          freshLlm.embedBatch(texts.slice(0, 2)),
          freshLlm.embedBatch(texts.slice(2, 4)),
          freshLlm.embedBatch(texts.slice(4, 6)),
          freshLlm.embedBatch(texts.slice(6, 8)),
          freshLlm.embedBatch(texts.slice(8, 10)),
        ]);

        const allResults = batches.flat();
        expect(allResults).toHaveLength(10);
        const successCount = allResults.filter(r => r !== null).length;
        expect(successCount).toBe(10);

        // Key assertion: one set of contexts is created in total, not one set
        // per concurrent caller. The original author notes the set size
        // depends on available VRAM (computeParallelism) with a cap of 8, so
        // only the range 1..8 is asserted. A regression would create five
        // times the intended number of contexts.
        console.log(`Context creation count: ${contextCreateCount} (expected: ≤ 8, not 5× duplicated)`);
        expect(contextCreateCount).toBeGreaterThanOrEqual(1);
        expect(contextCreateCount).toBeLessThanOrEqual(8);
      } finally {
        // Dispose even when an assertion above fails, so a failing run does
        // not leave this extra instance (and whatever it loaded) alive.
        await freshLlm.dispose();
      }
    }, 60000);
  });
  describe("rerank", () => {
    // Asserts that every score lies in the closed interval [0, 1].
    const expectScoresInUnitInterval = (items: ReadonlyArray<{ score: number }>): void => {
      for (const { score } of items) {
        expect(score).toBeGreaterThanOrEqual(0);
        expect(score).toBeLessThanOrEqual(1);
      }
    };

    test("scores capital of France question correctly", async () => {
      const documents: RerankDocument[] = [
        { file: "butterflies.txt", text: "Butterflies indeed fly through the garden." },
        { file: "france.txt", text: "The capital of France is Paris." },
        { file: "canada.txt", text: "The capital of Canada is Ottawa." },
      ];

      const { results } = await llm.rerank("What is the capital of France?", documents);
      expect(results).toHaveLength(3);

      const [best, middle, worst] = results;
      // The France document should score highest
      expect(best!.file).toBe("france.txt");
      expect(best!.score).toBeGreaterThan(0.7);
      // Canada should be somewhat relevant (also about capitals)
      expect(middle!.file).toBe("canada.txt");
      // Butterflies should score lowest
      expect(worst!.file).toBe("butterflies.txt");
      expect(worst!.score).toBeLessThan(0.6);
    });

    test("scores authentication query correctly", async () => {
      const documents: RerankDocument[] = [
        { file: "weather.md", text: "The weather today is sunny with mild temperatures." },
        { file: "auth.md", text: "Authentication can be configured by setting the AUTH_SECRET environment variable." },
        { file: "pizza.md", text: "Our restaurant serves the best pizza in town." },
        { file: "jwt.md", text: "JWT authentication requires a secret key and expiration time." },
      ];

      const { results } = await llm.rerank("How do I configure authentication?", documents);
      expect(results).toHaveLength(4);

      const rankedFiles = results.map((entry) => entry.file);

      // The two auth-related documents must occupy the top half, in any order.
      const upperHalf = rankedFiles.slice(0, 2);
      expect(upperHalf).toContain("auth.md");
      expect(upperHalf).toContain("jwt.md");

      // The two unrelated documents must occupy the bottom half.
      const lowerHalf = rankedFiles.slice(2);
      expect(lowerHalf).toContain("weather.md");
      expect(lowerHalf).toContain("pizza.md");
    });

    test("handles programming queries correctly", async () => {
      const documents: RerankDocument[] = [
        { file: "cooking.md", text: "To make a good pasta, boil water and add salt." },
        { file: "errors.md", text: "Use try-catch blocks to handle JavaScript errors gracefully." },
        { file: "python.md", text: "Python uses try-except for exception handling." },
      ];

      const { results } = await llm.rerank("How do I handle errors in JavaScript?", documents);

      // JavaScript errors doc should score highest
      const top = results[0]!;
      expect(top.file).toBe("errors.md");
      expect(top.score).toBeGreaterThan(0.7);

      // The Python doc (same concept, different language) is not pinned to a
      // position; only the cooking doc is required to come last.
      expect(results[2]!.file).toBe("cooking.md");
    });

    test("handles empty document list", async () => {
      const { results } = await llm.rerank("test query", []);
      expect(results).toHaveLength(0);
    });

    test("handles single document", async () => {
      const onlyDoc: RerankDocument = { file: "doc.md", text: "content" };
      const { results } = await llm.rerank("test", [onlyDoc]);

      expect(results).toHaveLength(1);
      expect(results[0]!.file).toBe("doc.md");
    });

    test("preserves original file paths", async () => {
      const documents: RerankDocument[] = [
        { file: "path/to/doc1.md", text: "content one" },
        { file: "another/path/doc2.md", text: "content two" },
      ];

      const { results } = await llm.rerank("query", documents);

      // Sorted so the comparison does not depend on ranking order.
      const returnedPaths = results.map((entry) => entry.file);
      returnedPaths.sort();
      expect(returnedPaths).toEqual(["another/path/doc2.md", "path/to/doc1.md"]);
    });

    test("returns scores between 0 and 1", async () => {
      const documents: RerankDocument[] = [
        { file: "a.md", text: "The quick brown fox jumps over the lazy dog." },
        { file: "b.md", text: "Machine learning algorithms process data efficiently." },
        { file: "c.md", text: "React components use JSX syntax for rendering." },
      ];

      const { results } = await llm.rerank("Tell me about animals", documents);
      expectScoresInUnitInterval(results);
    });

    test("batch reranks multiple documents efficiently", async () => {
      // Create 10 documents to verify batch processing works
      const documents: RerankDocument[] = Array.from({ length: 10 }, (_, i) => ({
        file: `doc${i}.md`,
        text: `Document number ${i} with some content about topic ${i % 3}`,
      }));

      const startedAt = Date.now();
      const { results } = await llm.rerank("topic 1", documents);
      const elapsed = Date.now() - startedAt;

      expect(results).toHaveLength(10);
      // Verify all documents are returned with valid scores
      expectScoresInUnitInterval(results);

      // Log timing for monitoring batch performance
      console.log(`Batch rerank of 10 docs took ${elapsed}ms`);
    });

    test("uses fewer active rerank contexts for small batches", async () => {
      const freshLlm = new LlamaCpp({});

      // Indices of the fake contexts whose rankAll() ran, in call order.
      const usedContextIndices: number[] = [];

      const stubModel = {
        tokenize: (text: string) => Array.from(text),
        detokenize: (tokens: string[]) => tokens.join(""),
      };
      // Four contexts are on offer; each records its index and scores 0.5.
      const stubContexts = [0, 1, 2, 3].map((idx) => ({
        rankAll: async (_query: string, docs: string[]) => {
          usedContextIndices.push(idx);
          return docs.map(() => 0.5);
        },
      }));
      (freshLlm as any).ensureRerankModel = async () => stubModel;
      (freshLlm as any).ensureRerankContexts = async () => stubContexts;

      const documents: RerankDocument[] = [];
      for (let i = 0; i < 20; i++) {
        documents.push({ file: `doc${i}.md`, text: `Document number ${i}` });
      }

      const { results } = await freshLlm.rerank("topic 1", documents);

      expect(results).toHaveLength(20);
      // With 20 documents only the first two of the four contexts are used.
      expect(usedContextIndices).toEqual([0, 1]);
    });

    test("truncates and reranks document exceeding 2048 token context size", async () => {
      // The reranker context is created with contextSize=2048. Documents that
      // exceed the token budget (contextSize - template overhead - query tokens)
      // should be silently truncated rather than crashing.
      const paragraph = [
        "The quick brown fox jumps over the lazy dog near the riverbank. ",
        "Authentication tokens must be validated on every request to ensure security. ",
        "Database queries should use prepared statements to prevent SQL injection attacks. ",
        "The deployment pipeline includes linting, testing, building, and publishing stages. ",
      ].join("");
      // ~320 chars per paragraph, repeat 40 times = ~12800 chars ≈ 3200 tokens
      const longText = paragraph.repeat(40);

      const documents: RerankDocument[] = [
        { file: "short-relevant.md", text: "Authentication can be configured by setting AUTH_SECRET." },
        { file: "long-doc.md", text: longText },
        { file: "short-irrelevant.md", text: "The weather is sunny today." },
      ];

      console.log(`Long doc length: ${longText.length} chars (~${Math.round(longText.length / 4)} tokens)`);
      const { results } = await llm.rerank("How do I configure authentication?", documents);

      // Should return all 3 documents without crashing
      expect(results).toHaveLength(3);

      // All scores should be valid numbers in [0, 1]
      for (const entry of results) {
        expect(entry.score).toBeGreaterThanOrEqual(0);
        expect(entry.score).toBeLessThanOrEqual(1);
        expect(Number.isNaN(entry.score)).toBe(false);
      }

      // The ranking itself is only logged for inspection, not asserted.
      console.log("Rerank results for long doc test:");
      results.forEach((entry) => {
        console.log(` ${entry.file}: ${entry.score.toFixed(4)}`);
      });
    });
  });
  describe("expandQuery", () => {
    test("returns query expansions with correct types", async () => {
      const expansions = await llm.expandQuery("test query");

      // Result is Queryable[] containing lex, vec, and/or hyde entries
      expect(expansions.length).toBeGreaterThanOrEqual(1);

      // Every entry needs a known type and non-empty text.
      const knownTypes = ["lex", "vec", "hyde"];
      for (const entry of expansions) {
        expect(knownTypes).toContain(entry.type);
        expect(entry.text.length).toBeGreaterThan(0);
      }
    }, 30000); // 30s timeout for model loading

    test("can exclude lexical queries", async () => {
      const expansions = await llm.expandQuery("authentication setup", { includeLexical: false });

      // Should not contain any 'lex' type entries
      const lexicalOnly = expansions.filter((entry) => entry.type === "lex");
      expect(lexicalOnly).toHaveLength(0);
    });
  });
  570. });
  571. // =============================================================================
  572. // Session Management Tests
  573. // =============================================================================
  // Skipped when the CI environment variable is set.
  describe.skipIf(!!process.env.CI)("LLM Session Management", () => {
    describe("withLLMSession", () => {
      test("session provides access to LLM operations", async () => {
        const result = await withLLMSession(async (session) => {
          expect(session.isValid).toBe(true);
          const embedding = await session.embed("test text");
          expect(embedding).not.toBeNull();
          expect(embedding!.embedding.length).toBe(768);
          return "success";
        });
        expect(result).toBe("success");
      });

      test("session is invalid after release", async () => {
        // Initialised through an assertion on purpose: with a plain
        // `let x: ILLMSession | null = null`, TypeScript keeps `x` narrowed to
        // `null` after the callback below (assignments inside closures are
        // not tracked), which turns `x!.isValid` into a property access on
        // `never` and fails type-checking.
        let capturedSession = null as ILLMSession | null;
        await withLLMSession(async (session) => {
          capturedSession = session;
          expect(session.isValid).toBe(true);
        });
        // Session should be invalid after withLLMSession returns
        expect(capturedSession).not.toBeNull();
        expect(capturedSession!.isValid).toBe(false);
      });

      test("session prevents idle unload during operations", async () => {
        await withLLMSession(async (session) => {
          // While inside a session, canUnloadLLM should return false
          expect(canUnloadLLM()).toBe(false);
          // Perform an operation
          await session.embed("test");
          // Still should not be able to unload
          expect(canUnloadLLM()).toBe(false);
        });
        // After session ends, should be able to unload
        expect(canUnloadLLM()).toBe(true);
      });

      test("nested sessions increment ref count", async () => {
        await withLLMSession(async (outerSession) => {
          expect(canUnloadLLM()).toBe(false);
          await withLLMSession(async (innerSession) => {
            expect(canUnloadLLM()).toBe(false);
            expect(innerSession.isValid).toBe(true);
            expect(outerSession.isValid).toBe(true);
          });
          // Inner session released, but outer still active
          expect(canUnloadLLM()).toBe(false);
          expect(outerSession.isValid).toBe(true);
        });
        // All sessions released
        expect(canUnloadLLM()).toBe(true);
      });

      test("session embedBatch works correctly", async () => {
        await withLLMSession(async (session) => {
          const texts = ["Hello world", "Test text", "Another document"];
          const results = await session.embedBatch(texts);
          expect(results).toHaveLength(3);
          for (const result of results) {
            expect(result).not.toBeNull();
            expect(result!.embedding.length).toBe(768);
          }
        });
      });

      test("session rerank works correctly", async () => {
        await withLLMSession(async (session) => {
          const documents: RerankDocument[] = [
            { file: "a.txt", text: "The capital of France is Paris." },
            { file: "b.txt", text: "Dogs are great pets." },
          ];
          const result = await session.rerank("What is the capital of France?", documents);
          expect(result.results).toHaveLength(2);
          expect(result.results[0]!.file).toBe("a.txt");
          expect(result.results[0]!.score).toBeGreaterThan(result.results[1]!.score);
        });
      });

      test("max duration aborts session after timeout", async () => {
        let aborted = false;
        try {
          await withLLMSession(async (session) => {
            // Wait longer than max duration
            await new Promise(resolve => setTimeout(resolve, 150));
            // This operation should throw because session was aborted
            await session.embed("test");
          }, { maxDuration: 50 }); // 50ms max
        } catch (err) {
          // Only the expected release error counts as an abort; anything
          // else is a real failure and is rethrown.
          if (err instanceof SessionReleasedError) {
            aborted = true;
          } else {
            throw err;
          }
        }
        expect(aborted).toBe(true);
      }, 5000);

      test("external abort signal propagates to session", async () => {
        const abortController = new AbortController();
        let sessionAborted = false;
        const promise = withLLMSession(async (session) => {
          // Wait a bit then check if aborted
          await new Promise(resolve => setTimeout(resolve, 100));
          if (!session.isValid) {
            sessionAborted = true;
            throw new SessionReleasedError("Session aborted");
          }
          return "should not reach";
        }, { signal: abortController.signal });
        // Abort after 20ms, well before the callback's 100ms wait finishes.
        setTimeout(() => abortController.abort(), 20);
        try {
          await promise;
        } catch (err) {
          // Expected: the callback throws once it sees the session is invalid.
          // The rejection type is deliberately not asserted here; the flag
          // checked below is what proves the abort reached the session.
        }
        expect(sessionAborted).toBe(true);
      }, 5000);

      test("session provides abort signal for monitoring", async () => {
        await withLLMSession(async (session) => {
          expect(session.signal).toBeInstanceOf(AbortSignal);
          expect(session.signal.aborted).toBe(false);
        });
      });

      test("returns value from callback", async () => {
        const result = await withLLMSession(async (session) => {
          await session.embed("test");
          return { status: "complete", count: 42 };
        });
        expect(result).toEqual({ status: "complete", count: 42 });
      });

      test("propagates errors from callback", async () => {
        const customError = new Error("Custom test error");
        await expect(
          withLLMSession(async () => {
            throw customError;
          })
        ).rejects.toThrow("Custom test error");
      });
    });
  });