embedding-store-integration.test.ts 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
  1. /**
  2. * embedding-store-integration.test.ts - Tests for the
  3. * generateEmbeddings() / EmbeddingProvider integration in store.ts.
  4. *
  5. * Uses an in-memory SQLite + a stub EmbeddingProvider to avoid loading
  6. * node-llama-cpp models. Verifies:
  7. * - Provider's embedBatch is called when options.embedProvider is set
  8. * - Model-id guard throws ModelMismatchError on mismatch
  9. * - Force re-embed bypasses the guard
  10. * - getDistinctEmbeddingModels reads content_vectors correctly
  11. */
  12. import { describe, test, expect, beforeEach, afterEach, vi } from "vitest";
  13. import { mkdtempSync, rmSync } from "node:fs";
  14. import { tmpdir } from "node:os";
  15. import { join } from "node:path";
  // Swap out `getDefaultLlamaCpp` in the llm.js module for a spy that throws
  // as soon as it is invoked. It is the only function `chunkDocumentByTokens`
  // reaches into when no `tokenizer` is supplied, so this is the strongest
  // possible assertion of DoD #1 for i-1rqixh6m: provider-mode embed runs
  // MUST never load node-llama-cpp.
  //
  // Vitest hoists this `vi.mock` call above the `import` lines that follow,
  // and the replacement module is what ALL importers see (including
  // `store.js`). Any leaked call from `chunkDocumentByTokens` (or a sibling
  // code path) into `getDefaultLlamaCpp` therefore throws a clear
  // "DoD violation" error and fails the test.
  vi.mock("../src/llm.js", async (importOriginal) => {
    // Everything except the singleton accessor keeps its real implementation.
    const realLlm = await importOriginal<typeof import("../src/llm.js")>();

    // Built inside the factory on purpose: the hoisted factory cannot rely on
    // bindings declared in the surrounding module scope.
    const failOnLlamaLoad = vi.fn(() => {
      throw new Error(
        "getDefaultLlamaCpp() invoked when embedProvider was supplied — " +
          "DoD #1 violation (i-1rqixh6m). Provider-mode embed must not load node-llama-cpp.",
      );
    });

    return { ...realLlm, getDefaultLlamaCpp: failOnLlamaLoad };
  });
  38. import {
  39. createStore,
  40. generateEmbeddings,
  41. getDistinctEmbeddingModels,
  42. insertEmbedding,
  43. type Store,
  44. } from "../src/store.js";
  45. import {
  46. ModelMismatchError,
  47. type EmbeddingProvider,
  48. type ProviderEmbedding,
  49. type ProviderHealth,
  50. } from "../src/embedding/provider.js";
  51. import * as llmModule from "../src/llm.js";
  52. // ─────────────────────────── Stub provider ───────────────────────────────────
  /**
   * Deterministic in-memory `EmbeddingProvider` used instead of a real model.
   *
   * Each call is tallied on public counters so tests can assert how the
   * provider was driven. Vectors are derived purely from the input text length,
   * so identical inputs always produce identical embeddings.
   */
  class StubProvider implements EmbeddingProvider {
    readonly kind = "openai" as const;
    readonly modelId: string;
    readonly dim: number;

    /** Number of `embedBatch()` invocations. */
    embedBatchCalls = 0;
    /** Number of `embed()` invocations. */
    embedCalls = 0;
    /** Total texts seen across `embed()` and `embedBatch()`. */
    totalTextsEmbedded = 0;

    /**
     * @param modelId Model identifier reported by the stub and stamped on every embedding.
     * @param dim Length of each generated vector (defaults to 4).
     */
    constructor(modelId: string, dim = 4) {
      this.dim = dim;
      this.modelId = modelId;
    }

    /** @returns The model identifier passed to the constructor. */
    getModelId(): string {
      return this.modelId;
    }

    /** @returns The configured vector length. */
    getDimensions(): number | undefined {
      return this.dim;
    }

    /** @returns An always-healthy status echoing the model id and dimensions. */
    async healthcheck(): Promise<ProviderHealth> {
      const { modelId, dim } = this;
      return { ok: true, model: modelId, dimensions: dim };
    }

    /**
     * Embeds a single text and records the call.
     * @param text Text to embed.
     * @returns The fake embedding tagged with this stub's model id.
     */
    async embed(text: string): Promise<ProviderEmbedding | null> {
      this.embedCalls += 1;
      this.totalTextsEmbedded += 1;
      return this.wrap(text);
    }

    /**
     * Embeds every text of a batch, in order, and records the call.
     * @param texts Texts to embed.
     * @returns One embedding per input text, in input order.
     */
    async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
      this.embedBatchCalls += 1;
      this.totalTextsEmbedded += texts.length;
      const embedded: (ProviderEmbedding | null)[] = [];
      for (const entry of texts) {
        embedded.push(this.wrap(entry));
      }
      return embedded;
    }

    /** No resources are held, so disposing is a no-op. */
    async dispose(): Promise<void> {}

    /** Pairs the fake vector for `text` with this stub's model id. */
    private wrap(text: string): ProviderEmbedding {
      return { embedding: this.fakeEmbed(text), model: this.modelId };
    }

    /** Builds a `dim`-length vector whose i-th entry is `(text.length + i) * 0.01`. */
    private fakeEmbed(text: string): number[] {
      return Array.from({ length: this.dim }).map((_slot, offset) => (text.length + offset) * 0.01);
    }
  }
  88. // ─────────────────────────── Test setup ──────────────────────────────────────
  // Scratch directory and store handle, recreated for every test.
  let workDir: string;
  let store: Store;

  /**
   * Creates a fresh temp directory, points INDEX_PATH at an index file inside
   * it, opens a store on that file and seeds two documents ("hash1"/"hash2")
   * in the "test" collection.
   */
  beforeEach(() => {
    workDir = mkdtempSync(join(tmpdir(), "qmd-store-int-test-"));
    const indexPath = join(workDir, "index.sqlite");
    process.env.INDEX_PATH = indexPath;
    store = createStore(indexPath);

    // Insert content + documents with the bare-minimum schema. Bodies must be
    // non-empty so chunkDocumentByTokens emits at least one chunk per doc.
    const now = "2026-04-27T00:00:00Z";
    const fixtures = [
      {
        hash: "hash1",
        body: "Document one body content here that is long enough to chunk.",
        path: "one.md",
        title: "One",
      },
      {
        hash: "hash2",
        body: "Document two body content there with different words to chunk.",
        path: "two.md",
        title: "Two",
      },
    ];

    // Content rows first, then the document rows that reference them by hash.
    const insertContent = store.db.prepare(
      `INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`,
    );
    for (const { hash, body } of fixtures) {
      insertContent.run(hash, body, now);
    }

    const insertDocument = store.db.prepare(
      `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
    );
    for (const { hash, path, title } of fixtures) {
      insertDocument.run(hash, "test", path, title, now, now, 1);
    }
  });
  /**
   * Per-test teardown: closes the store, clears INDEX_PATH, removes the temp
   * directory and resets the call history of the mocked `getDefaultLlamaCpp`.
   *
   * The mock reset lives in a `finally` so that a failure while removing the
   * temp directory cannot leave stale call history behind for later tests.
   */
  afterEach(() => {
    try {
      try {
        store.close();
      } catch {
        // Best-effort: the store may already be closed or unusable (one test
        // replaces `store.llm` with a Proxy that throws on any access).
      }
      delete process.env.INDEX_PATH;
      rmSync(workDir, { recursive: true, force: true });
    } finally {
      // Reset call history on the mocked getDefaultLlamaCpp between tests so
      // each test gets a clean ledger to assert against. `vi.mocked` only
      // narrows the type; it returns the very same mock function.
      vi.mocked(llmModule.getDefaultLlamaCpp).mockClear();
    }
  });
  127. // ─────────────────────────── getDistinctEmbeddingModels ──────────────────────
  // Checks that getDistinctEmbeddingModels reports the model ids stored with
  // embeddings, for an empty table, a single model and several models.
  describe("getDistinctEmbeddingModels", () => {
    /** Stores one 4-dimensional embedding for `hash` (seq 0, pos 0) under `model`. */
    const seedEmbedding = (hash: string, values: number[], model: string): void => {
      insertEmbedding(store.db, hash, 0, 0, new Float32Array(values), model, "2026-04-27T00:00:00Z");
    };

    /** Creates the 4-dim vector table and embeds both fixture hashes. */
    const seedBothHashes = (modelForHash1: string, modelForHash2: string): void => {
      store.ensureVecTable(4);
      seedEmbedding("hash1", [0.1, 0.2, 0.3, 0.4], modelForHash1);
      seedEmbedding("hash2", [0.5, 0.6, 0.7, 0.8], modelForHash2);
    };

    test("returns [] when content_vectors is empty", () => {
      const found = getDistinctEmbeddingModels(store.db);
      expect(found).toEqual([]);
    });

    test("returns distinct model strings", () => {
      seedBothHashes("embeddinggemma", "embeddinggemma");
      const found = getDistinctEmbeddingModels(store.db);
      expect(found).toEqual(["embeddinggemma"]);
    });

    test("returns multiple distinct models when present", () => {
      seedBothHashes("model-a", "model-b");
      // Sorted so the expectation does not depend on row order.
      const found = getDistinctEmbeddingModels(store.db);
      found.sort();
      expect(found).toEqual(["model-a", "model-b"]);
    });
  });
  146. // ─────────────────────────── generateEmbeddings + provider ───────────────────
  // Exercises generateEmbeddings() when an EmbeddingProvider is supplied:
  // provider usage, isolation from the local LlamaCpp, and the model-id guard.
  describe("generateEmbeddings with EmbeddingProvider", () => {
    /**
     * Pre-populates content_vectors with one 4-dim embedding for "hash1"
     * recorded under `model`, so the model-id guard has something to compare.
     */
    const seedHash1Embedding = (model: string): void => {
      store.ensureVecTable(4);
      const vector = new Float32Array([0.1, 0.2, 0.3, 0.4]);
      insertEmbedding(store.db, "hash1", 0, 0, vector, model, "2026-04-27T00:00:00Z");
    };

    test("uses provider.embedBatch when supplied", async () => {
      const provider = new StubProvider("embeddinggemma", 4);
      const options = {
        embedProvider: provider,
        // Use small batches to keep test fast
        maxDocsPerBatch: 64,
      };

      const outcome = await generateEmbeddings(store, options);

      expect(outcome.docsProcessed).toBe(2);
      expect(outcome.chunksEmbedded).toBeGreaterThan(0);
      expect(outcome.errors).toBe(0);
      const providerCalls = provider.embedBatchCalls + provider.embedCalls;
      expect(providerCalls).toBeGreaterThan(0);
      expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
    });

    // Default 5s timeout restored after i-08ovbvtb removed the
    // `withLLMSessionForLlm` wrapper from the provider path. The previous
    // 30s bump (commit 058ec1d) was a workaround for the cold-cache LLM
    // warm-up that the refactor now skips entirely.
    test("provider mode does not access store.llm (DoD #2, #5 — i-08ovbvtb)", async () => {
      // With `embedProvider` supplied, the local LlamaCpp must not be
      // consulted at all — neither `embedModelName` nor any other field.
      // `store.llm` is swapped for a Proxy whose `get` trap throws on every
      // property access, so a regression such as `getLlm(store).embedModelName`
      // on the provider path fails with a clear error message.
      const explodeOnAccess: ProxyHandler<object> = {
        get(_target, prop) {
          throw new Error(
            `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
          );
        },
      };
      store.llm = new Proxy({}, explodeOnAccess) as never;

      const provider = new StubProvider("embeddinggemma", 4);
      const outcome = await generateEmbeddings(store, { embedProvider: provider });

      expect(outcome.docsProcessed).toBe(2);
      expect(outcome.chunksEmbedded).toBeGreaterThan(0);
      expect(outcome.errors).toBe(0);
      expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
    });

    test("provider mode does not call getDefaultLlamaCpp (DoD #3 — i-1rqixh6m)", async () => {
      // Stronger than the `store.llm` Proxy check: if the chunker or any
      // sibling code path falls back to the *global* `getDefaultLlamaCpp()`
      // singleton (the previous warm-up source inside
      // `chunkDocumentByTokens`), the module-level mock throws — so a
      // successful run is itself proof of compliance. The call count is
      // additionally asserted to be zero for clarity.
      const llamaSpy = vi.mocked(llmModule.getDefaultLlamaCpp);
      expect(llamaSpy).not.toHaveBeenCalled();

      const provider = new StubProvider("embeddinggemma", 4);
      const outcome = await generateEmbeddings(store, { embedProvider: provider });

      expect(outcome.docsProcessed).toBe(2);
      expect(outcome.chunksEmbedded).toBeGreaterThan(0);
      expect(outcome.errors).toBe(0);
      expect(provider.totalTextsEmbedded).toBeGreaterThan(0);

      // The hard assertion: not a single call to the local LlamaCpp
      // singleton during the entire embed run. A regression that reaches
      // `getDefaultLlamaCpp()` on the provider path fails here with a clear
      // DoD-violation message — and the run itself would already have thrown.
      expect(llamaSpy).not.toHaveBeenCalled();
    });

    test("model-id guard throws ModelMismatchError on mismatch", async () => {
      // Pre-populate content_vectors with a different model id
      seedHash1Embedding("old-model");

      const provider = new StubProvider("new-model", 4);
      const run = generateEmbeddings(store, { embedProvider: provider });

      await expect(run).rejects.toBeInstanceOf(ModelMismatchError);
    });

    test("model-id matches → proceeds", async () => {
      seedHash1Embedding("embeddinggemma");

      const provider = new StubProvider("embeddinggemma", 4);
      const outcome = await generateEmbeddings(store, { embedProvider: provider });

      // Only hash2 needs embedding (hash1 already has one)
      expect(outcome.docsProcessed).toBeLessThanOrEqual(2);
      expect(outcome.errors).toBe(0);
    });

    test("force=true bypasses model-id guard", async () => {
      seedHash1Embedding("old-model");

      const provider = new StubProvider("new-model", 4);
      // force=true wipes content_vectors first → guard sees empty → no throw
      const outcome = await generateEmbeddings(store, {
        embedProvider: provider,
        force: true,
      });

      expect(outcome.docsProcessed).toBe(2);
      expect(outcome.errors).toBe(0);
      // Now only "new-model" should be in the DB
      const modelsAfter = getDistinctEmbeddingModels(store.db);
      expect(modelsAfter).toEqual(["new-model"]);
    });

    test("empty DB → no guard issue, anything goes", async () => {
      const modelsBefore = getDistinctEmbeddingModels(store.db);
      expect(modelsBefore).toEqual([]);

      const provider = new StubProvider("anything-id", 4);
      const outcome = await generateEmbeddings(store, { embedProvider: provider });

      expect(outcome.errors).toBe(0);
    });
  });