embedding-store-integration.test.ts 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. /**
  2. * embedding-store-integration.test.ts - Tests for the
  3. * generateEmbeddings() / EmbeddingProvider integration in store.ts.
  4. *
  5. * Uses an on-disk SQLite index in a per-test temp directory + a stub
  6. * EmbeddingProvider to avoid loading node-llama-cpp models. Verifies:
  7. * - Provider's embedBatch is called when options.embedProvider is set
  8. * - Model-id guard throws ModelMismatchError on mismatch
  9. * - Force re-embed bypasses the guard
  10. * - getDistinctEmbeddingModels reads content_vectors correctly
  11. */
  12. import { describe, test, expect, beforeEach, afterEach } from "vitest";
  13. import { mkdtempSync, rmSync } from "node:fs";
  14. import { tmpdir } from "node:os";
  15. import { join } from "node:path";
  16. import {
  17. createStore,
  18. generateEmbeddings,
  19. getDistinctEmbeddingModels,
  20. insertEmbedding,
  21. type Store,
  22. } from "../src/store.js";
  23. import {
  24. ModelMismatchError,
  25. type EmbeddingProvider,
  26. type ProviderEmbedding,
  27. type ProviderHealth,
  28. } from "../src/embedding/provider.js";
  29. // ─────────────────────────── Stub provider ───────────────────────────────────
  /**
   * Lightweight EmbeddingProvider test double.
   *
   * Every vector it returns is derived purely from the length of the input
   * text, so results are deterministic, and each entry point bumps a public
   * counter that tests can assert on.
   */
  class StubProvider implements EmbeddingProvider {
    readonly kind = "openai" as const;
    readonly modelId: string;
    readonly dim: number;

    /** How many times embedBatch() has been invoked. */
    embedBatchCalls = 0;
    /** How many times embed() has been invoked. */
    embedCalls = 0;
    /** Running total of texts seen by embed() and embedBatch() combined. */
    totalTextsEmbedded = 0;

    /**
     * @param modelId Identifier reported by getModelId() and stamped on every result.
     * @param dim Length of each generated vector (defaults to 4).
     */
    constructor(modelId: string, dim = 4) {
      this.modelId = modelId;
      this.dim = dim;
    }

    /** @returns The model id passed to the constructor. */
    getModelId(): string {
      return this.modelId;
    }

    /** @returns The configured vector length. */
    getDimensions(): number | undefined {
      return this.dim;
    }

    /** Always reports a healthy provider, echoing the configured model and dimensions. */
    async healthcheck(): Promise<ProviderHealth> {
      const { modelId: model, dim: dimensions } = this;
      return { ok: true, model, dimensions };
    }

    /** Embeds one text, counting one call and one embedded text. */
    async embed(text: string): Promise<ProviderEmbedding | null> {
      this.embedCalls += 1;
      this.totalTextsEmbedded += 1;
      const embedding = this.syntheticVector(text);
      return { embedding, model: this.modelId };
    }

    /** Embeds every text in `texts`, counting one batch call and `texts.length` embedded texts. */
    async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
      this.embedBatchCalls += 1;
      this.totalTextsEmbedded += texts.length;
      const toResult = (text: string): ProviderEmbedding => ({
        embedding: this.syntheticVector(text),
        model: this.modelId,
      });
      return texts.map((text) => toResult(text));
    }

    /** No resources are held, so disposal is a no-op. */
    async dispose(): Promise<void> {}

    /** Builds a `dim`-long vector whose i-th entry is `(text.length + i) * 0.01`. */
    private syntheticVector(text: string): number[] {
      const base = text.length;
      return Array.from({ length: this.dim }, (_unused, offset) => (base + offset) * 0.01);
    }
  }
  65. // ─────────────────────────── Test setup ──────────────────────────────────────
  /** Scratch directory that holds the SQLite index for the current test. */
  let workDir: string;
  /** Store under test; re-created from scratch before every test case. */
  let store: Store;
  /** Value of INDEX_PATH before the current test touched it (undefined when unset). */
  let previousIndexPath: string | undefined;

  /**
   * Creates a fresh index file inside a new temp directory, points
   * INDEX_PATH at it, and seeds two active documents in collection "test".
   */
  beforeEach(() => {
    // Remember any pre-existing INDEX_PATH so teardown can put it back.
    previousIndexPath = process.env.INDEX_PATH;

    workDir = mkdtempSync(join(tmpdir(), "qmd-store-int-test-"));
    const indexPath = join(workDir, "index.sqlite");
    process.env.INDEX_PATH = indexPath;
    store = createStore(indexPath);

    // Insert content + documents with the bare-minimum schema. The content
    // body needs to be non-empty so chunkDocumentByTokens emits at least one
    // chunk per doc.
    const now = "2026-04-27T00:00:00Z";
    const fixtures = [
      {
        hash: "hash1",
        path: "one.md",
        title: "One",
        body: "Document one body content here that is long enough to chunk.",
      },
      {
        hash: "hash2",
        path: "two.md",
        title: "Two",
        body: "Document two body content there with different words to chunk.",
      },
    ];

    // All content rows go in first, then all document rows.
    for (const { hash, body } of fixtures) {
      store.db
        .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
        .run(hash, body, now);
    }
    for (const { hash, path, title } of fixtures) {
      store.db
        .prepare(
          `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
        )
        .run(hash, "test", path, title, now, now, 1);
    }
  });

  /**
   * Closes the store, restores INDEX_PATH to its pre-test state, and removes
   * the temp directory.
   */
  afterEach(() => {
    try {
      store.close();
    } catch {
      /* best-effort: a store that is already closed or failed to open must not mask the test result */
    }

    // Restore rather than blindly delete, so an INDEX_PATH configured outside
    // this suite survives. Assigning `undefined` to process.env would store the
    // string "undefined", hence the explicit delete for the unset case.
    if (previousIndexPath === undefined) {
      delete process.env.INDEX_PATH;
    } else {
      process.env.INDEX_PATH = previousIndexPath;
    }

    rmSync(workDir, { recursive: true, force: true });
  });
  100. // ─────────────────────────── getDistinctEmbeddingModels ──────────────────────
  describe("getDistinctEmbeddingModels", () => {
    const createdAt = "2026-04-27T00:00:00Z";

    /**
     * Writes one embedding row for `hash` tagged with `model`, passing 0 for
     * both numeric arguments of insertEmbedding as every fixture here does.
     * `store` is read at call time because it is re-created before each test.
     */
    const seedVector = (hash: string, values: number[], model: string): void => {
      insertEmbedding(store.db, hash, 0, 0, new Float32Array(values), model, createdAt);
    };

    test("returns [] when content_vectors is empty", () => {
      const models = getDistinctEmbeddingModels(store.db);
      expect(models).toEqual([]);
    });

    test("returns distinct model strings", () => {
      store.ensureVecTable(4);
      // Two rows sharing one model id must collapse to a single entry.
      seedVector("hash1", [0.1, 0.2, 0.3, 0.4], "embeddinggemma");
      seedVector("hash2", [0.5, 0.6, 0.7, 0.8], "embeddinggemma");

      const models = getDistinctEmbeddingModels(store.db);
      expect(models).toEqual(["embeddinggemma"]);
    });

    test("returns multiple distinct models when present", () => {
      store.ensureVecTable(4);
      seedVector("hash1", [0.1, 0.2, 0.3, 0.4], "model-a");
      seedVector("hash2", [0.5, 0.6, 0.7, 0.8], "model-b");

      // Sort before comparing so the assertion does not depend on row order.
      const models = getDistinctEmbeddingModels(store.db);
      models.sort();
      expect(models).toEqual(["model-a", "model-b"]);
    });
  });
  119. // ─────────────────────────── generateEmbeddings + provider ───────────────────
  describe("generateEmbeddings with EmbeddingProvider", () => {
    const seededAt = "2026-04-27T00:00:00Z";

    /**
     * Pre-populates content_vectors with a single 4-dim vector for "hash1"
     * tagged with `model`, creating the vector table first.
     */
    function seedHash1Embedding(model: string): void {
      store.ensureVecTable(4);
      insertEmbedding(
        store.db,
        "hash1",
        0,
        0,
        new Float32Array([0.1, 0.2, 0.3, 0.4]),
        model,
        seededAt,
      );
    }

    test("uses provider.embedBatch when supplied", async () => {
      const provider = new StubProvider("embeddinggemma", 4);
      const options = {
        embedProvider: provider,
        // Use small batches to keep test fast
        maxDocsPerBatch: 64,
      };

      const result = await generateEmbeddings(store, options);

      expect(result.docsProcessed).toBe(2);
      expect(result.chunksEmbedded).toBeGreaterThan(0);
      expect(result.errors).toBe(0);
      // Either entry point counts: the provider must have been asked to embed something.
      const providerCalls = provider.embedBatchCalls + provider.embedCalls;
      expect(providerCalls).toBeGreaterThan(0);
      expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
    });

    // Runs under the default 5s timeout again: i-08ovbvtb removed the
    // `withLLMSessionForLlm` wrapper from the provider path, so the earlier
    // 30s bump (commit 058ec1d), a workaround for the cold-cache LLM warm-up,
    // is unnecessary now that the refactor skips that warm-up entirely.
    test("provider mode does not access store.llm (DoD #2, #5 — i-08ovbvtb)", async () => {
      // With `embedProvider` supplied the local LlamaCpp must not be consulted
      // at all: not `embedModelName`, not any other field. A Proxy whose
      // `get` trap throws on every property read enforces that: if
      // `getLlm(store).embedModelName` (or a sibling call site) crept back
      // into the provider path, this test would fail with a clear message.
      const trapEveryRead: ProxyHandler<object> = {
        get(_target, prop) {
          throw new Error(
            `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
          );
        },
      };
      store.llm = new Proxy({}, trapEveryRead) as never;

      const provider = new StubProvider("embeddinggemma", 4);
      const result = await generateEmbeddings(store, { embedProvider: provider });

      expect(result.docsProcessed).toBe(2);
      expect(result.chunksEmbedded).toBeGreaterThan(0);
      expect(result.errors).toBe(0);
      expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
    });

    test("model-id guard throws ModelMismatchError on mismatch", async () => {
      // Existing vectors carry a different model id than the provider reports.
      seedHash1Embedding("old-model");
      const provider = new StubProvider("new-model", 4);

      const attempt = generateEmbeddings(store, { embedProvider: provider });

      await expect(attempt).rejects.toBeInstanceOf(ModelMismatchError);
    });

    test("model-id matches → proceeds", async () => {
      seedHash1Embedding("embeddinggemma");
      const provider = new StubProvider("embeddinggemma", 4);

      const result = await generateEmbeddings(store, { embedProvider: provider });

      // Only hash2 needs embedding (hash1 already has one)
      expect(result.docsProcessed).toBeLessThanOrEqual(2);
      expect(result.errors).toBe(0);
    });

    test("force=true bypasses model-id guard", async () => {
      seedHash1Embedding("old-model");
      const provider = new StubProvider("new-model", 4);

      // force=true wipes content_vectors first → guard sees empty → no throw
      const result = await generateEmbeddings(store, {
        embedProvider: provider,
        force: true,
      });

      expect(result.docsProcessed).toBe(2);
      expect(result.errors).toBe(0);
      // Now only "new-model" should be in the DB
      const modelsAfter = getDistinctEmbeddingModels(store.db);
      expect(modelsAfter).toEqual(["new-model"]);
    });

    test("empty DB → no guard issue, anything goes", async () => {
      const modelsBefore = getDistinctEmbeddingModels(store.db);
      expect(modelsBefore).toEqual([]);

      const provider = new StubProvider("anything-id", 4);
      const result = await generateEmbeddings(store, { embedProvider: provider });

      expect(result.errors).toBe(0);
    });
  });