embedding-store-integration.test.ts 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. /**
  2. * embedding-store-integration.test.ts - Tests for the
  3. * generateEmbeddings() / EmbeddingProvider integration in store.ts.
  4. *
  5. * Uses an in-memory SQLite + a stub EmbeddingProvider to avoid loading
  6. * node-llama-cpp models. Verifies:
  7. * - Provider's embedBatch is called when options.embedProvider is set
  8. * - Model-id guard throws ModelMismatchError on mismatch
  9. * - Force re-embed bypasses the guard
  10. * - getDistinctEmbeddingModels reads content_vectors correctly
  11. */
  12. import { describe, test, expect, beforeEach, afterEach } from "vitest";
  13. import { mkdtempSync, rmSync } from "node:fs";
  14. import { tmpdir } from "node:os";
  15. import { join } from "node:path";
  16. import {
  17. createStore,
  18. generateEmbeddings,
  19. getDistinctEmbeddingModels,
  20. insertEmbedding,
  21. type Store,
  22. } from "../src/store.js";
  23. import {
  24. ModelMismatchError,
  25. type EmbeddingProvider,
  26. type ProviderEmbedding,
  27. type ProviderHealth,
  28. } from "../src/embedding/provider.js";
  29. // ─────────────────────────── Stub provider ───────────────────────────────────
  /**
   * Deterministic, in-memory EmbeddingProvider used by the tests in this file.
   *
   * It never loads a model: every embedding is derived purely from the input
   * text's length, so results are reproducible. Three public counters record
   * how the provider was driven so tests can assert on call patterns.
   */
  class StubProvider implements EmbeddingProvider {
    readonly kind = "openai" as const;
    readonly modelId: string;
    readonly dim: number;

    /** How many times embedBatch() has been invoked. */
    embedBatchCalls = 0;
    /** How many times embed() has been invoked. */
    embedCalls = 0;
    /** Running total of texts seen by embed() and embedBatch() combined. */
    totalTextsEmbedded = 0;

    /**
     * @param modelId Model identifier reported by this provider and stamped on every result.
     * @param dim Length of each generated vector (defaults to 4).
     */
    constructor(modelId: string, dim = 4) {
      this.dim = dim;
      this.modelId = modelId;
    }

    /** @returns The model id passed to the constructor. */
    getModelId(): string {
      return this.modelId;
    }

    /** @returns The vector length passed to the constructor. */
    getDimensions(): number | undefined {
      return this.dim;
    }

    /** @returns An always-healthy report echoing this stub's model id and dimensions. */
    async healthcheck(): Promise<ProviderHealth> {
      const report = { ok: true, model: this.modelId, dimensions: this.dim };
      return report;
    }

    /**
     * Embeds a single text and bumps both the embed-call and text counters by one.
     *
     * @param text Text to embed.
     * @returns The fake embedding tagged with this stub's model id (never null).
     */
    async embed(text: string): Promise<ProviderEmbedding | null> {
      this.embedCalls += 1;
      this.totalTextsEmbedded += 1;
      return this.toResult(text);
    }

    /**
     * Embeds every text in one call; counts one batch call plus texts.length texts.
     *
     * @param texts Texts to embed.
     * @returns One result per input, in input order (no null entries).
     */
    async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
      this.embedBatchCalls += 1;
      this.totalTextsEmbedded += texts.length;
      return texts.map((entry) => this.toResult(entry));
    }

    /** No resources are held, so disposal is a no-op. */
    async dispose(): Promise<void> {}

    /** Wraps the fake vector for `text` in the result shape shared by embed() and embedBatch(). */
    private toResult(text: string): ProviderEmbedding {
      return { embedding: this.fakeEmbed(text), model: this.modelId };
    }

    /** Builds a `dim`-long vector whose i-th component is (text.length + i) * 0.01. */
    private fakeEmbed(text: string): number[] {
      const base = text.length;
      return Array.from({ length: this.dim }, (_unused, offset) => (base + offset) * 0.01);
    }
  }
  65. // ─────────────────────────── Test setup ──────────────────────────────────────
  /** Scratch directory holding the SQLite index file for the current test. */
  let workDir: string;
  /** Store opened on the current test's index file; re-created before every test. */
  let store: Store;
  /** INDEX_PATH as it was before the current test overrode it (undefined = was unset). */
  let priorIndexPath: string | undefined;

  /**
   * Per-test setup: creates a fresh temp directory, points INDEX_PATH at an
   * index file inside it, opens a store on that file, and seeds two active
   * documents ("hash1"/one.md and "hash2"/two.md) in the "test" collection.
   */
  beforeEach(() => {
    // Capture first so teardown can restore the caller's environment even if
    // a later setup step throws.
    priorIndexPath = process.env.INDEX_PATH;

    workDir = mkdtempSync(join(tmpdir(), "qmd-store-int-test-"));
    const indexPath = join(workDir, "index.sqlite");
    process.env.INDEX_PATH = indexPath;
    store = createStore(indexPath);

    // Insert content + documents with the bare-minimum schema. The content
    // body needs to be non-empty so chunkDocumentByTokens emits at least one
    // chunk per doc.
    const now = "2026-04-27T00:00:00Z";
    const fixtures = [
      {
        hash: "hash1",
        body: "Document one body content here that is long enough to chunk.",
        path: "one.md",
        title: "One",
      },
      {
        hash: "hash2",
        body: "Document two body content there with different words to chunk.",
        path: "two.md",
        title: "Two",
      },
    ];

    // All content rows go in before any document rows, matching the order the
    // rows were originally inserted in.
    const insertContent = store.db.prepare(
      `INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`,
    );
    for (const fixture of fixtures) {
      insertContent.run(fixture.hash, fixture.body, now);
    }

    const insertDocument = store.db.prepare(
      `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
    );
    for (const fixture of fixtures) {
      insertDocument.run(fixture.hash, "test", fixture.path, fixture.title, now, now, 1);
    }
  });

  /**
   * Per-test teardown: closes the store, puts INDEX_PATH back the way it was
   * found, and removes the temp directory.
   */
  afterEach(() => {
    try {
      store.close();
    } catch {
      // Best-effort: a failed close must not prevent environment restoration
      // or temp-directory cleanup below.
    }

    // Restore rather than blindly delete, so an INDEX_PATH exported by the
    // developer/CI is not lost for whatever runs next in this process.
    // `delete` is required for the unset case: assigning undefined to
    // process.env would store the string "undefined".
    if (priorIndexPath === undefined) {
      delete process.env.INDEX_PATH;
    } else {
      process.env.INDEX_PATH = priorIndexPath;
    }

    rmSync(workDir, { recursive: true, force: true });
  });
  100. // ─────────────────────────── getDistinctEmbeddingModels ──────────────────────
  /**
   * getDistinctEmbeddingModels(): checks the empty case, the case where two
   * stored embeddings share one model id, and the case where they differ.
   */
  describe("getDistinctEmbeddingModels", () => {
    const embeddedAt = "2026-04-27T00:00:00Z";

    /**
     * Calls store.ensureVecTable(4), then stores one 4-component embedding for
     * "hash1" and one for "hash2", each tagged with the given model id.
     */
    const seedBothDocs = (modelForHash1: string, modelForHash2: string): void => {
      store.ensureVecTable(4);
      const firstVector = new Float32Array([0.1, 0.2, 0.3, 0.4]);
      insertEmbedding(store.db, "hash1", 0, 0, firstVector, modelForHash1, embeddedAt);
      const secondVector = new Float32Array([0.5, 0.6, 0.7, 0.8]);
      insertEmbedding(store.db, "hash2", 0, 0, secondVector, modelForHash2, embeddedAt);
    };

    test("returns [] when content_vectors is empty", () => {
      const found = getDistinctEmbeddingModels(store.db);
      expect(found).toEqual([]);
    });

    test("returns distinct model strings", () => {
      seedBothDocs("embeddinggemma", "embeddinggemma");
      const found = getDistinctEmbeddingModels(store.db);
      expect(found).toEqual(["embeddinggemma"]);
    });

    test("returns multiple distinct models when present", () => {
      seedBothDocs("model-a", "model-b");
      // Sorted before comparing so the assertion does not depend on the order
      // in which the models are returned.
      const found = getDistinctEmbeddingModels(store.db);
      found.sort();
      expect(found).toEqual(["model-a", "model-b"]);
    });
  });
  119. // ─────────────────────────── generateEmbeddings + provider ───────────────────
  /**
   * generateEmbeddings() driven by a StubProvider: checks that the provider is
   * actually used, and how the model-id guard reacts to matching, mismatching,
   * forced, and empty pre-existing embedding state.
   */
  describe("generateEmbeddings with EmbeddingProvider", () => {
    /**
     * Calls store.ensureVecTable(4) and stores a single embedding for "hash1"
     * tagged with `model`, simulating an index that was embedded earlier.
     */
    const seedHash1Embedding = (model: string): void => {
      store.ensureVecTable(4);
      const vector = new Float32Array([0.1, 0.2, 0.3, 0.4]);
      insertEmbedding(store.db, "hash1", 0, 0, vector, model, "2026-04-27T00:00:00Z");
    };

    test("uses provider.embedBatch when supplied", async () => {
      const provider = new StubProvider("embeddinggemma", 4);
      // Batch size is pinned explicitly rather than left to the default.
      const options = { embedProvider: provider, maxDocsPerBatch: 64 };

      const result = await generateEmbeddings(store, options);

      expect(result.docsProcessed).toBe(2);
      expect(result.chunksEmbedded).toBeGreaterThan(0);
      expect(result.errors).toBe(0);
      // Either entry point counts as "the provider was used".
      const providerCalls = provider.embedBatchCalls + provider.embedCalls;
      expect(providerCalls).toBeGreaterThan(0);
      expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
    });

    test("model-id guard throws ModelMismatchError on mismatch", async () => {
      // Pre-populate content_vectors with a different model id
      seedHash1Embedding("old-model");
      const provider = new StubProvider("new-model", 4);

      const attempt = generateEmbeddings(store, { embedProvider: provider });

      await expect(attempt).rejects.toBeInstanceOf(ModelMismatchError);
    });

    test("model-id matches → proceeds", async () => {
      seedHash1Embedding("embeddinggemma");
      const provider = new StubProvider("embeddinggemma", 4);

      const result = await generateEmbeddings(store, { embedProvider: provider });

      // Only hash2 is expected to need embedding (hash1 already has one); the
      // bound is deliberately loose and only rules out more than the two docs.
      expect(result.docsProcessed).toBeLessThanOrEqual(2);
      expect(result.errors).toBe(0);
    });

    test("force=true bypasses model-id guard", async () => {
      seedHash1Embedding("old-model");
      const provider = new StubProvider("new-model", 4);

      // force=true wipes content_vectors first → guard sees empty → no throw
      const result = await generateEmbeddings(store, {
        force: true,
        embedProvider: provider,
      });

      expect(result.docsProcessed).toBe(2);
      expect(result.errors).toBe(0);
      // Now only "new-model" should be in the DB
      const remaining = getDistinctEmbeddingModels(store.db);
      expect(remaining).toEqual(["new-model"]);
    });

    test("empty DB → no guard issue, anything goes", async () => {
      // Precondition: nothing has been embedded yet.
      expect(getDistinctEmbeddingModels(store.db)).toEqual([]);
      const provider = new StubProvider("anything-id", 4);

      const result = await generateEmbeddings(store, { embedProvider: provider });

      expect(result.errors).toBe(0);
    });
  });