embedding-store-integration.test.ts 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
/**
 * embedding-store-integration.test.ts - Tests for the
 * generateEmbeddings() / EmbeddingProvider integration in store.ts.
 *
 * Uses a throwaway SQLite index file (created in a per-test temp directory)
 * + a stub EmbeddingProvider, so embeddings are produced by the stub rather
 * than by node-llama-cpp models. Verifies:
 * - Provider's embedBatch is called when options.embedProvider is set
 * - Model-id guard throws ModelMismatchError on mismatch
 * - Force re-embed bypasses the guard
 * - getDistinctEmbeddingModels reads content_vectors correctly
 */
  12. import { describe, test, expect, beforeEach, afterEach } from "vitest";
  13. import { mkdtempSync, rmSync } from "node:fs";
  14. import { tmpdir } from "node:os";
  15. import { join } from "node:path";
  16. import {
  17. createStore,
  18. generateEmbeddings,
  19. getDistinctEmbeddingModels,
  20. insertEmbedding,
  21. type Store,
  22. } from "../src/store.js";
  23. import {
  24. ModelMismatchError,
  25. type EmbeddingProvider,
  26. type ProviderEmbedding,
  27. type ProviderHealth,
  28. } from "../src/embedding/provider.js";
  29. // ─────────────────────────── Stub provider ───────────────────────────────────
  30. class StubProvider implements EmbeddingProvider {
  31. readonly kind = "openai" as const;
  32. readonly modelId: string;
  33. readonly dim: number;
  34. embedBatchCalls = 0;
  35. embedCalls = 0;
  36. totalTextsEmbedded = 0;
  37. constructor(modelId: string, dim = 4) {
  38. this.modelId = modelId;
  39. this.dim = dim;
  40. }
  41. getModelId(): string {
  42. return this.modelId;
  43. }
  44. getDimensions(): number | undefined {
  45. return this.dim;
  46. }
  47. async healthcheck(): Promise<ProviderHealth> {
  48. return { ok: true, model: this.modelId, dimensions: this.dim };
  49. }
  50. async embed(text: string): Promise<ProviderEmbedding | null> {
  51. this.embedCalls++;
  52. this.totalTextsEmbedded++;
  53. return { embedding: this.fakeEmbed(text), model: this.modelId };
  54. }
  55. async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
  56. this.embedBatchCalls++;
  57. this.totalTextsEmbedded += texts.length;
  58. return texts.map((t) => ({ embedding: this.fakeEmbed(t), model: this.modelId }));
  59. }
  60. async dispose(): Promise<void> {}
  61. private fakeEmbed(text: string): number[] {
  62. return Array.from({ length: this.dim }, (_, i) => (text.length + i) * 0.01);
  63. }
  64. }
  65. // ─────────────────────────── Test setup ──────────────────────────────────────
  66. let workDir: string;
  67. let store: Store;
  68. beforeEach(() => {
  69. workDir = mkdtempSync(join(tmpdir(), "qmd-store-int-test-"));
  70. process.env.INDEX_PATH = join(workDir, "index.sqlite");
  71. store = createStore(process.env.INDEX_PATH);
  72. // Insert content + documents with the bare-minimum schema. The content
  73. // body needs to be non-empty so chunkDocumentByTokens emits at least one
  74. // chunk per doc.
  75. const now = "2026-04-27T00:00:00Z";
  76. store.db
  77. .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
  78. .run("hash1", "Document one body content here that is long enough to chunk.", now);
  79. store.db
  80. .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
  81. .run("hash2", "Document two body content there with different words to chunk.", now);
  82. store.db
  83. .prepare(
  84. `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
  85. )
  86. .run("hash1", "test", "one.md", "One", now, now, 1);
  87. store.db
  88. .prepare(
  89. `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
  90. )
  91. .run("hash2", "test", "two.md", "Two", now, now, 1);
  92. });
  93. afterEach(() => {
  94. try {
  95. store.close();
  96. } catch { /* ignore */ }
  97. delete process.env.INDEX_PATH;
  98. rmSync(workDir, { recursive: true, force: true });
  99. });
  100. // ─────────────────────────── getDistinctEmbeddingModels ──────────────────────
  101. describe("getDistinctEmbeddingModels", () => {
  102. test("returns [] when content_vectors is empty", () => {
  103. expect(getDistinctEmbeddingModels(store.db)).toEqual([]);
  104. });
  105. test("returns distinct model strings", () => {
  106. store.ensureVecTable(4);
  107. insertEmbedding(store.db, "hash1", 0, 0, new Float32Array([0.1, 0.2, 0.3, 0.4]), "embeddinggemma", "2026-04-27T00:00:00Z");
  108. insertEmbedding(store.db, "hash2", 0, 0, new Float32Array([0.5, 0.6, 0.7, 0.8]), "embeddinggemma", "2026-04-27T00:00:00Z");
  109. expect(getDistinctEmbeddingModels(store.db)).toEqual(["embeddinggemma"]);
  110. });
  111. test("returns multiple distinct models when present", () => {
  112. store.ensureVecTable(4);
  113. insertEmbedding(store.db, "hash1", 0, 0, new Float32Array([0.1, 0.2, 0.3, 0.4]), "model-a", "2026-04-27T00:00:00Z");
  114. insertEmbedding(store.db, "hash2", 0, 0, new Float32Array([0.5, 0.6, 0.7, 0.8]), "model-b", "2026-04-27T00:00:00Z");
  115. const models = getDistinctEmbeddingModels(store.db).sort();
  116. expect(models).toEqual(["model-a", "model-b"]);
  117. });
  118. });
  119. // ─────────────────────────── generateEmbeddings + provider ───────────────────
  120. describe("generateEmbeddings with EmbeddingProvider", () => {
  121. test("uses provider.embedBatch when supplied", async () => {
  122. const provider = new StubProvider("embeddinggemma", 4);
  123. const result = await generateEmbeddings(store, {
  124. embedProvider: provider,
  125. // Use small batches to keep test fast
  126. maxDocsPerBatch: 64,
  127. });
  128. expect(result.docsProcessed).toBe(2);
  129. expect(result.chunksEmbedded).toBeGreaterThan(0);
  130. expect(result.errors).toBe(0);
  131. expect(provider.embedBatchCalls + provider.embedCalls).toBeGreaterThan(0);
  132. expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
  133. }, 30000); // Cold-cache llama-cpp init can take >5s on first session call.
  134. // Provider short-circuits embed calls (line 1494-1499 of store.ts) but the
  135. // outer `withLLMSessionForLlm` wrapper still warms the LLM. DoD #9 (skip
  136. // LLM init when provider supplied) is a follow-up refactor.
  137. test("model-id guard throws ModelMismatchError on mismatch", async () => {
  138. // Pre-populate content_vectors with a different model id
  139. store.ensureVecTable(4);
  140. insertEmbedding(
  141. store.db,
  142. "hash1",
  143. 0,
  144. 0,
  145. new Float32Array([0.1, 0.2, 0.3, 0.4]),
  146. "old-model",
  147. "2026-04-27T00:00:00Z",
  148. );
  149. const provider = new StubProvider("new-model", 4);
  150. await expect(
  151. generateEmbeddings(store, { embedProvider: provider }),
  152. ).rejects.toBeInstanceOf(ModelMismatchError);
  153. });
  154. test("model-id matches → proceeds", async () => {
  155. store.ensureVecTable(4);
  156. insertEmbedding(
  157. store.db,
  158. "hash1",
  159. 0,
  160. 0,
  161. new Float32Array([0.1, 0.2, 0.3, 0.4]),
  162. "embeddinggemma",
  163. "2026-04-27T00:00:00Z",
  164. );
  165. const provider = new StubProvider("embeddinggemma", 4);
  166. const result = await generateEmbeddings(store, { embedProvider: provider });
  167. // Only hash2 needs embedding (hash1 already has one)
  168. expect(result.docsProcessed).toBeLessThanOrEqual(2);
  169. expect(result.errors).toBe(0);
  170. });
  171. test("force=true bypasses model-id guard", async () => {
  172. store.ensureVecTable(4);
  173. insertEmbedding(
  174. store.db,
  175. "hash1",
  176. 0,
  177. 0,
  178. new Float32Array([0.1, 0.2, 0.3, 0.4]),
  179. "old-model",
  180. "2026-04-27T00:00:00Z",
  181. );
  182. const provider = new StubProvider("new-model", 4);
  183. // force=true wipes content_vectors first → guard sees empty → no throw
  184. const result = await generateEmbeddings(store, {
  185. embedProvider: provider,
  186. force: true,
  187. });
  188. expect(result.docsProcessed).toBe(2);
  189. expect(result.errors).toBe(0);
  190. // Now only "new-model" should be in the DB
  191. expect(getDistinctEmbeddingModels(store.db)).toEqual(["new-model"]);
  192. });
  193. test("empty DB → no guard issue, anything goes", async () => {
  194. expect(getDistinctEmbeddingModels(store.db)).toEqual([]);
  195. const provider = new StubProvider("anything-id", 4);
  196. const result = await generateEmbeddings(store, { embedProvider: provider });
  197. expect(result.errors).toBe(0);
  198. });
  199. });