embed-collection-filter.test.ts 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. /**
  2. * embed-collection-filter.test.ts — Tests for the collection-filter plumbing
  3. * shipped under i-ofojj7dy:
  4. *
  5. * - getPendingEmbeddingDocs(db, collection) filters at the SQL layer
  6. * - getHashesNeedingEmbedding(db, collection) filters at the SQL layer
  7. * - generateEmbeddings({ collection }) only embeds matching docs
  8. *
  9. * Uses a throwaway SQLite index file in a temp directory + stub EmbeddingProvider — no node-llama-cpp.
  10. */
  11. import { describe, test, expect, beforeEach, afterEach } from "vitest";
  12. import { mkdtempSync, rmSync } from "node:fs";
  13. import { tmpdir } from "node:os";
  14. import { join } from "node:path";
  15. import {
  16. createStore,
  17. generateEmbeddings,
  18. getHashesNeedingEmbedding,
  19. type Store,
  20. } from "../src/store.js";
  21. import type {
  22. EmbeddingProvider,
  23. ProviderEmbedding,
  24. ProviderHealth,
  25. } from "../src/embedding/provider.js";
  26. // ─────────────────────────── Stub provider ───────────────────────────────────
  /**
   * Deterministic stand-in for a real embedding backend.
   *
   * It keeps two counters (batch calls and total texts seen) and derives every
   * vector purely from the input text length, so results are reproducible.
   * Per the original author's note, generateEmbeddings hands the provider chunk
   * text only, which is why the tests in this file assert on result counts
   * rather than on per-document collection labels.
   */
  class StubProvider implements EmbeddingProvider {
    readonly kind = "openai" as const;
    readonly modelId: string;
    readonly dim: number;

    /** Number of embedBatch() invocations so far. */
    embedBatchCalls = 0;
    /** Running total of texts passed to embed() and embedBatch(). */
    totalTextsEmbedded = 0;

    /**
     * @param modelId Model identifier echoed back in every result.
     * @param dim Length of each generated vector; defaults to 4.
     */
    constructor(modelId: string, dim = 4) {
      this.modelId = modelId;
      this.dim = dim;
    }

    /** @returns The model identifier given at construction. */
    getModelId(): string {
      return this.modelId;
    }

    /** @returns The vector length given at construction. */
    getDimensions(): number | undefined {
      return this.dim;
    }

    /** Always reports healthy, echoing the configured model and vector length. */
    async healthcheck(): Promise<ProviderHealth> {
      return { ok: true, model: this.modelId, dimensions: this.dim };
    }

    /** Embeds one text and bumps the text counter by one. */
    async embed(text: string): Promise<ProviderEmbedding | null> {
      this.totalTextsEmbedded += 1;
      const vector = this.fakeEmbed(text);
      return { embedding: vector, model: this.modelId };
    }

    /**
     * Embeds every text in order, counting one batch call and
     * `texts.length` additional texts.
     */
    async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
      this.embedBatchCalls += 1;
      this.totalTextsEmbedded += texts.length;
      const model = this.modelId;
      return texts.map((chunk) => {
        const vector = this.fakeEmbed(chunk);
        return { embedding: vector, model };
      });
    }

    /** No resources are held, so disposal is a no-op. */
    async dispose(): Promise<void> {}

    /** Builds a `dim`-length vector whose i-th entry is (text.length + i) * 0.01. */
    private fakeEmbed(text: string): number[] {
      const base = text.length;
      return Array.from({ length: this.dim }, (_unused, offset) => (base + offset) * 0.01);
    }
  }
  60. // ─────────────────────────── Test setup ──────────────────────────────────────
  // Per-test scratch directory and the store opened inside it.
  let workDir: string;
  let store: Store;
  // Value of INDEX_PATH before the current test overwrote it (undefined when it
  // was not set), so teardown can restore the environment instead of wiping it.
  let savedIndexPathEnv: string | undefined;

  /**
   * Creates a fresh index in a new temp directory and seeds it with three
   * content rows (hashA/hashB/hashC) and one active document per collection:
   * alpha -> a.md, beta -> b.md, gamma -> c.md.
   */
  beforeEach(() => {
    workDir = mkdtempSync(join(tmpdir(), "qmd-embed-filter-test-"));

    // Remember the caller's INDEX_PATH before pointing it at the scratch index.
    savedIndexPathEnv = process.env.INDEX_PATH;
    const indexPath = join(workDir, "index.sqlite");
    process.env.INDEX_PATH = indexPath;
    store = createStore(indexPath);

    const now = "2026-05-13T00:00:00Z";

    // Three distinct content hashes, three distinct collections — one doc each.
    // The body has to be non-empty so chunkDocumentByTokens emits ≥1 chunk/doc.
    const bodies: Record<string, string> = {
      hashA: "Alpha collection body content here that is long enough to chunk.",
      hashB: "Beta collection body text there with different vocabulary to chunk.",
      hashC: "Gamma collection body words yonder packing unique tokens to chunk.",
    };

    // Prepare each INSERT once and reuse it for every row.
    const insertContent = store.db.prepare(
      `INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`,
    );
    for (const [hash, body] of Object.entries(bodies)) {
      insertContent.run(hash, body, now);
    }

    const insertDocument = store.db.prepare(
      `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
    );
    // doc-per-collection mapping; the path doubles as the title.
    const insertDoc = (hash: string, collection: string, path: string): void => {
      insertDocument.run(hash, collection, path, path, now, now, 1);
    };
    insertDoc("hashA", "alpha", "a.md");
    insertDoc("hashB", "beta", "b.md");
    insertDoc("hashC", "gamma", "c.md");
  });

  /**
   * Closes the store (best-effort), restores INDEX_PATH to its pre-test state,
   * and removes the scratch directory.
   */
  afterEach(() => {
    try {
      store.close();
    } catch {
      /* ignore — closing is best-effort so cleanup below still runs */
    }

    // Assigning undefined to process.env would store the string "undefined",
    // so an originally-unset variable must be deleted rather than assigned.
    if (savedIndexPathEnv === undefined) {
      delete process.env.INDEX_PATH;
    } else {
      process.env.INDEX_PATH = savedIndexPathEnv;
    }

    rmSync(workDir, { recursive: true, force: true });
  });
  99. // ─────────────────────────── getHashesNeedingEmbedding ───────────────────────
  // Fixture recap: beforeEach seeds one active document per collection
  // (alpha/hashA, beta/hashB, gamma/hashC) and none of them has been embedded.
  describe("getHashesNeedingEmbedding with collection filter (i-ofojj7dy)", () => {
    test("returns total count when no collection passed", () => {
      const pendingOverall = getHashesNeedingEmbedding(store.db);
      expect(pendingOverall).toBe(3);
    });

    test("returns 1 when filtering to a single-doc collection", () => {
      // Every seeded collection holds exactly one pending hash.
      for (const name of ["alpha", "beta", "gamma"]) {
        expect(getHashesNeedingEmbedding(store.db, name)).toBe(1);
      }
    });

    test("returns 0 when filter does not match any collection", () => {
      const pendingForUnknown = getHashesNeedingEmbedding(store.db, "nonexistent");
      expect(pendingForUnknown).toBe(0);
    });

    test("shared content hash counted per containing collection", () => {
      // Register hashA a second time, this time as a document of "beta".
      const stamp = "2026-05-13T00:00:00Z";
      const insertDocument = store.db.prepare(
        `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
      );
      insertDocument.run("hashA", "beta", "shared.md", "shared", stamp, stamp, 1);

      // Unfiltered: still three distinct pending hashes.
      expect(getHashesNeedingEmbedding(store.db)).toBe(3);
      // beta now references two distinct hashes (hashA + hashB).
      expect(getHashesNeedingEmbedding(store.db, "beta")).toBe(2);
      // alpha continues to reference hashA only.
      expect(getHashesNeedingEmbedding(store.db, "alpha")).toBe(1);
    });

    test("inactive docs are excluded from the filtered count", () => {
      const deactivateBeta = store.db.prepare(
        `UPDATE documents SET active = 0 WHERE collection = 'beta'`,
      );
      deactivateBeta.run();

      expect(getHashesNeedingEmbedding(store.db, "beta")).toBe(0);
      // The other collections keep their pending hash.
      expect(getHashesNeedingEmbedding(store.db, "alpha")).toBe(1);
    });
  });
  136. // ─────────────────────────── generateEmbeddings filter ───────────────────────
  // Each case runs generateEmbeddings against the three-collection fixture
  // with a fresh stub provider and checks the reported counts.
  describe("generateEmbeddings with collection filter (i-ofojj7dy)", () => {
    /** Fresh 4-dimensional stub so counters never leak between tests. */
    const makeProvider = (): StubProvider => new StubProvider("embeddinggemma", 4);

    test("processes only documents in the named collection", async () => {
      const embedProvider = makeProvider();
      const { docsProcessed, chunksEmbedded, errors } = await generateEmbeddings(store, {
        embedProvider,
        collection: "alpha",
        maxDocsPerBatch: 64,
      });

      expect(docsProcessed).toBe(1);
      expect(chunksEmbedded).toBeGreaterThan(0);
      expect(errors).toBe(0);
    });

    test("processes all documents when collection is omitted (legacy path)", async () => {
      const embedProvider = makeProvider();
      const { docsProcessed, errors } = await generateEmbeddings(store, {
        embedProvider,
        maxDocsPerBatch: 64,
      });

      expect(docsProcessed).toBe(3);
      expect(errors).toBe(0);
    });

    test("returns zero-result for unknown collection without throwing", async () => {
      const embedProvider = makeProvider();
      const outcome = await generateEmbeddings(store, {
        embedProvider,
        collection: "ghost",
      });

      // Nothing matches "ghost": every count is zero and the provider is never
      // asked to embed a text.
      expect(outcome.docsProcessed).toBe(0);
      expect(outcome.chunksEmbedded).toBe(0);
      expect(outcome.errors).toBe(0);
      expect(embedProvider.totalTextsEmbedded).toBe(0);
    });

    test("does not embed docs from sibling collections", async () => {
      // Embed beta alone, then confirm the sibling collections are untouched.
      const embedProvider = makeProvider();
      await generateEmbeddings(store, {
        embedProvider,
        collection: "beta",
      });

      // alpha and gamma remain pending; beta has nothing left to embed.
      expect(getHashesNeedingEmbedding(store.db, "alpha")).toBe(1);
      expect(getHashesNeedingEmbedding(store.db, "gamma")).toBe(1);
      expect(getHashesNeedingEmbedding(store.db, "beta")).toBe(0);
    });
  });