/**
 * embed-collection-filter.test.ts — Tests for the collection-filter plumbing
 * shipped under i-ofojj7dy:
 *
 *   - getPendingEmbeddingDocs(db, collection) filters at the SQL layer
 *   - getHashesNeedingEmbedding(db, collection) filters at the SQL layer
 *   - generateEmbeddings({ collection }) only embeds matching docs
 *
 * Uses a throwaway SQLite index file in a per-test temp directory plus a stub
 * EmbeddingProvider — no node-llama-cpp.
 */

import { describe, test, expect, beforeEach, afterEach } from "vitest";
import { mkdtempSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import {
  createStore,
  generateEmbeddings,
  getHashesNeedingEmbedding,
  type Store,
} from "../src/store.js";
import type {
  EmbeddingProvider,
  ProviderEmbedding,
  ProviderHealth,
} from "../src/embedding/provider.js";

// ─────────────────────────── Stub provider ───────────────────────────────────

/**
 * Deterministic, dependency-free EmbeddingProvider used in place of a real
 * model. Vectors are derived purely from the input text length, and the
 * instance counts how much work it was asked to do.
 */
class StubProvider implements EmbeddingProvider {
  readonly kind = "openai" as const;
  readonly modelId: string;
  readonly dim: number;

  // The provider methods only ever receive chunk text — never collection
  // labels — so the tests assert on the result counts returned by
  // generateEmbeddings (and on these counters) rather than on captured input.
  embedBatchCalls = 0;
  totalTextsEmbedded = 0;

  /**
   * @param modelId Model identifier echoed back in every embedding/health result.
   * @param dim Length of each generated vector (defaults to 4).
   */
  constructor(modelId: string, dim = 4) {
    this.modelId = modelId;
    this.dim = dim;
  }

  getModelId(): string {
    return this.modelId;
  }

  getDimensions(): number | undefined {
    return this.dim;
  }

  /** Always reports healthy, echoing the configured model and dimensions. */
  async healthcheck(): Promise<ProviderHealth> {
    return { ok: true, model: this.modelId, dimensions: this.dim };
  }

  /** Embeds a single text and bumps the text counter by one. */
  async embed(text: string): Promise<ProviderEmbedding> {
    this.totalTextsEmbedded++;
    return { embedding: this.fakeEmbed(text), model: this.modelId };
  }

  /** Embeds every text in the batch; never yields a null entry. */
  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
    this.embedBatchCalls++;
    this.totalTextsEmbedded += texts.length;
    return texts.map((t) => ({ embedding: this.fakeEmbed(t), model: this.modelId }));
  }

  /** Nothing to release. */
  async dispose(): Promise<void> {}

  /** Builds a `dim`-length vector whose i-th entry is (text.length + i) * 0.01. */
  private fakeEmbed(text: string): number[] {
    return Array.from({ length: this.dim }, (_, i) => (text.length + i) * 0.01);
  }
}

// ─────────────────────────── Test setup ──────────────────────────────────────

/** Timestamp written to every created_at / modified_at column in the fixture. */
const FIXTURE_TIMESTAMP = "2026-05-13T00:00:00Z";

let workDir: string;
let store: Store;
/** INDEX_PATH as it was before the current test, so afterEach can put it back. */
let previousIndexPath: string | undefined;

/**
 * Inserts one active row into `documents` in the current test store.
 *
 * @param hash Content hash the document points at.
 * @param collection Collection the document belongs to.
 * @param path Document path within the collection.
 * @param title Document title; defaults to `path`.
 */
function insertDoc(hash: string, collection: string, path: string, title: string = path): void {
  store.db
    .prepare(
      `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
    )
    .run(hash, collection, path, title, FIXTURE_TIMESTAMP, FIXTURE_TIMESTAMP, 1);
}

beforeEach(() => {
  workDir = mkdtempSync(join(tmpdir(), "qmd-embed-filter-test-"));

  // Remember any pre-existing value so the test does not clobber it for good.
  previousIndexPath = process.env.INDEX_PATH;
  const indexPath = join(workDir, "index.sqlite");
  process.env.INDEX_PATH = indexPath;
  store = createStore(indexPath);

  // Three distinct content hashes, three distinct collections — one doc each.
  // The body has to be non-empty so chunkDocumentByTokens emits ≥1 chunk/doc.
  const bodies: Record<string, string> = {
    hashA: "Alpha collection body content here that is long enough to chunk.",
    hashB: "Beta collection body text there with different vocabulary to chunk.",
    hashC: "Gamma collection body words yonder packing unique tokens to chunk.",
  };
  for (const [hash, body] of Object.entries(bodies)) {
    store.db
      .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
      .run(hash, body, FIXTURE_TIMESTAMP);
  }

  // doc-per-collection mapping
  insertDoc("hashA", "alpha", "a.md");
  insertDoc("hashB", "beta", "b.md");
  insertDoc("hashC", "gamma", "c.md");
});

afterEach(() => {
  // Best-effort close: a test may already have closed (or never opened) the store.
  try {
    store.close();
  } catch {
    /* ignore */
  }

  // Assigning `undefined` to an env var would store the string "undefined",
  // so an originally-unset variable has to be deleted instead.
  if (previousIndexPath === undefined) {
    delete process.env.INDEX_PATH;
  } else {
    process.env.INDEX_PATH = previousIndexPath;
  }

  rmSync(workDir, { recursive: true, force: true });
});

// ─────────────────────────── getHashesNeedingEmbedding ───────────────────────

describe("getHashesNeedingEmbedding with collection filter (i-ofojj7dy)", () => {
  test("returns total count when no collection passed", () => {
    expect(getHashesNeedingEmbedding(store.db)).toBe(3);
  });

  test("returns 1 when filtering to a single-doc collection", () => {
    expect(getHashesNeedingEmbedding(store.db, "alpha")).toBe(1);
    expect(getHashesNeedingEmbedding(store.db, "beta")).toBe(1);
    expect(getHashesNeedingEmbedding(store.db, "gamma")).toBe(1);
  });

  test("returns 0 when filter does not match any collection", () => {
    expect(getHashesNeedingEmbedding(store.db, "nonexistent")).toBe(0);
  });

  test("shared content hash counted per containing collection", () => {
    // Add a second doc that re-uses hashA but in collection "beta".
    insertDoc("hashA", "beta", "shared.md", "shared");

    // Without filter, the DISTINCT count of pending hashes is still 3.
    expect(getHashesNeedingEmbedding(store.db)).toBe(3);
    // With filter, beta now contains 2 distinct hashes (hashA + hashB).
    expect(getHashesNeedingEmbedding(store.db, "beta")).toBe(2);
    // Alpha still owns just hashA.
    expect(getHashesNeedingEmbedding(store.db, "alpha")).toBe(1);
  });

  test("inactive docs are excluded from the filtered count", () => {
    store.db
      .prepare(`UPDATE documents SET active = 0 WHERE collection = 'beta'`)
      .run();

    expect(getHashesNeedingEmbedding(store.db, "beta")).toBe(0);
    // Other collections unaffected
    expect(getHashesNeedingEmbedding(store.db, "alpha")).toBe(1);
  });
});

// ─────────────────────────── generateEmbeddings filter ───────────────────────

describe("generateEmbeddings with collection filter (i-ofojj7dy)", () => {
  test("processes only documents in the named collection", async () => {
    const provider = new StubProvider("embeddinggemma", 4);

    const result = await generateEmbeddings(store, {
      embedProvider: provider,
      collection: "alpha",
      maxDocsPerBatch: 64,
    });

    expect(result.docsProcessed).toBe(1);
    expect(result.chunksEmbedded).toBeGreaterThan(0);
    expect(result.errors).toBe(0);
  });

  test("processes all documents when collection is omitted (legacy path)", async () => {
    const provider = new StubProvider("embeddinggemma", 4);

    const result = await generateEmbeddings(store, {
      embedProvider: provider,
      maxDocsPerBatch: 64,
    });

    expect(result.docsProcessed).toBe(3);
    expect(result.errors).toBe(0);
  });

  test("returns zero-result for unknown collection without throwing", async () => {
    const provider = new StubProvider("embeddinggemma", 4);

    const result = await generateEmbeddings(store, {
      embedProvider: provider,
      collection: "ghost",
    });

    // No docs to embed → returns early with the empty-result shape
    expect(result.docsProcessed).toBe(0);
    expect(result.chunksEmbedded).toBe(0);
    expect(result.errors).toBe(0);
    expect(provider.totalTextsEmbedded).toBe(0);
  });

  test("does not embed docs from sibling collections", async () => {
    // Embed only beta; verify alpha + gamma are STILL pending afterward.
    const provider = new StubProvider("embeddinggemma", 4);

    await generateEmbeddings(store, {
      embedProvider: provider,
      collection: "beta",
    });

    // alpha + gamma still need embeddings, beta does not
    expect(getHashesNeedingEmbedding(store.db, "alpha")).toBe(1);
    expect(getHashesNeedingEmbedding(store.db, "gamma")).toBe(1);
    expect(getHashesNeedingEmbedding(store.db, "beta")).toBe(0);
  });
});