| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203 |
- /**
- * embed-collection-filter.test.ts — Tests for the collection-filter plumbing
- * shipped under i-ofojj7dy:
- *
- * - getPendingEmbeddingDocs(db, collection) filters at the SQL layer
- * - getHashesNeedingEmbedding(db, collection) filters at the SQL layer
- * - generateEmbeddings({ collection }) only embeds matching docs
- *
- * Uses a throwaway on-disk SQLite index (temp dir) + stub EmbeddingProvider — no node-llama-cpp.
- */
- import { describe, test, expect, beforeEach, afterEach } from "vitest";
- import { mkdtempSync, rmSync } from "node:fs";
- import { tmpdir } from "node:os";
- import { join } from "node:path";
- import {
- createStore,
- generateEmbeddings,
- getHashesNeedingEmbedding,
- type Store,
- } from "../src/store.js";
- import type {
- EmbeddingProvider,
- ProviderEmbedding,
- ProviderHealth,
- } from "../src/embedding/provider.js";
- // ─────────────────────────── Stub provider ───────────────────────────────────
/**
 * Deterministic stand-in for a real EmbeddingProvider.
 *
 * Vectors are derived purely from the length of the input text, so results
 * are reproducible. The two public counters record how often the provider
 * was called and how many texts it was handed.
 */
class StubProvider implements EmbeddingProvider {
  readonly kind = "openai" as const;
  readonly modelId: string;
  readonly dim: number;
  /** Number of embedBatch() invocations so far. */
  embedBatchCalls = 0;
  /** Number of texts received through embed() and embedBatch() combined. */
  totalTextsEmbedded = 0;

  /**
   * @param modelId Model identifier echoed back in every result.
   * @param dim Length of each generated vector (defaults to 4).
   */
  constructor(modelId: string, dim = 4) {
    this.modelId = modelId;
    this.dim = dim;
  }

  getModelId(): string {
    return this.modelId;
  }

  getDimensions(): number | undefined {
    return this.dim;
  }

  /** Always reports a healthy provider with the configured model and size. */
  async healthcheck(): Promise<ProviderHealth> {
    const report = { ok: true, model: this.modelId, dimensions: this.dim };
    return report;
  }

  /** Embeds one text and bumps the text counter by one. */
  async embed(text: string): Promise<ProviderEmbedding | null> {
    this.totalTextsEmbedded += 1;
    return this.resultFor(text);
  }

  /** Embeds every text in order; counts one batch call plus each text. */
  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
    this.embedBatchCalls += 1;
    this.totalTextsEmbedded += texts.length;
    const results: (ProviderEmbedding | null)[] = [];
    for (const entry of texts) {
      results.push(this.resultFor(entry));
    }
    return results;
  }

  /** Nothing to release. */
  async dispose(): Promise<void> {}

  /** Pairs the synthetic vector for `text` with this provider's model id. */
  private resultFor(text: string): ProviderEmbedding {
    return { embedding: this.syntheticVector(text), model: this.modelId };
  }

  /** Builds a `dim`-long vector whose i-th entry is (text.length + i) * 0.01. */
  private syntheticVector(text: string): number[] {
    const base = text.length;
    return Array.from({ length: this.dim }, (_unused, offset) => (base + offset) * 0.01);
  }
}
- // ─────────────────────────── Test setup ──────────────────────────────────────
/** Scratch directory holding the per-test SQLite index file. */
let workDir: string;
/** Store opened on the scratch index; recreated for every test. */
let store: Store;
/** INDEX_PATH value observed before the test ran; put back during teardown. */
let previousIndexPath: string | undefined;

/**
 * Creates a fresh index file in a new temp directory and seeds it with three
 * content rows and three active documents, one per collection
 * (alpha / beta / gamma).
 */
beforeEach(() => {
  // Capture the caller's INDEX_PATH first so teardown can restore it rather
  // than wiping a value that was set outside this suite.
  previousIndexPath = process.env.INDEX_PATH;

  workDir = mkdtempSync(join(tmpdir(), "qmd-embed-filter-test-"));
  const indexPath = join(workDir, "index.sqlite");
  process.env.INDEX_PATH = indexPath;
  store = createStore(indexPath);

  const now = "2026-05-13T00:00:00Z";

  // Three distinct content hashes, three distinct collections — one doc each.
  // The body has to be non-empty so chunkDocumentByTokens emits ≥1 chunk/doc.
  const fixtures = [
    {
      hash: "hashA",
      collection: "alpha",
      path: "a.md",
      body: "Alpha collection body content here that is long enough to chunk.",
    },
    {
      hash: "hashB",
      collection: "beta",
      path: "b.md",
      body: "Beta collection body text there with different vocabulary to chunk.",
    },
    {
      hash: "hashC",
      collection: "gamma",
      path: "c.md",
      body: "Gamma collection body words yonder packing unique tokens to chunk.",
    },
  ];

  // Prepare each statement once and reuse it for every row.
  const insertContent = store.db.prepare(
    `INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`,
  );
  const insertDocument = store.db.prepare(
    `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
  );

  // All content rows go in before any document row, as in the original setup.
  for (const { hash, body } of fixtures) {
    insertContent.run(hash, body, now);
  }
  // The path doubles as the title.
  for (const { hash, collection, path } of fixtures) {
    insertDocument.run(hash, collection, path, path, now, now, 1);
  }
});

/**
 * Closes the store, restores INDEX_PATH to its pre-test state, and removes
 * the scratch directory.
 */
afterEach(() => {
  try {
    store.close();
  } catch {
    // Best-effort: a failure to close must not block the cleanup below.
  }
  if (previousIndexPath === undefined) {
    delete process.env.INDEX_PATH;
  } else {
    process.env.INDEX_PATH = previousIndexPath;
  }
  rmSync(workDir, { recursive: true, force: true });
});
- // ─────────────────────────── getHashesNeedingEmbedding ───────────────────────
/**
 * Checks the pending-hash counts returned by getHashesNeedingEmbedding, with
 * and without a collection argument, against the seeded alpha/beta/gamma data.
 */
describe("getHashesNeedingEmbedding with collection filter (i-ofojj7dy)", () => {
  // Pending count restricted to one collection; reads `store` at call time
  // because it is reassigned before every test.
  const pendingIn = (collection: string) => getHashesNeedingEmbedding(store.db, collection);

  test("returns total count when no collection passed", () => {
    const total = getHashesNeedingEmbedding(store.db);
    expect(total).toBe(3);
  });

  test("returns 1 when filtering to a single-doc collection", () => {
    for (const collection of ["alpha", "beta", "gamma"]) {
      expect(pendingIn(collection)).toBe(1);
    }
  });

  test("returns 0 when filter does not match any collection", () => {
    const missing = pendingIn("nonexistent");
    expect(missing).toBe(0);
  });

  test("shared content hash counted per containing collection", () => {
    // Register hashA a second time, this time under collection "beta".
    const timestamp = "2026-05-13T00:00:00Z";
    const addDocument = store.db.prepare(
      `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
    );
    addDocument.run("hashA", "beta", "shared.md", "shared", timestamp, timestamp, 1);

    // Unfiltered: still three distinct pending hashes.
    expect(getHashesNeedingEmbedding(store.db)).toBe(3);
    // beta now holds two distinct hashes (hashA + hashB).
    expect(pendingIn("beta")).toBe(2);
    // alpha is unchanged and still holds only hashA.
    expect(pendingIn("alpha")).toBe(1);
  });

  test("inactive docs are excluded from the filtered count", () => {
    const deactivateBeta = store.db.prepare(
      `UPDATE documents SET active = 0 WHERE collection = 'beta'`,
    );
    deactivateBeta.run();

    expect(pendingIn("beta")).toBe(0);
    // The other collections keep their pending doc.
    expect(pendingIn("alpha")).toBe(1);
  });
});
- // ─────────────────────────── generateEmbeddings filter ───────────────────────
/**
 * Runs generateEmbeddings with a stub provider and checks that the
 * `collection` option limits which seeded documents get processed.
 */
describe("generateEmbeddings with collection filter (i-ofojj7dy)", () => {
  // Each test gets its own stub so the call counters start at zero.
  const newProvider = () => new StubProvider("embeddinggemma", 4);

  test("processes only documents in the named collection", async () => {
    const { docsProcessed, chunksEmbedded, errors } = await generateEmbeddings(store, {
      embedProvider: newProvider(),
      collection: "alpha",
      maxDocsPerBatch: 64,
    });

    expect(docsProcessed).toBe(1);
    expect(chunksEmbedded).toBeGreaterThan(0);
    expect(errors).toBe(0);
  });

  test("processes all documents when collection is omitted (legacy path)", async () => {
    const { docsProcessed, errors } = await generateEmbeddings(store, {
      embedProvider: newProvider(),
      maxDocsPerBatch: 64,
    });

    expect(docsProcessed).toBe(3);
    expect(errors).toBe(0);
  });

  test("returns zero-result for unknown collection without throwing", async () => {
    const stub = newProvider();
    const outcome = await generateEmbeddings(store, {
      embedProvider: stub,
      collection: "ghost",
    });

    // Nothing matches "ghost": every counter stays at zero and the provider
    // is never handed any text.
    expect(outcome.docsProcessed).toBe(0);
    expect(outcome.chunksEmbedded).toBe(0);
    expect(outcome.errors).toBe(0);
    expect(stub.totalTextsEmbedded).toBe(0);
  });

  test("does not embed docs from sibling collections", async () => {
    // Embed beta only, then confirm alpha and gamma are still pending.
    await generateEmbeddings(store, {
      embedProvider: newProvider(),
      collection: "beta",
    });

    const pendingIn = (collection: string) => getHashesNeedingEmbedding(store.db, collection);
    expect(pendingIn("alpha")).toBe(1);
    expect(pendingIn("gamma")).toBe(1);
    expect(pendingIn("beta")).toBe(0);
  });
});
|