|
@@ -0,0 +1,203 @@
|
|
|
|
|
/**
 * embed-collection-filter.test.ts — Tests for the collection-filter plumbing
 * shipped under i-ofojj7dy:
 *
 *   - getPendingEmbeddingDocs(db, collection) filters at the SQL layer
 *   - getHashesNeedingEmbedding(db, collection) filters at the SQL layer
 *   - generateEmbeddings({ collection }) only embeds matching docs
 *
 * Uses a throwaway SQLite index file in a per-test temp directory + a stub
 * EmbeddingProvider — no node-llama-cpp.
 */
|
|
|
|
|
+
|
|
|
|
|
+import { describe, test, expect, beforeEach, afterEach } from "vitest";
|
|
|
|
|
+import { mkdtempSync, rmSync } from "node:fs";
|
|
|
|
|
+import { tmpdir } from "node:os";
|
|
|
|
|
+import { join } from "node:path";
|
|
|
|
|
+
|
|
|
|
|
+import {
|
|
|
|
|
+ createStore,
|
|
|
|
|
+ generateEmbeddings,
|
|
|
|
|
+ getHashesNeedingEmbedding,
|
|
|
|
|
+ type Store,
|
|
|
|
|
+} from "../src/store.js";
|
|
|
|
|
+import type {
|
|
|
|
|
+ EmbeddingProvider,
|
|
|
|
|
+ ProviderEmbedding,
|
|
|
|
|
+ ProviderHealth,
|
|
|
|
|
+} from "../src/embedding/provider.js";
|
|
|
|
|
+
|
|
|
|
|
+// ─────────────────────────── Stub provider ───────────────────────────────────
|
|
|
|
|
+
|
|
|
|
|
/**
 * Deterministic EmbeddingProvider stand-in for tests.
 *
 * Vectors are derived purely from the input text's length, so two texts of
 * equal length produce identical embeddings. The provider keeps simple call
 * counters; the tests in this file assert on generateEmbeddings' result
 * counts and on `totalTextsEmbedded`.
 */
class StubProvider implements EmbeddingProvider {
  readonly kind = "openai" as const;
  readonly modelId: string;
  readonly dim: number;

  /** How many times embedBatch() has been invoked. */
  embedBatchCalls = 0;

  /** Running total of texts received through embed() and embedBatch(). */
  totalTextsEmbedded = 0;

  /**
   * @param modelId Model identifier echoed back in every result.
   * @param dim     Length of each generated vector (defaults to 4).
   */
  constructor(modelId: string, dim = 4) {
    this.modelId = modelId;
    this.dim = dim;
  }

  /** @returns The model identifier supplied at construction. */
  getModelId(): string {
    return this.modelId;
  }

  /** @returns The vector length supplied at construction. */
  getDimensions(): number | undefined {
    return this.dim;
  }

  /** Always reports healthy, echoing the configured model and dimensions. */
  async healthcheck(): Promise<ProviderHealth> {
    const { modelId: model, dim: dimensions } = this;
    return { ok: true, model, dimensions };
  }

  /** Embeds a single text and counts it towards `totalTextsEmbedded`. */
  async embed(text: string): Promise<ProviderEmbedding | null> {
    this.totalTextsEmbedded += 1;
    const embedding = this.fakeEmbed(text);
    return { embedding, model: this.modelId };
  }

  /**
   * Embeds every text in order. Counts one batch call and `texts.length`
   * texts; never yields a null entry.
   */
  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
    this.embedBatchCalls += 1;
    this.totalTextsEmbedded += texts.length;
    return texts.map((entry) => {
      const embedding = this.fakeEmbed(entry);
      return { embedding, model: this.modelId };
    });
  }

  /** Nothing to release. */
  async dispose(): Promise<void> {}

  /** Builds a `dim`-length vector whose i-th component is (text.length + i) * 0.01. */
  private fakeEmbed(text: string): number[] {
    const base = text.length;
    return Array.from({ length: this.dim }, (_unused, offset) => (base + offset) * 0.01);
  }
}
|
|
|
|
|
+
|
|
|
|
|
+// ─────────────────────────── Test setup ──────────────────────────────────────
|
|
|
|
|
+
|
|
|
|
|
/** Scratch directory that holds the per-test SQLite index; recreated for every test. */
let workDir: string;

/** Store opened on the scratch index in beforeEach and closed in afterEach. */
let store: Store;

/**
 * Value of process.env.INDEX_PATH before the current test overwrote it
 * (undefined when it was unset), so afterEach can put the environment back
 * exactly as it found it instead of deleting a caller-provided setting.
 */
let previousIndexPath: string | undefined;

/**
 * Per-test fixture: a fresh temp directory and store, seeded with three
 * content rows (hashA/hashB/hashC) and one active document per collection
 * (alpha/beta/gamma). No embeddings are inserted by the fixture.
 */
beforeEach(() => {
  previousIndexPath = process.env.INDEX_PATH;

  workDir = mkdtempSync(join(tmpdir(), "qmd-embed-filter-test-"));
  const indexPath = join(workDir, "index.sqlite");
  // NOTE(review): INDEX_PATH is exported in addition to being passed to
  // createStore — presumably other code reads it; confirm before removing.
  process.env.INDEX_PATH = indexPath;
  store = createStore(indexPath);

  const now = "2026-05-13T00:00:00Z";

  // Three distinct content hashes, three distinct collections — one doc each.
  // The body has to be non-empty so chunkDocumentByTokens emits ≥1 chunk/doc.
  const bodies: Record<string, string> = {
    hashA: "Alpha collection body content here that is long enough to chunk.",
    hashB: "Beta collection body text there with different vocabulary to chunk.",
    hashC: "Gamma collection body words yonder packing unique tokens to chunk.",
  };

  // Prepare each statement once and reuse it for every row.
  const insertContent = store.db.prepare(
    `INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`,
  );
  for (const [hash, body] of Object.entries(bodies)) {
    insertContent.run(hash, body, now);
  }

  // doc-per-collection mapping; the path doubles as the title.
  const insertDocument = store.db.prepare(
    `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
  );
  const insertDoc = (hash: string, collection: string, path: string): void => {
    insertDocument.run(hash, collection, path, path, now, now, 1);
  };
  insertDoc("hashA", "alpha", "a.md");
  insertDoc("hashB", "beta", "b.md");
  insertDoc("hashC", "gamma", "c.md");
});

/**
 * Per-test teardown: close the store (best-effort), restore INDEX_PATH to its
 * pre-test value, and remove the scratch directory.
 */
afterEach(() => {
  try {
    store.close();
  } catch {
    /* best-effort: a store that failed to open or is already closed is fine */
  }

  // Assigning undefined to process.env stores the string "undefined", so an
  // originally-unset variable must be deleted rather than assigned.
  if (previousIndexPath === undefined) {
    delete process.env.INDEX_PATH;
  } else {
    process.env.INDEX_PATH = previousIndexPath;
  }

  rmSync(workDir, { recursive: true, force: true });
});
|
|
|
|
|
+
|
|
|
|
|
+// ─────────────────────────── getHashesNeedingEmbedding ───────────────────────
|
|
|
|
|
+
|
|
|
|
|
/**
 * Count-level checks for getHashesNeedingEmbedding, with and without the
 * optional collection argument. Each test starts from the beforeEach fixture:
 * one active document in each of alpha/beta/gamma, each pointing at its own
 * content hash.
 */
describe("getHashesNeedingEmbedding with collection filter (i-ofojj7dy)", () => {
  test("returns total count when no collection passed", () => {
    const pendingOverall = getHashesNeedingEmbedding(store.db);
    expect(pendingOverall).toBe(3);
  });

  test("returns 1 when filtering to a single-doc collection", () => {
    for (const name of ["alpha", "beta", "gamma"]) {
      expect(getHashesNeedingEmbedding(store.db, name)).toBe(1);
    }
  });

  test("returns 0 when filter does not match any collection", () => {
    const pendingForMissing = getHashesNeedingEmbedding(store.db, "nonexistent");
    expect(pendingForMissing).toBe(0);
  });

  test("shared content hash counted per containing collection", () => {
    // Give "beta" a second document whose content is alpha's hashA.
    const timestamp = "2026-05-13T00:00:00Z";
    const insertDocument = store.db.prepare(
      `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
    );
    insertDocument.run("hashA", "beta", "shared.md", "shared", timestamp, timestamp, 1);

    // Unfiltered: hashes are counted once each, so the total stays at 3.
    const pendingOverall = getHashesNeedingEmbedding(store.db);
    expect(pendingOverall).toBe(3);

    // Filtered to beta: it now references two distinct hashes (hashA + hashB).
    const pendingInBeta = getHashesNeedingEmbedding(store.db, "beta");
    expect(pendingInBeta).toBe(2);

    // Filtered to alpha: still only hashA.
    const pendingInAlpha = getHashesNeedingEmbedding(store.db, "alpha");
    expect(pendingInAlpha).toBe(1);
  });

  test("inactive docs are excluded from the filtered count", () => {
    const deactivateBeta = store.db.prepare(
      `UPDATE documents SET active = 0 WHERE collection = 'beta'`,
    );
    deactivateBeta.run();

    const pendingInBeta = getHashesNeedingEmbedding(store.db, "beta");
    expect(pendingInBeta).toBe(0);

    // Deactivating beta must not change what alpha reports.
    const pendingInAlpha = getHashesNeedingEmbedding(store.db, "alpha");
    expect(pendingInAlpha).toBe(1);
  });
});
|
|
|
|
|
+
|
|
|
|
|
+// ─────────────────────────── generateEmbeddings filter ───────────────────────
|
|
|
|
|
+
|
|
|
|
|
/**
 * End-to-end checks that generateEmbeddings honours the `collection` option,
 * driven by a StubProvider so no real embedding model is loaded. Each test
 * starts from the beforeEach fixture (one pending doc per collection).
 */
describe("generateEmbeddings with collection filter (i-ofojj7dy)", () => {
  // A new stub per test so its call counters always start at zero.
  const freshProvider = (): StubProvider => new StubProvider("embeddinggemma", 4);

  test("processes only documents in the named collection", async () => {
    const embedProvider = freshProvider();
    const { docsProcessed, chunksEmbedded, errors } = await generateEmbeddings(store, {
      embedProvider,
      collection: "alpha",
      maxDocsPerBatch: 64,
    });

    expect(docsProcessed).toBe(1);
    expect(chunksEmbedded).toBeGreaterThan(0);
    expect(errors).toBe(0);
  });

  test("processes all documents when collection is omitted (legacy path)", async () => {
    const embedProvider = freshProvider();
    const { docsProcessed, errors } = await generateEmbeddings(store, {
      embedProvider,
      maxDocsPerBatch: 64,
    });

    expect(docsProcessed).toBe(3);
    expect(errors).toBe(0);
  });

  test("returns zero-result for unknown collection without throwing", async () => {
    const embedProvider = freshProvider();
    const outcome = await generateEmbeddings(store, {
      embedProvider,
      collection: "ghost",
    });

    // Nothing matches "ghost": every counter is zero and the provider was
    // never handed any text.
    expect(outcome.docsProcessed).toBe(0);
    expect(outcome.chunksEmbedded).toBe(0);
    expect(outcome.errors).toBe(0);
    expect(embedProvider.totalTextsEmbedded).toBe(0);
  });

  test("does not embed docs from sibling collections", async () => {
    // Run the embedder for beta alone, then inspect what each collection
    // still reports as pending.
    const embedProvider = freshProvider();
    await generateEmbeddings(store, {
      embedProvider,
      collection: "beta",
    });

    const pendingInAlpha = getHashesNeedingEmbedding(store.db, "alpha");
    expect(pendingInAlpha).toBe(1);

    const pendingInGamma = getHashesNeedingEmbedding(store.db, "gamma");
    expect(pendingInGamma).toBe(1);

    // beta was the target, so nothing in it should be left pending.
    const pendingInBeta = getHashesNeedingEmbedding(store.db, "beta");
    expect(pendingInBeta).toBe(0);
  });
});
|