/**
 * embedding-store-integration.test.ts - Tests for the
 * generateEmbeddings() / EmbeddingProvider integration in store.ts.
 *
 * Uses a per-test SQLite index created under a fresh temp directory plus a
 * stub EmbeddingProvider to avoid loading node-llama-cpp models. Verifies:
 * - Provider's embedBatch is called when options.embedProvider is set
 * - Model-id guard throws ModelMismatchError on mismatch
 * - Force re-embed bypasses the guard
 * - getDistinctEmbeddingModels reads content_vectors correctly
 */

import { describe, test, expect, beforeEach, afterEach, vi } from "vitest";
import { mkdtempSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";

// Mock the llm.js module so `getDefaultLlamaCpp` (the only function
// `chunkDocumentByTokens` reaches into when no `tokenizer` is supplied)
// throws on call. This is the strongest possible assertion of DoD #1
// for i-1rqixh6m: provider-mode embed runs MUST never load node-llama-cpp.
//
// Vitest hoists this `vi.mock` above the `import` lines below, and
// since the module replacement applies to ALL importers (including
// `store.js`), any leaked call from `chunkDocumentByTokens` (or any
// sibling code path) into `getDefaultLlamaCpp` will throw a clear
// "DoD violation" error and fail the test.
vi.mock("../src/llm.js", async (importOriginal) => {
  // The type argument keeps `actual` typed as the real module so the spread
  // below type-checks (spreading `unknown` is a compile error).
  const actual = await importOriginal<typeof import("../src/llm.js")>();
  return {
    ...actual,
    getDefaultLlamaCpp: vi.fn(() => {
      throw new Error(
        "getDefaultLlamaCpp() invoked when embedProvider was supplied — " +
          "DoD #1 violation (i-1rqixh6m). Provider-mode embed must not load node-llama-cpp.",
      );
    }),
  };
});

import {
  createStore,
  generateEmbeddings,
  getDistinctEmbeddingModels,
  insertEmbedding,
  type Store,
} from "../src/store.js";
import {
  ModelMismatchError,
  type EmbeddingProvider,
  type ProviderEmbedding,
  type ProviderHealth,
} from "../src/embedding/provider.js";
import * as llmModule from "../src/llm.js";

// ─────────────────────────── Fixtures ──────────────────────────────────────────

/** Vector width used by every stub provider and seeded embedding in this file. */
const STUB_DIM = 4;

/** Fixed timestamp so fixture rows are identical from run to run. */
const FIXTURE_TIMESTAMP = "2026-04-27T00:00:00Z";

const VECTOR_A = [0.1, 0.2, 0.3, 0.4];
const VECTOR_B = [0.5, 0.6, 0.7, 0.8];

/** One pre-existing embedding row to seed before the code under test runs. */
interface SeedRow {
  hash: string;
  model: string;
  vector: readonly number[];
}

// ─────────────────────────── Stub provider ───────────────────────────────────

/**
 * Deterministic in-process EmbeddingProvider.
 *
 * Produces fake vectors without any I/O and counts how often `embed` and
 * `embedBatch` are invoked so tests can assert the provider was used.
 */
class StubProvider implements EmbeddingProvider {
  readonly kind = "openai" as const;
  readonly modelId: string;
  readonly dim: number;

  /** Number of `embedBatch` invocations. */
  embedBatchCalls = 0;
  /** Number of `embed` invocations. */
  embedCalls = 0;
  /** Total texts received across `embed` and `embedBatch`. */
  totalTextsEmbedded = 0;

  constructor(modelId: string, dim = 4) {
    this.modelId = modelId;
    this.dim = dim;
  }

  getModelId(): string {
    return this.modelId;
  }

  getDimensions(): number | undefined {
    return this.dim;
  }

  async healthcheck(): Promise<ProviderHealth> {
    return { ok: true, model: this.modelId, dimensions: this.dim };
  }

  async embed(text: string): Promise<ProviderEmbedding> {
    this.embedCalls++;
    this.totalTextsEmbedded++;
    return { embedding: this.fakeEmbed(text), model: this.modelId };
  }

  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
    this.embedBatchCalls++;
    this.totalTextsEmbedded += texts.length;
    return texts.map((t) => ({ embedding: this.fakeEmbed(t), model: this.modelId }));
  }

  async dispose(): Promise<void> {}

  /** Component `i` of the fake vector is `(text.length + i) * 0.01`. */
  private fakeEmbed(text: string): number[] {
    return Array.from({ length: this.dim }, (_, i) => (text.length + i) * 0.01);
  }
}

// ─────────────────────────── Test setup ──────────────────────────────────────

let workDir: string;
let store: Store;
/** Value of INDEX_PATH before the current test, restored in afterEach. */
let previousIndexPath: string | undefined;

/**
 * Ensures the vector table exists at STUB_DIM and inserts the given
 * pre-existing embedding rows.
 *
 * The two zeros are forwarded positionally exactly as every call site in this
 * file did — presumably chunk sequence and position; confirm against
 * `insertEmbedding` in store.ts.
 */
function seedEmbeddings(rows: readonly SeedRow[]): void {
  store.ensureVecTable(STUB_DIM);
  for (const { hash, model, vector } of rows) {
    insertEmbedding(
      store.db,
      hash,
      0,
      0,
      new Float32Array(vector),
      model,
      FIXTURE_TIMESTAMP,
    );
  }
}

beforeEach(() => {
  previousIndexPath = process.env.INDEX_PATH;
  workDir = mkdtempSync(join(tmpdir(), "qmd-store-int-test-"));
  const indexPath = join(workDir, "index.sqlite");
  process.env.INDEX_PATH = indexPath;
  store = createStore(indexPath);

  // Insert content + documents with the bare-minimum schema. The content
  // body needs to be non-empty so chunkDocumentByTokens emits at least one
  // chunk per doc.
  const fixtures = [
    {
      hash: "hash1",
      doc: "Document one body content here that is long enough to chunk.",
      path: "one.md",
      title: "One",
    },
    {
      hash: "hash2",
      doc: "Document two body content there with different words to chunk.",
      path: "two.md",
      title: "Two",
    },
  ];

  const insertContent = store.db.prepare(
    `INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`,
  );
  const insertDocument = store.db.prepare(
    `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active)
       VALUES (?, ?, ?, ?, ?, ?, ?)`,
  );

  // All content rows go in before any document row, matching the original
  // insertion order.
  for (const { hash, doc } of fixtures) {
    insertContent.run(hash, doc, FIXTURE_TIMESTAMP);
  }
  for (const { hash, path, title } of fixtures) {
    insertDocument.run(hash, "test", path, title, FIXTURE_TIMESTAMP, FIXTURE_TIMESTAMP, 1);
  }
});

afterEach(() => {
  try {
    store.close();
  } catch {
    /* ignore */
  }

  // Put INDEX_PATH back the way we found it instead of always deleting it, so
  // a value set by the surrounding environment survives this suite.
  if (previousIndexPath === undefined) {
    delete process.env.INDEX_PATH;
  } else {
    process.env.INDEX_PATH = previousIndexPath;
  }

  rmSync(workDir, { recursive: true, force: true });

  // Reset call history on the mocked getDefaultLlamaCpp between tests so
  // each test gets a clean ledger to assert against.
  vi.mocked(llmModule.getDefaultLlamaCpp).mockClear();
});

// ─────────────────────────── getDistinctEmbeddingModels ──────────────────────

describe("getDistinctEmbeddingModels", () => {
  test("returns [] when content_vectors is empty", () => {
    expect(getDistinctEmbeddingModels(store.db)).toEqual([]);
  });

  test("returns distinct model strings", () => {
    seedEmbeddings([
      { hash: "hash1", model: "embeddinggemma", vector: VECTOR_A },
      { hash: "hash2", model: "embeddinggemma", vector: VECTOR_B },
    ]);

    expect(getDistinctEmbeddingModels(store.db)).toEqual(["embeddinggemma"]);
  });

  test("returns multiple distinct models when present", () => {
    seedEmbeddings([
      { hash: "hash1", model: "model-a", vector: VECTOR_A },
      { hash: "hash2", model: "model-b", vector: VECTOR_B },
    ]);

    // Sort a copy so the comparison does not depend on row order and the
    // array returned by the store is left untouched.
    const models = [...getDistinctEmbeddingModels(store.db)].sort();
    expect(models).toEqual(["model-a", "model-b"]);
  });
});

// ─────────────────────────── generateEmbeddings + provider ───────────────────

describe("generateEmbeddings with EmbeddingProvider", () => {
  test("uses provider.embedBatch when supplied", async () => {
    const provider = new StubProvider("embeddinggemma", STUB_DIM);

    const result = await generateEmbeddings(store, {
      embedProvider: provider,
      // Use small batches to keep test fast
      maxDocsPerBatch: 64,
    });

    expect(result.docsProcessed).toBe(2);
    expect(result.chunksEmbedded).toBeGreaterThan(0);
    expect(result.errors).toBe(0);
    expect(provider.embedBatchCalls + provider.embedCalls).toBeGreaterThan(0);
    expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
  });

  // Default 5s timeout restored after i-08ovbvtb removed the
  // `withLLMSessionForLlm` wrapper from the provider path. The previous
  // 30s bump (commit 058ec1d) was a workaround for the cold-cache LLM
  // warm-up that the refactor now skips entirely.
  test("provider mode does not access store.llm (DoD #2, #5 — i-08ovbvtb)", async () => {
    // When `embedProvider` is supplied, the refactor must NOT consult the
    // local LlamaCpp at all — neither `embedModelName` nor any other field.
    // We assert this by setting `store.llm` to a Proxy that throws on any
    // property access. If `getLlm(store).embedModelName` (or any sibling
    // call site) regressed back into the provider path, the test would
    // fail with a clear error message.
    const throwingLlm = new Proxy(
      {},
      {
        get(_target, prop) {
          throw new Error(
            `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
          );
        },
      },
    ) as never;
    store.llm = throwingLlm;

    const provider = new StubProvider("embeddinggemma", STUB_DIM);
    const result = await generateEmbeddings(store, { embedProvider: provider });

    expect(result.docsProcessed).toBe(2);
    expect(result.chunksEmbedded).toBeGreaterThan(0);
    expect(result.errors).toBe(0);
    expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
  });

  test("provider mode does not call getDefaultLlamaCpp (DoD #3 — i-1rqixh6m)", async () => {
    // Stronger assertion than the `store.llm` Proxy above: when the
    // chunker or any sibling code path falls back to the *global*
    // `getDefaultLlamaCpp()` singleton (the previous warm-up source
    // inside `chunkDocumentByTokens`), the module-level mock at the top
    // of this file would throw — so a successful run is itself proof of
    // compliance. We additionally assert call count = 0 for clarity.
    const spy = vi.mocked(llmModule.getDefaultLlamaCpp);
    expect(spy).not.toHaveBeenCalled();

    const provider = new StubProvider("embeddinggemma", STUB_DIM);
    const result = await generateEmbeddings(store, { embedProvider: provider });

    expect(result.docsProcessed).toBe(2);
    expect(result.chunksEmbedded).toBeGreaterThan(0);
    expect(result.errors).toBe(0);
    expect(provider.totalTextsEmbedded).toBeGreaterThan(0);

    // The hard assertion: not a single call to the local LlamaCpp
    // singleton during the entire embed run. If `chunkDocumentByTokens`
    // (or any sibling) regresses and reaches `getDefaultLlamaCpp()` on
    // the provider path, this test fails with a clear DoD-violation
    // message — and the run itself would have already thrown.
    expect(spy).not.toHaveBeenCalled();
  });

  test("model-id guard throws ModelMismatchError on mismatch", async () => {
    // Pre-populate content_vectors with a different model id
    seedEmbeddings([{ hash: "hash1", model: "old-model", vector: VECTOR_A }]);

    const provider = new StubProvider("new-model", STUB_DIM);

    await expect(
      generateEmbeddings(store, { embedProvider: provider }),
    ).rejects.toBeInstanceOf(ModelMismatchError);
  });

  test("model-id matches → proceeds", async () => {
    seedEmbeddings([{ hash: "hash1", model: "embeddinggemma", vector: VECTOR_A }]);

    const provider = new StubProvider("embeddinggemma", STUB_DIM);
    const result = await generateEmbeddings(store, { embedProvider: provider });

    // Only hash2 needs embedding (hash1 already has one)
    expect(result.docsProcessed).toBeLessThanOrEqual(2);
    expect(result.errors).toBe(0);
  });

  test("force=true bypasses model-id guard", async () => {
    seedEmbeddings([{ hash: "hash1", model: "old-model", vector: VECTOR_A }]);

    const provider = new StubProvider("new-model", STUB_DIM);

    // force=true wipes content_vectors first → guard sees empty → no throw
    const result = await generateEmbeddings(store, {
      embedProvider: provider,
      force: true,
    });

    expect(result.docsProcessed).toBe(2);
    expect(result.errors).toBe(0);

    // Now only "new-model" should be in the DB
    expect(getDistinctEmbeddingModels(store.db)).toEqual(["new-model"]);
  });

  test("empty DB → no guard issue, anything goes", async () => {
    expect(getDistinctEmbeddingModels(store.db)).toEqual([]);

    const provider = new StubProvider("anything-id", STUB_DIM);
    const result = await generateEmbeddings(store, { embedProvider: provider });

    expect(result.errors).toBe(0);
  });
});