| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389 |
/**
 * embedding-store-integration.test.ts - Tests for the
 * generateEmbeddings() / EmbeddingProvider integration in store.ts.
 *
 * Uses a throwaway on-disk SQLite index (created in a fresh temp directory
 * for every test) + stub EmbeddingProviders to avoid loading
 * node-llama-cpp models. Verifies:
 *   - Provider's embedBatch is called when options.embedProvider is set
 *   - Provider mode never touches store.llm or getDefaultLlamaCpp()
 *   - Model-id guard throws ModelMismatchError on mismatch
 *   - Force re-embed bypasses the guard
 *   - getDistinctEmbeddingModels reads content_vectors correctly
 *   - The first-chunk dimension probe retries once and, if the retry also
 *     fails, surfaces the provider kind and the upstream cause
 */
- import { describe, test, expect, beforeEach, afterEach, vi } from "vitest";
- import { mkdtempSync, rmSync } from "node:fs";
- import { tmpdir } from "node:os";
- import { join } from "node:path";
// Mock the llm.js module so `getDefaultLlamaCpp` (the only function
// `chunkDocumentByTokens` reaches into when no `tokenizer` is supplied)
// throws on call. This is the strongest possible assertion of DoD #1
// for i-1rqixh6m: provider-mode embed runs MUST never load node-llama-cpp.
//
// Vitest hoists this `vi.mock` above the `import` lines below, and
// since the module replacement applies to ALL importers (including
// `store.js`), any leaked call from `chunkDocumentByTokens` (or any
// sibling code path) into `getDefaultLlamaCpp` will throw a clear
// "DoD violation" error and fail the test.
//
// Every other export of the real module is passed through untouched via
// `importOriginal`, so only the singleton accessor is replaced.
vi.mock("../src/llm.js", async (importOriginal) => {
  const actual = await importOriginal<typeof import("../src/llm.js")>();
  return {
    ...actual,
    getDefaultLlamaCpp: vi.fn(() => {
      throw new Error(
        "getDefaultLlamaCpp() invoked when embedProvider was supplied — " +
          "DoD #1 violation (i-1rqixh6m). Provider-mode embed must not load node-llama-cpp.",
      );
    }),
  };
});
- import {
- createStore,
- generateEmbeddings,
- getDistinctEmbeddingModels,
- insertEmbedding,
- type Store,
- } from "../src/store.js";
- import {
- ModelMismatchError,
- type EmbeddingProvider,
- type ProviderEmbedding,
- type ProviderHealth,
- } from "../src/embedding/provider.js";
- import * as llmModule from "../src/llm.js";
- // ─────────────────────────── Stub provider ───────────────────────────────────
/**
 * Deterministic, in-process stand-in for an embedding provider.
 *
 * It never loads a model or performs I/O: every vector is derived purely
 * from the length of the input text. Call counters are public so tests can
 * assert that the provider path was actually exercised.
 */
class StubProvider implements EmbeddingProvider {
  readonly kind = "openai" as const;
  readonly modelId: string;
  readonly dim: number;

  /** Number of `embedBatch` invocations so far. */
  embedBatchCalls = 0;
  /** Number of single-text `embed` invocations so far. */
  embedCalls = 0;
  /** Total texts seen across `embed` and `embedBatch`. */
  totalTextsEmbedded = 0;

  /**
   * @param modelId Model identifier reported by `getModelId()` and stamped
   *   on every embedding returned.
   * @param dim Length of the vectors produced (defaults to 4).
   */
  constructor(modelId: string, dim = 4) {
    this.modelId = modelId;
    this.dim = dim;
  }

  getModelId(): string {
    return this.modelId;
  }

  getDimensions(): number | undefined {
    return this.dim;
  }

  /** Always reports a healthy provider with the configured model and dimension. */
  async healthcheck(): Promise<ProviderHealth> {
    return { ok: true, model: this.modelId, dimensions: this.dim };
  }

  /** Embeds a single text; counts one call and one text. */
  async embed(text: string): Promise<ProviderEmbedding | null> {
    this.embedCalls++;
    this.totalTextsEmbedded++;
    return { embedding: this.fakeEmbed(text), model: this.modelId };
  }

  /** Embeds every text in order; counts one call and `texts.length` texts. */
  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
    this.embedBatchCalls++;
    this.totalTextsEmbedded += texts.length;
    return texts.map((t) => ({ embedding: this.fakeEmbed(t), model: this.modelId }));
  }

  /** Nothing to release. */
  async dispose(): Promise<void> {}

  /** Builds a `dim`-long vector whose i-th component is `(text.length + i) * 0.01`. */
  private fakeEmbed(text: string): number[] {
    return Array.from({ length: this.dim }, (_, i) => (text.length + i) * 0.01);
  }
}
- // ─────────────────────────── Test setup ──────────────────────────────────────
/** Temp directory that holds the per-test SQLite index. */
let workDir: string;
/** Store under test, recreated for every test. */
let store: Store;
/** Value of `INDEX_PATH` before the test touched it, so it can be restored. */
let previousIndexPath: string | undefined;

// Creates a fresh store in a new temp directory and seeds two active
// documents ("hash1"/"hash2") in collection "test".
beforeEach(() => {
  previousIndexPath = process.env.INDEX_PATH;
  workDir = mkdtempSync(join(tmpdir(), "qmd-store-int-test-"));
  const indexPath = join(workDir, "index.sqlite");
  process.env.INDEX_PATH = indexPath;
  store = createStore(indexPath);

  // Insert content + documents with the bare-minimum schema. The content
  // body needs to be non-empty so chunkDocumentByTokens emits at least one
  // chunk per doc.
  const now = "2026-04-27T00:00:00Z";
  const contentRows = [
    ["hash1", "Document one body content here that is long enough to chunk."],
    ["hash2", "Document two body content there with different words to chunk."],
  ] as const;
  const documentRows = [
    ["hash1", "one.md", "One"],
    ["hash2", "two.md", "Two"],
  ] as const;

  const insertContent = store.db.prepare(
    `INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`,
  );
  for (const [hash, body] of contentRows) {
    insertContent.run(hash, body, now);
  }

  const insertDocument = store.db.prepare(
    `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
  );
  for (const [hash, path, title] of documentRows) {
    insertDocument.run(hash, "test", path, title, now, now, 1);
  }
});

// Tears down the store, restores the environment and resets the mock ledger.
afterEach(() => {
  // Best-effort close: a failure here must not mask the test's own result.
  try {
    store.close();
  } catch {
    /* ignore */
  }

  // Reset call history on the mocked getDefaultLlamaCpp between tests so
  // each test gets a clean ledger to assert against. Done before the
  // filesystem cleanup so a failed rmSync cannot leak call history.
  vi.mocked(llmModule.getDefaultLlamaCpp).mockClear();

  // Put INDEX_PATH back exactly as it was instead of always deleting it.
  if (previousIndexPath === undefined) {
    delete process.env.INDEX_PATH;
  } else {
    process.env.INDEX_PATH = previousIndexPath;
  }

  rmSync(workDir, { recursive: true, force: true });
});
- // ─────────────────────────── getDistinctEmbeddingModels ──────────────────────
describe("getDistinctEmbeddingModels", () => {
  const stamp = "2026-04-27T00:00:00Z";

  // Seeds one 4-dimensional vector row per seeded document, each tagged
  // with the given model id.
  const seedVectors = (modelForHash1: string, modelForHash2: string): void => {
    store.ensureVecTable(4);
    insertEmbedding(store.db, "hash1", 0, 0, new Float32Array([0.1, 0.2, 0.3, 0.4]), modelForHash1, stamp);
    insertEmbedding(store.db, "hash2", 0, 0, new Float32Array([0.5, 0.6, 0.7, 0.8]), modelForHash2, stamp);
  };

  test("returns [] when content_vectors is empty", () => {
    expect(getDistinctEmbeddingModels(store.db)).toEqual([]);
  });

  test("returns distinct model strings", () => {
    seedVectors("embeddinggemma", "embeddinggemma");
    expect(getDistinctEmbeddingModels(store.db)).toEqual(["embeddinggemma"]);
  });

  test("returns multiple distinct models when present", () => {
    seedVectors("model-a", "model-b");
    // Sort a copy: the result order is not asserted, and the array returned
    // by the function under test should not be mutated in place.
    const models = [...getDistinctEmbeddingModels(store.db)].sort();
    expect(models).toEqual(["model-a", "model-b"]);
  });
});
- // ─────────────────────────── generateEmbeddings + provider ───────────────────
describe("generateEmbeddings with EmbeddingProvider", () => {
  // Pre-populates content_vectors with a single 4-dimensional row for
  // "hash1" tagged with `model`, so the model-id guard has something to
  // compare the provider against.
  const seedHash1Vector = (model: string): void => {
    store.ensureVecTable(4);
    insertEmbedding(
      store.db,
      "hash1",
      0,
      0,
      new Float32Array([0.1, 0.2, 0.3, 0.4]),
      model,
      "2026-04-27T00:00:00Z",
    );
  };

  test("uses provider.embedBatch when supplied", async () => {
    const provider = new StubProvider("embeddinggemma", 4);
    const result = await generateEmbeddings(store, {
      embedProvider: provider,
      // Use small batches to keep test fast
      maxDocsPerBatch: 64,
    });

    expect(result.docsProcessed).toBe(2);
    expect(result.chunksEmbedded).toBeGreaterThan(0);
    expect(result.errors).toBe(0);
    expect(provider.embedBatchCalls + provider.embedCalls).toBeGreaterThan(0);
    expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
  });

  // Default 5s timeout restored after i-08ovbvtb removed the
  // `withLLMSessionForLlm` wrapper from the provider path. The previous
  // 30s bump (commit 058ec1d) was a workaround for the cold-cache LLM
  // warm-up that the refactor now skips entirely.
  test("provider mode does not access store.llm (DoD #2, #5 — i-08ovbvtb)", async () => {
    // When `embedProvider` is supplied, the refactor must NOT consult the
    // local LlamaCpp at all — neither `embedModelName` nor any other field.
    // We assert this by setting `store.llm` to a Proxy that throws on any
    // property access. If `getLlm(store).embedModelName` (or any sibling
    // call site) regressed back into the provider path, the test would
    // fail with a clear error message.
    const throwingLlm = new Proxy(
      {},
      {
        get(_target, prop) {
          throw new Error(
            `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
          );
        },
      },
    ) as never;
    store.llm = throwingLlm;

    const provider = new StubProvider("embeddinggemma", 4);
    const result = await generateEmbeddings(store, { embedProvider: provider });

    expect(result.docsProcessed).toBe(2);
    expect(result.chunksEmbedded).toBeGreaterThan(0);
    expect(result.errors).toBe(0);
    expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
  });

  test("provider mode does not call getDefaultLlamaCpp (DoD #3 — i-1rqixh6m)", async () => {
    // Stronger assertion than the `store.llm` Proxy above: when the
    // chunker or any sibling code path falls back to the *global*
    // `getDefaultLlamaCpp()` singleton (the previous warm-up source
    // inside `chunkDocumentByTokens`), the module-level mock at the top
    // of this file would throw — so a successful run is itself proof of
    // compliance. We additionally assert call count = 0 for clarity.
    const spy = vi.mocked(llmModule.getDefaultLlamaCpp);
    expect(spy).not.toHaveBeenCalled();

    const provider = new StubProvider("embeddinggemma", 4);
    const result = await generateEmbeddings(store, { embedProvider: provider });

    expect(result.docsProcessed).toBe(2);
    expect(result.chunksEmbedded).toBeGreaterThan(0);
    expect(result.errors).toBe(0);
    expect(provider.totalTextsEmbedded).toBeGreaterThan(0);

    // The hard assertion: not a single call to the local LlamaCpp
    // singleton during the entire embed run. If `chunkDocumentByTokens`
    // (or any sibling) regresses and reaches `getDefaultLlamaCpp()` on
    // the provider path, this test fails with a clear DoD-violation
    // message — and the run itself would have already thrown.
    expect(spy).not.toHaveBeenCalled();
  });

  test("model-id guard throws ModelMismatchError on mismatch", async () => {
    // Pre-populate content_vectors with a different model id
    seedHash1Vector("old-model");

    const provider = new StubProvider("new-model", 4);
    await expect(
      generateEmbeddings(store, { embedProvider: provider }),
    ).rejects.toBeInstanceOf(ModelMismatchError);
  });

  test("model-id matches → proceeds", async () => {
    seedHash1Vector("embeddinggemma");

    const provider = new StubProvider("embeddinggemma", 4);
    const result = await generateEmbeddings(store, { embedProvider: provider });

    // Only hash2 needs embedding (hash1 already has one)
    // NOTE(review): the bound below also admits 2 (and 0); tighten it to an
    // exact count once the skip-already-embedded behaviour is confirmed.
    expect(result.docsProcessed).toBeLessThanOrEqual(2);
    expect(result.errors).toBe(0);
  });

  test("force=true bypasses model-id guard", async () => {
    seedHash1Vector("old-model");

    const provider = new StubProvider("new-model", 4);
    // force=true wipes content_vectors first → guard sees empty → no throw
    const result = await generateEmbeddings(store, {
      embedProvider: provider,
      force: true,
    });

    expect(result.docsProcessed).toBe(2);
    expect(result.errors).toBe(0);
    // Now only "new-model" should be in the DB
    expect(getDistinctEmbeddingModels(store.db)).toEqual(["new-model"]);
  });

  test("empty DB → no guard issue, anything goes", async () => {
    expect(getDistinctEmbeddingModels(store.db)).toEqual([]);

    const provider = new StubProvider("anything-id", 4);
    const result = await generateEmbeddings(store, { embedProvider: provider });

    expect(result.errors).toBe(0);
  });
});
- // ─────── First-chunk dimension probe — retry + rich error (i-vm1lxwry) ───────
/**
 * Provider that controls per-call success/failure for the first N calls,
 * exposing a `getLastError()` so the dimension-probe error path includes
 * the upstream cause. Used to exercise the issue i-vm1lxwry behavior.
 *
 * Each `embedBatch` call (and therefore each `embed` call, which delegates
 * to it) consumes one entry of `plan`:
 *   - `true`    → every text gets a deterministic embedding
 *   - `false`   → every text yields `null` and `errorMessage` is recorded
 *   - `"throw"` → `errorMessage` is recorded and thrown as an `Error`
 * Once the plan is exhausted its last entry is repeated; an empty plan
 * behaves like `false`.
 */
class FlakyProvider implements EmbeddingProvider {
  readonly kind = "openai" as const;
  readonly modelId: string;
  readonly dim: number;

  // Behavior plan: on call N, return plan[N] (true=success, false=fail, "throw"=throw)
  plan: Array<boolean | "throw">;
  /** Number of `embedBatch` calls made so far (also the next plan index). */
  callIdx = 0;
  private lastErr: string | undefined = undefined;
  /** Message recorded (and thrown, for `"throw"`) on every planned failure. */
  errorMessage = `endpoint=https://ai.mm.mk/v1/embeddings status=500 body="probe failure"`;

  /**
   * @param modelId Model identifier stamped on successful embeddings.
   * @param dim Length of the vectors produced on success.
   * @param plan Per-call outcomes, consumed in order.
   */
  constructor(modelId: string, dim: number, plan: Array<boolean | "throw">) {
    this.modelId = modelId;
    this.dim = dim;
    this.plan = plan;
  }

  getModelId(): string {
    return this.modelId;
  }

  getDimensions(): number | undefined {
    return this.dim;
  }

  /** Error recorded by the most recent call, or `undefined` after a success. */
  getLastError(): string | undefined {
    return this.lastErr;
  }

  async healthcheck(): Promise<ProviderHealth> {
    return { ok: true, model: this.modelId, dimensions: this.dim };
  }

  /** Single-text variant; consumes one plan entry via `embedBatch`. */
  async embed(text: string): Promise<ProviderEmbedding | null> {
    return (await this.embedBatch([text]))[0] ?? null;
  }

  /**
   * Applies the next planned outcome to the whole batch.
   *
   * @throws Error carrying `errorMessage` when the planned outcome is `"throw"`.
   */
  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
    // Past the end of the plan, keep repeating its last entry.
    const decision = this.plan[this.callIdx] ?? this.plan[this.plan.length - 1] ?? false;
    this.callIdx++;

    if (decision === "throw") {
      this.lastErr = this.errorMessage;
      throw new Error(this.errorMessage);
    }
    if (decision === false) {
      this.lastErr = this.errorMessage;
      return texts.map(() => null);
    }

    this.lastErr = undefined;
    return texts.map((t) => ({
      embedding: Array.from({ length: this.dim }, (_, i) => (t.length + i) * 0.01),
      model: this.modelId,
    }));
  }

  /** Nothing to release. */
  async dispose(): Promise<void> {}
}
describe("first-chunk dimension probe — retry + rich error (i-vm1lxwry)", () => {
  test("retries once on null first-chunk and proceeds on success", async () => {
    // Plan: first call fails, second (retry) succeeds, all subsequent succeed
    const provider = new FlakyProvider("embeddinggemma", 4, [false, true]);

    const result = await generateEmbeddings(store, { embedProvider: provider });

    expect(result.errors).toBe(0);
    expect(result.docsProcessed).toBe(2);
    expect(result.chunksEmbedded).toBeGreaterThan(0);
    // We expect at least 2 calls: the failed first probe + the retry that succeeded.
    expect(provider.callIdx).toBeGreaterThanOrEqual(2);
  });

  test("throws rich error including provider kind and underlying cause when both attempts fail", async () => {
    // Plan: every call returns null
    const provider = new FlakyProvider("embeddinggemma", 4, [false]);

    // Run once and capture the rejection so the message can be inspected
    // without repeating the failing embed run.
    const failure: unknown = await generateEmbeddings(store, { embedProvider: provider }).then(
      () => null,
      (err: unknown) => err,
    );
    if (!(failure instanceof Error)) {
      throw new Error("expected generateEmbeddings to reject with an Error");
    }

    const msg = failure.message;
    expect(msg).toMatch(/Failed to get embedding dimensions from first chunk after retry/);
    expect(msg).toContain("provider=openai");
    expect(msg).toContain("ai.mm.mk");
    expect(msg).toContain("status=500");
    expect(msg).toContain("probe failure");
    // Both attempts (initial + retry) consumed → at least 2 calls.
    expect(provider.callIdx).toBeGreaterThanOrEqual(2);
  });
});
|