suby
/
qmd


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
							/**
 * embedding-store-integration.test.ts - Tests for the
 * generateEmbeddings() / EmbeddingProvider integration in store.ts.
 *
 * Uses a temp-directory SQLite index + a stub EmbeddingProvider to avoid
 * loading node-llama-cpp models. Verifies:
 *   - Provider's embedBatch is called when options.embedProvider is set
 *   - Model-id guard throws ModelMismatchError on mismatch
 *   - Force re-embed bypasses the guard
 *   - getDistinctEmbeddingModels reads content_vectors correctly
 */

import { describe, test, expect, beforeEach, afterEach } from "vitest";
import { mkdtempSync, rmSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import {
  createStore,
  generateEmbeddings,
  getDistinctEmbeddingModels,
  insertEmbedding,
  type Store,
} from "../src/store.js";
import {
  ModelMismatchError,
  type EmbeddingProvider,
  type ProviderEmbedding,
  type ProviderHealth,
} from "../src/embedding/provider.js";

// ─────────────────────────── Stub provider ───────────────────────────────────

/**
 * Test double for EmbeddingProvider.
 *
 * Returns deterministic vectors computed only from the input text's length,
 * and counts how often each entry point is hit so tests can assert that the
 * provider was actually used.
 */
class StubProvider implements EmbeddingProvider {
  readonly kind = "openai" as const;
  readonly modelId: string;
  readonly dim: number;
  /** Number of embedBatch() invocations so far. */
  embedBatchCalls = 0;
  /** Number of embed() invocations so far. */
  embedCalls = 0;
  /** Total texts received across embed() and embedBatch(). */
  totalTextsEmbedded = 0;

  /**
   * @param modelId Model identifier reported by this stub and stamped on every result.
   * @param dim Length of each generated vector (defaults to 4).
   */
  constructor(modelId: string, dim = 4) {
    this.modelId = modelId;
    this.dim = dim;
  }

  /** @returns The model id given at construction. */
  getModelId(): string {
    return this.modelId;
  }

  /** @returns The vector length given at construction. */
  getDimensions(): number | undefined {
    return this.dim;
  }

  /** Always reports healthy, echoing the configured model id and dimensions. */
  async healthcheck(): Promise<ProviderHealth> {
    const { modelId: model, dim: dimensions } = this;
    return { ok: true, model, dimensions };
  }

  /** Embeds a single text and bumps the single-call and total counters. */
  async embed(text: string): Promise<ProviderEmbedding | null> {
    this.embedCalls += 1;
    this.totalTextsEmbedded += 1;
    return this.toResult(text);
  }

  /** Embeds every text in order and bumps the batch-call and total counters. */
  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
    this.embedBatchCalls += 1;
    this.totalTextsEmbedded += texts.length;
    return texts.map((text) => this.toResult(text));
  }

  /** Nothing to release. */
  async dispose(): Promise<void> {}

  /** Pairs the fake vector for `text` with this stub's model id. */
  private toResult(text: string): ProviderEmbedding {
    return { embedding: this.fakeEmbed(text), model: this.modelId };
  }

  /** Builds a `dim`-length vector whose i-th entry is (text.length + i) * 0.01. */
  private fakeEmbed(text: string): number[] {
    const base = text.length;
    return Array.from({ length: this.dim }, (_unused, offset) => (base + offset) * 0.01);
  }
}

// ─────────────────────────── Test setup ──────────────────────────────────────

/** Scratch directory holding the per-test SQLite index file. */
let workDir: string;
/** Store opened on the scratch index; recreated before every test. */
let store: Store;
/** INDEX_PATH as it was before the current test overwrote it (undefined if unset). */
let previousIndexPath: string | undefined;

/**
 * Creates a fresh temp directory and store, points INDEX_PATH at it, and
 * seeds two content rows plus their matching active documents.
 */
beforeEach(() => {
  // Remember the caller's value so teardown can restore it instead of
  // unconditionally deleting the variable.
  previousIndexPath = process.env.INDEX_PATH;

  workDir = mkdtempSync(join(tmpdir(), "qmd-store-int-test-"));
  const indexPath = join(workDir, "index.sqlite");
  process.env.INDEX_PATH = indexPath;
  store = createStore(indexPath);

  // Insert content + documents with the bare-minimum schema. The content
  // body needs to be non-empty so chunkDocumentByTokens emits at least one
  // chunk per doc.
  const now = "2026-04-27T00:00:00Z";

  const insertContent = store.db.prepare(
    `INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`,
  );
  insertContent.run("hash1", "Document one body content here that is long enough to chunk.", now);
  insertContent.run("hash2", "Document two body content there with different words to chunk.", now);

  const insertDocument = store.db.prepare(
    `INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`,
  );
  insertDocument.run("hash1", "test", "one.md", "One", now, now, 1);
  insertDocument.run("hash2", "test", "two.md", "Two", now, now, 1);
});

/**
 * Closes the store, restores INDEX_PATH to its pre-test state, and removes
 * the scratch directory.
 */
afterEach(() => {
  try {
    store.close();
  } catch {
    // Best-effort: a failed close must not prevent the cleanup below.
  }

  // Assigning undefined to an env var would store the string "undefined",
  // so an originally-unset variable has to be deleted instead.
  if (previousIndexPath === undefined) {
    delete process.env.INDEX_PATH;
  } else {
    process.env.INDEX_PATH = previousIndexPath;
  }

  rmSync(workDir, { recursive: true, force: true });
});

// ─────────────────────────── getDistinctEmbeddingModels ──────────────────────

describe("getDistinctEmbeddingModels", () => {
  /** Timestamp passed as the last argument of every seeded embedding. */
  const embeddedAt = "2026-04-27T00:00:00Z";

  /**
   * Ensures the 4-dim vector table exists, then inserts one embedding per
   * [hash, values, model] entry. The two numeric arguments after the hash
   * are always 0 here (presumably chunk sequence/position — confirm against
   * insertEmbedding's signature).
   */
  const seedEmbeddings = (rows: Array<[hash: string, values: number[], model: string]>): void => {
    store.ensureVecTable(4);
    for (const [hash, values, model] of rows) {
      insertEmbedding(store.db, hash, 0, 0, new Float32Array(values), model, embeddedAt);
    }
  };

  test("returns [] when content_vectors is empty", () => {
    const models = getDistinctEmbeddingModels(store.db);
    expect(models).toEqual([]);
  });

  test("returns distinct model strings", () => {
    // Two rows sharing one model id must collapse to a single entry.
    seedEmbeddings([
      ["hash1", [0.1, 0.2, 0.3, 0.4], "embeddinggemma"],
      ["hash2", [0.5, 0.6, 0.7, 0.8], "embeddinggemma"],
    ]);
    const models = getDistinctEmbeddingModels(store.db);
    expect(models).toEqual(["embeddinggemma"]);
  });

  test("returns multiple distinct models when present", () => {
    seedEmbeddings([
      ["hash1", [0.1, 0.2, 0.3, 0.4], "model-a"],
      ["hash2", [0.5, 0.6, 0.7, 0.8], "model-b"],
    ]);
    // Sorted before comparing so the assertion does not depend on row order.
    const models = getDistinctEmbeddingModels(store.db).sort();
    expect(models).toEqual(["model-a", "model-b"]);
  });
});

// ─────────────────────────── generateEmbeddings + provider ───────────────────

/**
 * Exercises generateEmbeddings() when an EmbeddingProvider is passed via
 * options.embedProvider: provider usage, the model-id guard, and the
 * force re-embed path.
 */
describe("generateEmbeddings with EmbeddingProvider", () => {
  // Cold-cache llama-cpp init can take >5s on the first session call. The
  // provider short-circuits the embed calls inside store.ts, but the outer
  // `withLLMSessionForLlm` wrapper still warms the LLM. DoD #9 (skip LLM
  // init when a provider is supplied) is a follow-up refactor. Until then
  // every test here gets the extended timeout, because any of them can be
  // the first to hit the cold start when run filtered or in isolation.
  const COLD_START_TIMEOUT_MS = 30_000;

  /**
   * Pre-populates content_vectors with a single 4-dim embedding for "hash1"
   * attributed to `model`, so the model-id guard has something to compare.
   */
  const seedHash1Embedding = (model: string): void => {
    store.ensureVecTable(4);
    insertEmbedding(
      store.db,
      "hash1",
      0,
      0,
      new Float32Array([0.1, 0.2, 0.3, 0.4]),
      model,
      "2026-04-27T00:00:00Z",
    );
  };

  test("uses provider.embedBatch when supplied", async () => {
    const provider = new StubProvider("embeddinggemma", 4);
    const result = await generateEmbeddings(store, {
      embedProvider: provider,
      // Use small batches to keep test fast
      maxDocsPerBatch: 64,
    });
    expect(result.docsProcessed).toBe(2);
    expect(result.chunksEmbedded).toBeGreaterThan(0);
    expect(result.errors).toBe(0);
    // Either entry point counts: only that the provider was used is asserted.
    expect(provider.embedBatchCalls + provider.embedCalls).toBeGreaterThan(0);
    expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
  }, COLD_START_TIMEOUT_MS);

  test("model-id guard throws ModelMismatchError on mismatch", async () => {
    // Pre-populate content_vectors with a different model id
    seedHash1Embedding("old-model");
    const provider = new StubProvider("new-model", 4);
    await expect(
      generateEmbeddings(store, { embedProvider: provider }),
    ).rejects.toBeInstanceOf(ModelMismatchError);
  }, COLD_START_TIMEOUT_MS);

  test("model-id matches → proceeds", async () => {
    seedHash1Embedding("embeddinggemma");
    const provider = new StubProvider("embeddinggemma", 4);
    const result = await generateEmbeddings(store, { embedProvider: provider });
    // Only hash2 needs embedding (hash1 already has one)
    expect(result.docsProcessed).toBeLessThanOrEqual(2);
    expect(result.errors).toBe(0);
  }, COLD_START_TIMEOUT_MS);

  test("force=true bypasses model-id guard", async () => {
    seedHash1Embedding("old-model");
    const provider = new StubProvider("new-model", 4);
    // force=true wipes content_vectors first → guard sees empty → no throw
    const result = await generateEmbeddings(store, {
      embedProvider: provider,
      force: true,
    });
    expect(result.docsProcessed).toBe(2);
    expect(result.errors).toBe(0);
    // Now only "new-model" should be in the DB
    expect(getDistinctEmbeddingModels(store.db)).toEqual(["new-model"]);
  }, COLD_START_TIMEOUT_MS);

  test("empty DB → no guard issue, anything goes", async () => {
    expect(getDistinctEmbeddingModels(store.db)).toEqual([]);
    const provider = new StubProvider("anything-id", 4);
    const result = await generateEmbeddings(store, { embedProvider: provider });
    expect(result.errors).toBe(0);
  }, COLD_START_TIMEOUT_MS);
});