SHA1
--- a/dist/cli/qmd.js
+++ b/dist/cli/qmd.js
@@ -1,4 +1,3 @@
 
															-#!/usr/bin/env node
														
 
															 import { openDatabase } from "../db.js";
														
 
															 import fastGlob from "fast-glob";
														
 
															 import { execSync, spawn as nodeSpawn } from "child_process";
														
@@ -1438,6 +1437,46 @@ function parseOptionalPositiveInt(name, value) {
 
															     }
														
 
															     return parsed;
														
 
															 }
														
 
															+/**
														
 
															+ * Build an `EmbeddingProvider` for the QUERY-side path (vsearch / query)
														
 
															+ * if and only if the user has opted into a non-local provider via flags or
														
 
															+ * env vars. Returns `undefined` for the zero-config case so the legacy
														
 
															+ * `getDefaultLlamaCpp().embed(...)` path is used unchanged — preserving
														
 
															+ * pre-patch behavior for callers that have not configured remote embedding
														
 
															+ * (i-loazq6ze DoD #5: backward compat).
														
 
															+ *
														
 
															+ * Resolution mirrors `qmd embed` (factory.resolveProviderKind):
														
 
															+ *   1. Explicit `--provider` flag → build provider
														
 
															+ *   2. Any `--embed-*` flag / `QMD_EMBED_*` env / `embedProvider.endpoint`
														
 
															+ *      in `~/.config/qmd/config.json` → build provider
														
 
															+ *   3. Otherwise → return `undefined` (legacy path)
														
 
															+ *
														
 
															+ * If `createEmbeddingProvider` throws, a warning is written to stderr here
														
 
															+ * and `undefined` is returned, so callers simply take the legacy path.
														
 
															+ */
														
 
															+function buildQueryEmbedProvider(values) {
														
 
															+    const providerCliKind = parseProviderKind(values["provider"]);
														
 
															+    const opts = buildProviderOpts(values, providerCliKind);
														
 
															+    // Determine whether the user opted into a provider. The factory's resolve
														
 
															+    // step returns "local" by default; without explicit opt-in (flag/env/
														
 
															+    // config), we keep the legacy path with no construction overhead.
														
 
															+    const resolved = resolveProviderKind(opts);
														
 
															+    const hasProviderFlag = providerCliKind !== undefined;
														
 
															+    const hasOpenAiOverride = !!opts.openai && Object.keys(opts.openai).length > 0;
														
 
															+    const envOptIn = !!(process.env.QMD_EMBED_PROVIDER ||
														
 
															+        process.env.QMD_EMBED_ENDPOINT ||
														
 
															+        process.env.QMD_EMBED_AUTO_FALLBACK);
														
 
															+    if (!hasProviderFlag && !hasOpenAiOverride && !envOptIn && resolved === "local") {
														
 
															+        return undefined;
														
 
															+    }
														
 
															+    try {
														
 
															+        return createEmbeddingProvider(opts);
														
 
															+    }
														
 
															+    catch (err) {
														
 
															+        process.stderr.write(`${c.yellow}Warning: failed to build query embedding provider — using local fallback (${err instanceof Error ? err.message : String(err)})${c.reset}\n`);
														
 
															+        return undefined;
														
 
															+    }
														
 
															+}
														
 
															 /**
														
 
															  * Translate `cli.values` into `CreateEmbeddingProviderOptions`. CLI flags
														
 
															  * win over env vars (the factory itself reads env when these are unset).
														
@@ -1974,12 +2013,19 @@ async function vectorSearch(query, opts, _model = DEFAULT_EMBED_MODEL) {
 
															     const collectionNames = resolveCollectionFilter(opts.collection, true);
														
 
															     const singleCollection = collectionNames.length === 1 ? collectionNames[0] : undefined;
														
 
															     checkIndexHealth(store.db);
														
 
															+    // Query-side embedding provider (i-loazq6ze). Nothing is built here:
														
 
															+    // the CLI dispatch resolves it via `buildQueryEmbedProvider` (flags →
														
 
															+    // env vars → config file) and passes it in as `opts.embedProvider`.
														
 
															+    // When it is undefined the option is omitted below, so zero-config
														
 
															+    // callers stay on the legacy llama-cpp path with no observable change.
														
 
															+    const embedProvider = opts.embedProvider;
														
 
															     await withLLMSession(async () => {
														
 
															         let results = await vectorSearchQuery(store, query, {
														
 
															             collection: singleCollection,
														
 
															             limit: opts.all ? 500 : (opts.limit || 10),
														
 
															             minScore: opts.minScore || 0.3,
														
 
															             intent: opts.intent,
														
 
															+            ...(embedProvider ? { embedProvider } : {}),
														
 
															             hooks: {
														
 
															                 onExpand: (original, expanded) => {
														
 
															                     logExpansionTree(original, expanded);
														
@@ -2048,6 +2094,7 @@ async function querySearch(query, opts, _embedModel = DEFAULT_EMBED_MODEL, _rera
 
															                 explain: !!opts.explain,
														
 
															                 intent,
														
 
															                 chunkStrategy: opts.chunkStrategy,
														
 
															+                ...(opts.embedProvider ? { embedProvider: opts.embedProvider } : {}),
														
 
															                 hooks: {
														
 
															                     onEmbedStart: (count) => {
														
 
															                         process.stderr.write(`${c.dim}Embedding ${count} ${count === 1 ? 'query' : 'queries'}...${c.reset}`);
														
@@ -2077,6 +2124,7 @@ async function querySearch(query, opts, _embedModel = DEFAULT_EMBED_MODEL, _rera
 
															                 explain: !!opts.explain,
														
 
															                 intent,
														
 
															                 chunkStrategy: opts.chunkStrategy,
														
 
															+                ...(opts.embedProvider ? { embedProvider: opts.embedProvider } : {}),
														
 
															                 hooks: {
														
 
															                     onStrongSignal: (score) => {
														
 
															                         process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`);
														
@@ -2811,6 +2859,9 @@ if (isMain) {
 
															             if (!cli.values["min-score"]) {
														
 
															                 cli.opts.minScore = 0.3;
														
 
															             }
														
 
															+            // Build query-side embedding provider (i-loazq6ze).
														
 
															+            // Returns undefined for zero-config callers (legacy local path).
														
 
															+            cli.opts.embedProvider = buildQueryEmbedProvider(cli.values);
														
 
															             await vectorSearch(cli.query, cli.opts);
														
 
															             break;
														
 
															         case "query":
														
@@ -2819,6 +2870,7 @@ if (isMain) {
 
															                 console.error("Usage: qmd query [options] <query>");
														
 
															                 process.exit(1);
														
 
															             }
														
 
															+            cli.opts.embedProvider = buildQueryEmbedProvider(cli.values);
														
 
															             await querySearch(cli.query, cli.opts);
														
 
															             break;
														
 
															         case "bench": {
														
--- a/dist/index.d.ts
+++ b/dist/index.d.ts
@@ -24,6 +24,7 @@ export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES };
 
															 export type { ChunkStrategy } from "./store.js";
														
 
															 export { getDefaultDbPath } from "./store.js";
														
 
															 export { Maintenance } from "./maintenance.js";
														
 
															+import type { EmbeddingProvider } from "./embedding/index.js";
														
 
															 export { createEmbeddingProvider, resolveProviderKind, LocalLlamaCppProvider, OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, ModelMismatchError, assertModelCompatible, type EmbeddingProvider, type ProviderKind, type ProviderEmbedding, type ProviderEmbedOptions, type ProviderHealth, type CreateEmbeddingProviderOptions, type OpenAIProviderConfig, type LocalLlamaCppProviderConfig, type EmbedProviderConfigFile, DEFAULT_BATCH_SIZE as DEFAULT_PROVIDER_BATCH_SIZE, DEFAULT_TIMEOUT_MS as DEFAULT_PROVIDER_TIMEOUT_MS, RETRY_BACKOFFS_MS as PROVIDER_RETRY_BACKOFFS_MS, } from "./embedding/index.js";
														
 
															 export { getDistinctEmbeddingModels } from "./store.js";
														
 
															 /**
														
@@ -70,6 +71,14 @@ export interface SearchOptions {
 
															     explain?: boolean;
														
 
															     /** Chunk strategy: "auto" (default, uses AST for code files) or "regex" (legacy) */
														
 
															     chunkStrategy?: ChunkStrategy;
														
 
															+    /**
														
 
															+     * Optional embedding provider for query-side encoding (i-loazq6ze).
														
 
															+     * When supplied, vec/hyde sub-queries are encoded through the provider
														
 
															+     * (HTTP / GPU worker / AutoFallback chain) instead of the local llama-cpp
														
 
															+     * model. Omit to keep pre-patch behavior — the SDK store still works
														
 
															+     * unchanged for callers that have not opted into a remote provider.
														
 
															+     */
														
 
															+    embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 /**
														
 
															  * Options for searchLex() — BM25 keyword search.
														
@@ -84,6 +93,11 @@ export interface LexSearchOptions {
 
															 export interface VectorSearchOptions {
														
 
															     limit?: number;
														
 
															     collection?: string;
														
 
															+    /**
														
 
															+     * Optional embedding provider for query encoding (i-loazq6ze). Forwarded
														
 
															+     * to `searchVec`; when omitted, `StoreOptions.embedProvider` is used, else llama-cpp.
														
 
															+     */
														
 
															+    embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 /**
														
 
															  * Options for expandQuery() — manual query expansion.
														
@@ -105,6 +119,14 @@ export interface StoreOptions {
 
															     configPath?: string;
														
 
															     /** Inline collection config (mutually exclusive with `configPath`) */
														
 
															     config?: CollectionConfig;
														
 
															+    /**
														
 
															+     * Optional default embedding provider for query encoding (i-loazq6ze).
														
 
															+     * When set, every `store.search(...)` call uses this provider unless the
														
 
															+     * caller passes its own `embedProvider` in `SearchOptions`. MCP / HTTP
														
 
															+     * server constructs the provider once at startup and injects it here so
														
 
															+     * every query routes through the GPU worker.
														
 
															+     */
														
 
															+    embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 /**
														
 
															  * The QMD SDK store — provides search, retrieval, collection management,
														
--- a/dist/index.js
+++ b/dist/index.js
@@ -107,6 +107,8 @@ export async function createStore(options) {
 
															                 ...(opts.collections ?? []),
														
 
															             ];
														
 
															             const skipRerank = opts.rerank === false;
														
 
															+            // Per-call provider wins over store-level default.
														
 
															+            const provider = opts.embedProvider ?? options.embedProvider;
														
 
															             if (opts.queries) {
														
 
															                 // Pre-expanded queries — use structuredSearch
														
 
															                 return structuredSearch(internal, opts.queries, {
														
@@ -117,6 +119,7 @@ export async function createStore(options) {
 
															                     intent: opts.intent,
														
 
															                     skipRerank,
														
 
															                     chunkStrategy: opts.chunkStrategy,
														
 
															+                    ...(provider ? { embedProvider: provider } : {}),
														
 
															                 });
														
 
															             }
														
 
															             // Simple query string — use hybridQuery (expand + search + rerank)
														
@@ -128,10 +131,11 @@ export async function createStore(options) {
 
															                 intent: opts.intent,
														
 
															                 skipRerank,
														
 
															                 chunkStrategy: opts.chunkStrategy,
														
 
															+                ...(provider ? { embedProvider: provider } : {}),
														
 
															             });
														
 
															         },
														
 
															         searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection),
														
 
															-        searchVector: async (q, opts) => internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection),
														
 
															+        searchVector: async (q, opts) => internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection, undefined, undefined, opts?.embedProvider ?? options.embedProvider),
														
 
															         expandQuery: async (q, opts) => internal.expandQuery(q, undefined, opts?.intent),
														
 
															         get: async (pathOrDocid, opts) => internal.findDocument(pathOrDocid, opts),
														
 
															         getDocumentBody: async (pathOrDocid, opts) => {
														
--- a/dist/mcp/server.js
+++ b/dist/mcp/server.js
@@ -17,8 +17,33 @@ import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/
 
															 import { isInitializeRequest } from "@modelcontextprotocol/sdk/types.js";
														
 
															 import { z } from "zod";
														
 
															 import { existsSync } from "fs";
														
 
															-import { createStore, extractSnippet, addLineNumbers, getDefaultDbPath, DEFAULT_MULTI_GET_MAX_BYTES, } from "../index.js";
														
 
															+import { createStore, extractSnippet, addLineNumbers, getDefaultDbPath, DEFAULT_MULTI_GET_MAX_BYTES, createEmbeddingProvider, resolveProviderKind, } from "../index.js";
														
 
															 import { getConfigPath } from "../collections.js";
														
 
															+/**
														
 
															+ * Build a query-side embedding provider (i-loazq6ze) for MCP server start.
														
 
															+ * Mirrors `buildQueryEmbedProvider` in the CLI: returns `undefined` when
														
 
															+ * none of QMD_EMBED_PROVIDER / QMD_EMBED_ENDPOINT / QMD_EMBED_AUTO_FALLBACK
														
 
															+ * is set and the factory resolves to "local" (legacy llama-cpp path). If
														
 
															+ * construction throws, a warning goes to stderr and `undefined` is returned.
														
 
															+ */
														
 
															+function buildMcpEmbedProvider() {
														
 
															+    const env = process.env;
														
 
															+    const envOptIn = !!(env.QMD_EMBED_PROVIDER ||
														
 
															+        env.QMD_EMBED_ENDPOINT ||
														
 
															+        env.QMD_EMBED_AUTO_FALLBACK);
														
 
															+    // Probe resolved kind via the factory's standard precedence (env + config).
														
 
															+    const resolved = resolveProviderKind({});
														
 
															+    if (!envOptIn && resolved === "local")
														
 
															+        return undefined;
														
 
															+    try {
														
 
															+        return createEmbeddingProvider({});
														
 
															+    }
														
 
															+    catch (err) {
														
 
															+        // Log + fall through to undefined so legacy local path is used.
														
 
															+        process.stderr.write(`[qmd mcp] WARN failed to build embedding provider — using local fallback: ${err instanceof Error ? err.message : String(err)}\n`);
														
 
															+        return undefined;
														
 
															+    }
														
 
															+}
														
 
															 // =============================================================================
														
 
															 // Helper functions
														
 
															 // =============================================================================
														
@@ -417,9 +442,11 @@ Intent-aware lex (C++ performance, not sports):
 
															 // =============================================================================
														
 
															 export async function startMcpServer() {
														
 
															     const configPath = getConfigPath();
														
 
															+    const embedProvider = buildMcpEmbedProvider();
														
 
															     const store = await createStore({
														
 
															         dbPath: getDefaultDbPath(),
														
 
															         ...(existsSync(configPath) ? { configPath } : {}),
														
 
															+        ...(embedProvider ? { embedProvider } : {}),
														
 
															     });
														
 
															     const server = await createMcpServer(store);
														
 
															     const transport = new StdioServerTransport();
														
@@ -431,9 +458,11 @@ export async function startMcpServer() {
 
															  */
														
 
															 export async function startMcpHttpServer(port, options) {
														
 
															     const configPath = getConfigPath();
														
 
															+    const embedProvider = buildMcpEmbedProvider();
														
 
															     const store = await createStore({
														
 
															         dbPath: getDefaultDbPath(),
														
 
															         ...(existsSync(configPath) ? { configPath } : {}),
														
 
															+        ...(embedProvider ? { embedProvider } : {}),
														
 
															     });
														
 
															     // Pre-fetch default collection names for REST endpoint
														
 
															     const defaultCollectionNames = await store.getDefaultCollectionNames();
														
@@ -655,6 +684,14 @@ export async function startMcpHttpServer(port, options) {
 
															         sessions.clear();
														
 
															         httpServer.close();
														
 
															         await store.close();
														
 
															+        // Dispose the query-side embedding provider (if any) — releases
														
 
															+        // HTTP keep-alive sockets in OpenAIEmbeddingsProvider (i-loazq6ze).
														
 
															+        if (embedProvider) {
														
 
															+            try {
														
 
															+                await embedProvider.dispose();
														
 
															+            }
														
 
															+            catch { /* ignore */ }
														
 
															+        }
														
 
															     };
														
 
															     process.on("SIGTERM", async () => {
														
 
															         console.error("Shutting down (SIGTERM)...");
														
--- a/dist/store.d.ts
+++ b/dist/store.d.ts
@@ -246,7 +246,7 @@ export type Store = {
 
															     resolveVirtualPath: (virtualPath: string) => string | null;
														
 
															     toVirtualPath: (absolutePath: string) => string | null;
														
 
															     searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
														
 
															-    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
														
 
															+    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => Promise<SearchResult[]>;
														
 
															     expandQuery: (query: string, model?: string, intent?: string) => Promise<ExpandedQuery[]>;
														
 
															     rerank: (query: string, documents: {
														
 
															         file: string;
														
@@ -562,14 +562,38 @@ export declare function chunkDocumentAsync(content: string, maxChars?: number, o
 
															     text: string;
														
 
															     pos: number;
														
 
															 }[]>;
														
 
															+/**
														
 
															+ * Counts the tokens in `text`. Used by `chunkDocumentByTokens` for the
														
 
															+ * safety re-split that splits chunks exceeding `maxTokens`.
														
 
															+ *
														
 
															+ * When `chunkDocumentByTokens` is called WITHOUT a tokenizer (default),
														
 
															+ * it lazily resolves `getDefaultLlamaCpp()` and uses `llm.tokenize` —
														
 
															+ * accurate but expensive (loads the local GGUF embed model + initialises
														
 
															+ * llama.cpp, ~22s on cold cache).
														
 
															+ *
														
 
															+ * Provider-mode callers (HTTP embed providers like the GPU worker on
														
 
															+ * `models` LXC) MUST pass a JS-only approximator to avoid loading the
														
 
															+ * local model entirely. A char-based estimate like
														
 
															+ * `Math.ceil(text.length / 3)` is a reasonable default — it matches the
														
 
															+ * `avgCharsPerToken=3` heuristic used for the initial char-space chunk
														
 
															+ * step, so the safety re-split stays a near no-op while populating the
														
 
															+ * `tokens` field with a stable estimate.
														
 
															+ */
														
 
															+export type TokenCounter = (text: string) => number | Promise<number>;
														
 
															 /**
														
 
															  * Chunk a document by actual token count using the LLM tokenizer.
														
 
															  * More accurate than character-based chunking but requires async.
														
 
															  *
														
 
															- * When filepath and chunkStrategy are provided, uses AST-aware break points
														
 
															- * for supported code files.
														
 
															+ * When `tokenizer` is supplied, it is used in place of the local
														
 
															+ * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
														
 
															+ * `llm.tokenize(...)` is invoked. This lets remote-only deployments
														
 
															+ * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up
														
 
															+ * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
														
 
															+ *
														
 
															+ * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
														
 
															+ * points for supported code files.
														
 
															  */
														
 
															-export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal): Promise<{
														
 
															+export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal, tokenizer?: TokenCounter): Promise<{
														
 
															     text: string;
														
 
															     pos: number;
														
 
															     tokens: number;
														
@@ -709,7 +733,7 @@ export declare function sanitizeFTS5Term(term: string): string;
 
															 export declare function validateSemanticQuery(query: string): string | null;
														
 
															 export declare function validateLexQuery(query: string): string | null;
														
 
															 export declare function searchFTS(db: Database, query: string, limit?: number, collectionName?: string): SearchResult[];
														
 
															-export declare function searchVec(db: Database, query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]>;
														
 
															+export declare function searchVec(db: Database, query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider): Promise<SearchResult[]>;
														
 
															 /**
														
 
															  * Get all unique content hashes that need embeddings (from active documents).
														
 
															  * Returns hash, document body, and a sample path for display purposes.
														
@@ -844,6 +868,14 @@ export interface HybridQueryOptions {
 
															     skipRerank?: boolean;
														
 
															     chunkStrategy?: ChunkStrategy;
														
 
															     hooks?: SearchHooks;
														
 
															+    /**
														
 
															+     * Optional embedding provider for query-side encoding (i-loazq6ze).
														
 
															+     * When supplied, the original-query vector AND any vec/hyde expansion
														
 
															+     * variants are encoded through this provider (HTTP, GPU worker,
														
 
															+     * AutoFallback chain) instead of `getLlm(store).embedBatch(...)`. Skip
														
 
															+     * to keep pre-patch behavior (uses local LlamaCpp).
														
 
															+     */
														
 
															+    embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 export interface HybridQueryResult {
														
 
															     file: string;
														
@@ -882,6 +914,12 @@ export interface VectorSearchOptions {
 
															     minScore?: number;
														
 
															     intent?: string;
														
 
															     hooks?: Pick<SearchHooks, 'onExpand'>;
														
 
															+    /**
														
 
															+     * Optional embedding provider for query-side encoding (i-loazq6ze).
														
 
															+     * When supplied, query vectors are encoded via the provider (HTTP /
														
 
															+     * GPU worker / fallback chain) instead of the local llama-cpp model.
														
 
															+     */
														
 
															+    embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 export interface VectorSearchResult {
														
 
															     file: string;
														
@@ -918,6 +956,12 @@ export interface StructuredSearchOptions {
 
															     skipRerank?: boolean;
														
 
															     chunkStrategy?: ChunkStrategy;
														
 
															     hooks?: SearchHooks;
														
 
															+    /**
														
 
															+     * Optional embedding provider for query-side encoding (i-loazq6ze).
														
 
															+     * When supplied, vec/hyde sub-queries are batch-encoded via the provider
														
 
															+     * (HTTP / GPU worker / fallback chain) instead of `getLlm(store).embedBatch`.
														
 
															+     */
														
 
															+    embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 /**
														
 
															  * Structured search: execute pre-expanded queries without LLM query expansion.
														
--- a/dist/store.js
+++ b/dist/store.js
@@ -1141,6 +1141,15 @@ export async function generateEmbeddings(store, options) {
 
															             }
														
 
															             return session.embedBatch(texts, { model: modelArg });
														
 
															         };
														
 
															+        // JS-only token estimator for the provider path. Char-based with
														
 
															+        // avgCharsPerToken=3 — matches the heuristic the chunker already
														
 
															+        // uses for its initial char-space pass, so the safety re-split is a
														
 
															+        // near no-op while populating the `tokens` field with a stable
														
 
															+        // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only
														
 
															+        // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1.
														
 
															+        const chunkTokenizer = provider
														
 
															+            ? (text) => Math.ceil(text.length / 3)
														
 
															+            : undefined;
														
 
															         for (const batchMeta of batches) {
														
 
															             // Abort early if session has been invalidated
														
 
															             if (!session.isValid) {
														
@@ -1156,7 +1165,7 @@ export async function generateEmbeddings(store, options) {
 
															                 const title = extractTitle(doc.body, doc.path);
														
 
															                 const perCollectionStrategy = collectionStrategies.get(doc.collection);
														
 
															                 const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
														
 
															-                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal);
														
 
															+                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal, chunkTokenizer);
														
 
															                 for (let seq = 0; seq < chunks.length; seq++) {
														
 
															                     batchChunks.push({
														
 
															                         hash: doc.hash,
														
@@ -1316,7 +1325,7 @@ export function createStore(dbPath) {
 
															         toVirtualPath: (absolutePath) => toVirtualPath(db, absolutePath),
														
 
															         // Search
														
 
															         searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
														
 
															-        searchVec: (query, model, limit, collectionName, session, precomputedEmbedding) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
														
 
															+        searchVec: (query, model, limit, collectionName, session, precomputedEmbedding, embedProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider),
														
 
															         // Query expansion & reranking
														
 
															         expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm),
														
 
															         rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm),
														
@@ -1782,11 +1791,26 @@ function chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChar
 
															  * Chunk a document by actual token count using the LLM tokenizer.
														
 
															  * More accurate than character-based chunking but requires async.
														
 
															  *
														
 
															- * When filepath and chunkStrategy are provided, uses AST-aware break points
														
 
															- * for supported code files.
														
 
															- */
														
 
															-export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal) {
														
 
															-    const llm = getDefaultLlamaCpp();
														
 
															+ * When `tokenizer` is supplied, it is used in place of the local
														
 
															+ * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
														
 
															+ * `llm.tokenize(...)` is invoked. This lets remote-only deployments
														
 
															+ * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up
														
 
															+ * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
														
 
															+ *
														
 
															+ * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
														
 
															+ * points for supported code files.
														
 
															+ */
														
 
															+export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal, tokenizer) {
														
 
															+    // Resolve token counter lazily so callers that supply `tokenizer` never
														
 
															+    // touch the local LlamaCpp instance — `getDefaultLlamaCpp()` is only
														
 
															+    // invoked from inside the default closure when it is actually called
														
 
															+    // (i.e. when no tokenizer is supplied).
														
 
															+    let llm;
														
 
															+    const countTokens = tokenizer ?? (async (text) => {
														
 
															+        if (!llm)
														
 
															+            llm = getDefaultLlamaCpp();
														
 
															+        return (await llm.tokenize(text)).length;
														
 
															+    });
														
 
															     // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
														
 
															     // If chunks exceed limit, they'll be re-split with actual ratio
														
 
															     const avgCharsPerToken = 3;
														
@@ -1802,24 +1826,24 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
 
															         // Respect abort signal to avoid runaway tokenization
														
 
															         if (signal?.aborted)
														
 
															             break;
														
 
															-        const tokens = await llm.tokenize(chunk.text);
														
 
															-        if (tokens.length <= maxTokens) {
														
 
															-            results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
														
 
															+        const tokenCount = await countTokens(chunk.text);
														
 
															+        if (tokenCount <= maxTokens) {
														
 
															+            results.push({ text: chunk.text, pos: chunk.pos, tokens: tokenCount });
														
 
															         }
														
 
															         else {
														
 
															             // Chunk is still too large - split it further
														
 
															             // Use actual token count to estimate better char limit
														
 
															-            const actualCharsPerToken = chunk.text.length / tokens.length;
														
 
															+            const actualCharsPerToken = chunk.text.length / tokenCount;
														
 
															             const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
														
 
															             const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
														
 
															             for (const subChunk of subChunks) {
														
 
															                 if (signal?.aborted)
														
 
															                     break;
														
 
															-                const subTokens = await llm.tokenize(subChunk.text);
														
 
															+                const subCount = await countTokens(subChunk.text);
														
 
															                 results.push({
														
 
															                     text: subChunk.text,
														
 
															                     pos: chunk.pos + subChunk.pos,
														
 
															-                    tokens: subTokens.length,
														
 
															+                    tokens: subCount,
														
 
															                 });
														
 
															             }
														
 
															         }
														
@@ -2493,11 +2517,11 @@ export function searchFTS(db, query, limit = 20, collectionName) {
 
															 // =============================================================================
														
 
															 // Vector Search
														
 
															 // =============================================================================
														
 
															-export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding) {
														
 
															+export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding, embedProvider) {
														
 
															     const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
														
 
															     if (!tableExists)
														
 
															         return [];
														
 
															-    const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
														
 
															+    const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider);
														
 
															     if (!embedding)
														
 
															         return [];
														
 
															     // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
														
@@ -2571,7 +2595,24 @@ export async function searchVec(db, query, model, limit = 20, collectionName, se
 
															 // =============================================================================
														
 
															 // Embeddings
														
 
															 // =============================================================================
														
 
															-async function getEmbedding(text, model, isQuery, session, llmOverride) {
														
 
															+async function getEmbedding(text, model, isQuery, session, llmOverride, embedProvider) {
														
 
															+    // When an EmbeddingProvider is supplied, route the encoding through it
														
 
															+    // (HTTP / GPU worker / fallback chain) instead of touching local
														
 
															+    // node-llama-cpp at all. The provider receives the formatted text + its
														
 
															+    // own model id (`model` is unused here); prefixes are still applied via
														
 
															+    // formatQueryForEmbedding so embedding parity with the index is preserved.
														
 
															+    if (embedProvider) {
														
 
															+        const providerModel = embedProvider.getModelId();
														
 
															+        const formattedText = isQuery
														
 
															+            ? formatQueryForEmbedding(text, providerModel)
														
 
															+            : formatDocForEmbedding(text, undefined, providerModel);
														
 
															+        // Only forward an AbortSignal when the provider is local-backed;
														
 
															+        // remote providers manage their own timeouts and an LLM-session signal
														
 
															+        // would abort their HTTP request prematurely (i-08ovbvtb).
														
 
															+        const sig = embedProvider.kind === "local" ? session?.signal : undefined;
														
 
															+        const result = await embedProvider.embed(formattedText, sig ? { model: providerModel, signal: sig } : { model: providerModel });
														
 
															+        return result?.embedding ?? null;
														
 
															+    }
														
 
															     // Format text using the appropriate prompt template
														
 
															     const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
														
 
															     const result = session
														
@@ -3236,6 +3277,7 @@ export async function hybridQuery(store, query, options) {
 
															     const intent = options?.intent;
														
 
															     const skipRerank = options?.skipRerank ?? false;
														
 
															     const hooks = options?.hooks;
														
 
															+    const embedProvider = options?.embedProvider;
														
 
															     const rankedLists = [];
														
 
															     const rankedListMeta = [];
														
 
															     const docidMap = new Map(); // filepath -> docid
														
@@ -3300,12 +3342,19 @@ export async function hybridQuery(store, query, options) {
 
															                 vecQueries.push({ text: q.query, queryType: q.type });
														
 
															             }
														
 
															         }
														
 
															-        // Batch embed all vector queries in a single call
														
 
															-        const llm = getLlm(store);
														
 
															-        const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, llm.embedModelName));
														
 
															+        // Batch embed all vector queries in a single call.
														
 
															+        // When `embedProvider` is supplied (i-loazq6ze), route the encode through
														
 
															+        // it (HTTP / GPU worker / AutoFallback chain) instead of warming the
														
 
															+        // local llama-cpp model — this is the whole point of the GPU worker.
														
 
															+        const embedModelName = embedProvider
														
 
															+            ? embedProvider.getModelId()
														
 
															+            : getLlm(store).embedModelName;
														
 
															+        const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName));
														
 
															         hooks?.onEmbedStart?.(textsToEmbed.length);
														
 
															         const embedStart = Date.now();
														
 
															-        const embeddings = await llm.embedBatch(textsToEmbed);
														
 
															+        const embeddings = embedProvider
														
 
															+            ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
														
 
															+            : await getLlm(store).embedBatch(textsToEmbed);
														
 
															         hooks?.onEmbedDone?.(Date.now() - embedStart);
														
 
															         // Run sqlite-vec lookups with pre-computed embeddings
														
 
															         for (let i = 0; i < vecQueries.length; i++) {
														
@@ -3501,6 +3550,7 @@ export async function vectorSearchQuery(store, query, options) {
 
															     const minScore = options?.minScore ?? 0.3;
														
 
															     const collection = options?.collection;
														
 
															     const intent = options?.intent;
														
 
															+    const embedProvider = options?.embedProvider;
														
 
															     const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
														
 
															     if (!hasVectors)
														
 
															         return [];
														
@@ -3509,11 +3559,14 @@ export async function vectorSearchQuery(store, query, options) {
 
															     const allExpanded = await store.expandQuery(query, undefined, intent);
														
 
															     const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
														
 
															     options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
														
 
															-    // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
														
 
															+    // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs.
														
 
															+    // When `embedProvider` is supplied (i-loazq6ze), query encoding is routed
														
 
															+    // through it; the per-call signature `searchVec(...)` accepts the provider
														
 
															+    // as the trailing argument so existing tests / callers stay untouched.
														
 
															     const queryTexts = [query, ...vecExpanded.map(q => q.query)];
														
 
															     const allResults = new Map();
														
 
															     for (const q of queryTexts) {
														
 
															-        const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection);
														
 
															+        const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection, undefined, undefined, embedProvider);
														
 
															         for (const r of vecResults) {
														
 
															             const existing = allResults.get(r.filepath);
														
 
															             if (!existing || r.score > existing.score) {
														
@@ -3560,6 +3613,7 @@ export async function structuredSearch(store, searches, options) {
 
															     const intent = options?.intent;
														
 
															     const skipRerank = options?.skipRerank ?? false;
														
 
															     const hooks = options?.hooks;
														
 
															+    const embedProvider = options?.embedProvider;
														
 
															     const collections = options?.collections;
														
 
															     if (searches.length === 0)
														
 
															         return [];
														
@@ -3613,11 +3667,19 @@ export async function structuredSearch(store, searches, options) {
 
															     if (hasVectors) {
														
 
															         const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
														
 
															         if (vecSearches.length > 0) {
														
 
															-            const llm = getLlm(store);
														
 
															-            const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, llm.embedModelName));
														
 
															+            // Route batch encoding through the supplied EmbeddingProvider when
														
 
															+            // present (i-loazq6ze). Otherwise fall back to the local llama-cpp
														
 
															+            // singleton — preserves pre-patch behavior for callers that don't
														
 
															+            // configure a provider.
														
 
															+            const embedModelName = embedProvider
														
 
															+                ? embedProvider.getModelId()
														
 
															+                : getLlm(store).embedModelName;
														
 
															+            const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModelName));
														
 
															             hooks?.onEmbedStart?.(textsToEmbed.length);
														
 
															             const embedStart = Date.now();
														
 
															-            const embeddings = await llm.embedBatch(textsToEmbed);
														
 
															+            const embeddings = embedProvider
														
 
															+                ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
														
 
															+                : await getLlm(store).embedBatch(textsToEmbed);
														
 
															             hooks?.onEmbedDone?.(Date.now() - embedStart);
														
 
															             for (let i = 0; i < vecSearches.length; i++) {
														
 
															                 const embedding = embeddings[i]?.embedding;
														
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -1683,6 +1683,53 @@ function parseOptionalPositiveInt(name: string, value: unknown): number | undefi
 
															   return parsed;
														
 
															 }
														
 
															+/**
														
 
															+ * Build an `EmbeddingProvider` for the QUERY-side path (vsearch / query)
														
 
															+ * if and only if the user has opted into a non-local provider via flags or
														
 
															+ * env vars. Returns `undefined` for the zero-config case so the legacy
														
 
															+ * `getDefaultLlamaCpp().embed(...)` path is used unchanged — preserving
														
 
															+ * pre-patch behavior for callers that have not configured remote embedding
														
 
															+ * (i-loazq6ze DoD #5: backward compat).
														
 
															+ *
														
 
															+ * Resolution mirrors `qmd embed` (factory.resolveProviderKind):
														
 
															+ *   1. Explicit `--provider` flag → build provider
														
 
															+ *   2. Any `--embed-*` flag / `QMD_EMBED_*` env / `embedProvider.endpoint`
														
 
															+ *      in `~/.config/qmd/config.json` → build provider
														
 
															+ *   3. Otherwise → return `undefined` (legacy path)
														
 
															+ *
														
 
															+ * Returns `undefined` on construction failure (e.g. malformed flags) after
														
 
															+ * writing a warning to stderr, so the caller stays on the legacy path.
														
 
															+ */
														
 
															+function buildQueryEmbedProvider(values: Record<string, unknown>): EmbeddingProvider | undefined {
														
 
															+  const providerCliKind = parseProviderKind(values["provider"]);
														
 
															+  const opts = buildProviderOpts(values, providerCliKind);
														
 
															+
														
 
															+  // Determine whether the user opted into a provider. The factory's resolve
														
 
															+  // step returns "local" by default; without explicit opt-in (flag/env/
														
 
															+  // config), we keep the legacy path with no construction overhead.
														
 
															+  const resolved = resolveProviderKind(opts);
														
 
															+  const hasProviderFlag = providerCliKind !== undefined;
														
 
															+  const hasOpenAiOverride = !!opts.openai && Object.keys(opts.openai).length > 0;
														
 
															+  const envOptIn = !!(
														
 
															+    process.env.QMD_EMBED_PROVIDER ||
														
 
															+    process.env.QMD_EMBED_ENDPOINT ||
														
 
															+    process.env.QMD_EMBED_AUTO_FALLBACK
														
 
															+  );
														
 
															+
														
 
															+  if (!hasProviderFlag && !hasOpenAiOverride && !envOptIn && resolved === "local") {
														
 
															+    return undefined;
														
 
															+  }
														
 
															+
														
 
															+  try {
														
 
															+    return createEmbeddingProvider(opts);
														
 
															+  } catch (err) {
														
 
															+    process.stderr.write(
														
 
															+      `${c.yellow}Warning: failed to build query embedding provider — using local fallback (${err instanceof Error ? err.message : String(err)})${c.reset}\n`,
														
 
															+    );
														
 
															+    return undefined;
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															 /**
														
 
															  * Translate `cli.values` into `CreateEmbeddingProviderOptions`. CLI flags
														
 
															  * win over env vars (the factory itself reads env when these are unset).
														
@@ -1872,6 +1919,12 @@ type OutputOptions = {
 
															   intent?: string;       // Domain intent for disambiguation
														
 
															   skipRerank?: boolean;  // Skip LLM reranking, use RRF scores only
														
 
															   chunkStrategy?: ChunkStrategy;  // "auto" (default) or "regex"
														
 
															+  /**
														
 
															+   * Optional embedding provider for query-side encoding (i-loazq6ze).
														
 
															+   * Built once in main() from the same flag/env/config precedence as
														
 
															+   * `qmd embed` and threaded into vsearch/query/search code paths.
														
 
															+   */
														
 
															+  embedProvider?: EmbeddingProvider;
														
 
															 };
														
 
															 // Highlight query terms in text (skip short words < 3 chars)
														
@@ -2341,12 +2394,20 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =
 
															   checkIndexHealth(store.db);
														
 
															+  // Embedding provider for query encoding (i-loazq6ze), built by the caller.
														
 
															+  // Same precedence as `qmd embed`: explicit `--provider` flag → env vars →
														
 
															+  // `~/.config/qmd/config.json`. With none of those set it is `undefined`,
														
 
															+  // which keeps zero-config callers on the legacy llama-cpp path with no
														
 
															+  // observable change.
														
 
															+  const embedProvider = opts.embedProvider;
														
 
															+
														
 
															   await withLLMSession(async () => {
														
 
															     let results = await vectorSearchQuery(store, query, {
														
 
															       collection: singleCollection,
														
 
															       limit: opts.all ? 500 : (opts.limit || 10),
														
 
															       minScore: opts.minScore || 0.3,
														
 
															       intent: opts.intent,
														
 
															+      ...(embedProvider ? { embedProvider } : {}),
														
 
															       hooks: {
														
 
															         onExpand: (original, expanded) => {
														
 
															           logExpansionTree(original, expanded);
														
@@ -2426,6 +2487,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
 
															         explain: !!opts.explain,
														
 
															         intent,
														
 
															         chunkStrategy: opts.chunkStrategy,
														
 
															+        ...(opts.embedProvider ? { embedProvider: opts.embedProvider } : {}),
														
 
															         hooks: {
														
 
															           onEmbedStart: (count) => {
														
 
															             process.stderr.write(`${c.dim}Embedding ${count} ${count === 1 ? 'query' : 'queries'}...${c.reset}`);
														
@@ -2454,6 +2516,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
 
															         explain: !!opts.explain,
														
 
															         intent,
														
 
															         chunkStrategy: opts.chunkStrategy,
														
 
															+        ...(opts.embedProvider ? { embedProvider: opts.embedProvider } : {}),
														
 
															         hooks: {
														
 
															           onStrongSignal: (score) => {
														
 
															             process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`);
														
@@ -3243,6 +3306,9 @@ if (isMain) {
 
															       if (!cli.values["min-score"]) {
														
 
															         cli.opts.minScore = 0.3;
														
 
															       }
														
 
															+      // Build query-side embedding provider (i-loazq6ze).
														
 
															+      // Returns undefined for zero-config callers (legacy local path).
														
 
															+      cli.opts.embedProvider = buildQueryEmbedProvider(cli.values);
														
 
															       await vectorSearch(cli.query, cli.opts);
														
 
															       break;
														
@@ -3252,6 +3318,7 @@ if (isMain) {
 
															         console.error("Usage: qmd query [options] <query>");
														
 
															         process.exit(1);
														
 
															       }
														
 
															+      cli.opts.embedProvider = buildQueryEmbedProvider(cli.values);
														
 
															       await querySearch(cli.query, cli.opts);
														
 
															       break;
														
--- a/src/index.ts
+++ b/src/index.ts
@@ -119,6 +119,14 @@ export { getDefaultDbPath } from "./store.js";
 
															 // Re-export Maintenance class for CLI housekeeping operations
														
 
															 export { Maintenance } from "./maintenance.js";
														
 
															+// Local import so the type is in scope of the SDK option interfaces below
														
 
															+// (e.g. `embedProvider?: EmbeddingProvider`). The `export { ... }` re-export
														
 
															+// directly below is for SDK consumers, but TypeScript does not put names
														
 
															+// from `export { ... } from "..."` into the file's own scope — it needs
														
 
															+// a separate `import` for that. Without this import, the option interfaces
														
 
															+// from sibling commit 20e44c9 that name `EmbeddingProvider` do not compile.
														
 
															+import type { EmbeddingProvider } from "./embedding/index.js";
														
 
															+
														
 
															 // Re-export embedding provider abstraction for SDK consumers (i-qkarfffa).
														
 
															 // `createEmbeddingProvider` honors QMD_EMBED_ENDPOINT / config-file / kind
														
 
															 // arg precedence; default fallback is the legacy LocalLlamaCppProvider so
														
@@ -195,6 +203,14 @@ export interface SearchOptions {
 
															   explain?: boolean;
														
 
															   /** Chunk strategy: "auto" (default, uses AST for code files) or "regex" (legacy) */
														
 
															   chunkStrategy?: ChunkStrategy;
														
 
															+  /**
														
 
															+   * Optional embedding provider for query-side encoding (i-loazq6ze).
														
 
															+   * When supplied, vec/hyde sub-queries are encoded through the provider
														
 
															+   * (HTTP / GPU worker / AutoFallback chain) instead of the local llama-cpp
														
 
															+   * model. Omit to keep pre-patch behavior — the SDK store still works
														
 
															+   * unchanged for callers that have not opted into a remote provider.
														
 
															+   */
														
 
															+  embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 /**
														
@@ -211,6 +227,11 @@ export interface LexSearchOptions {
 
															 export interface VectorSearchOptions {
														
 
															   limit?: number;
														
 
															   collection?: string;
														
 
															+  /**
														
 
															+   * Optional embedding provider for query encoding (i-loazq6ze). Forwarded
														
 
															+   * through to `searchVec`. Defaults to local llama-cpp.
														
 
															+   */
														
 
															+  embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 /**
														
@@ -234,6 +255,14 @@ export interface StoreOptions {
 
															   configPath?: string;
														
 
															   /** Inline collection config (mutually exclusive with `configPath`) */
														
 
															   config?: CollectionConfig;
														
 
															+  /**
														
 
															+   * Optional default embedding provider for query encoding (i-loazq6ze).
														
 
															+   * When set, `store.search(...)` and `store.searchVector(...)` use this provider
														
 
															+   * unless the caller passes its own `embedProvider` in the call options. MCP / HTTP
														
 
															+   * server constructs the provider once at startup and injects it here so
														
 
															+   * every query routes through the GPU worker.
														
 
															+   */
														
 
															+  embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 /**
														
@@ -421,6 +450,8 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
															         ...(opts.collections ?? []),
														
 
															       ];
														
 
															       const skipRerank = opts.rerank === false;
														
 
															+      // Per-call provider wins over store-level default.
														
 
															+      const provider = opts.embedProvider ?? options.embedProvider;
														
 
															       if (opts.queries) {
														
 
															         // Pre-expanded queries — use structuredSearch
														
@@ -432,6 +463,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
															           intent: opts.intent,
														
 
															           skipRerank,
														
 
															           chunkStrategy: opts.chunkStrategy,
														
 
															+          ...(provider ? { embedProvider: provider } : {}),
														
 
															         });
														
 
															       }
														
@@ -444,10 +476,14 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
 
															         intent: opts.intent,
														
 
															         skipRerank,
														
 
															         chunkStrategy: opts.chunkStrategy,
														
 
															+        ...(provider ? { embedProvider: provider } : {}),
														
 
															       });
														
 
															     },
														
 
															     searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection),
														
 
															-    searchVector: async (q, opts) => internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection),
														
 
															+    searchVector: async (q, opts) => internal.searchVec(
														
 
															+      q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection,
														
 
															+      undefined, undefined, opts?.embedProvider ?? options.embedProvider,
														
 
															+    ),
														
 
															     expandQuery: async (q, opts) => internal.expandQuery(q, undefined, opts?.intent),
														
 
															     get: async (pathOrDocid, opts) => internal.findDocument(pathOrDocid, opts),
														
 
															     getDocumentBody: async (pathOrDocid, opts) => {
														
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@@ -25,12 +25,43 @@ import {
 
															   addLineNumbers,
														
 
															   getDefaultDbPath,
														
 
															   DEFAULT_MULTI_GET_MAX_BYTES,
														
 
															+  createEmbeddingProvider,
														
 
															+  resolveProviderKind,
														
 
															   type QMDStore,
														
 
															   type ExpandedQuery,
														
 
															   type IndexStatus,
														
 
															+  type EmbeddingProvider,
														
 
															 } from "../index.js";
														
 
															 import { getConfigPath } from "../collections.js";
														
 
															+/**
														
 
															+ * Build a query-side embedding provider (i-loazq6ze) for MCP server start.
														
 
															+ * Mirrors `buildQueryEmbedProvider` in the CLI: returns `undefined` when
														
 
															+ * the user has not opted into a remote provider, preserving pre-patch
														
 
															+ * behavior (local llama-cpp). Construction errors are logged and the
														
 
															+ * server falls back to the legacy path.
														
 
															+ */
														
 
															+function buildMcpEmbedProvider(): EmbeddingProvider | undefined {
														
 
															+  const env = process.env;
														
 
															+  const envOptIn = !!(
														
 
															+    env.QMD_EMBED_PROVIDER ||
														
 
															+    env.QMD_EMBED_ENDPOINT ||
														
 
															+    env.QMD_EMBED_AUTO_FALLBACK
														
 
															+  );
														
 
															+  // Probe resolved kind via the factory's standard precedence (env + config).
														
 
															+  const resolved = resolveProviderKind({});
														
 
															+  if (!envOptIn && resolved === "local") return undefined;
														
 
															+  try {
														
 
															+    return createEmbeddingProvider({});
														
 
															+  } catch (err) {
														
 
															+    // Log + fall through to undefined so legacy local path is used.
														
 
															+    process.stderr.write(
														
 
															+      `[qmd mcp] WARN failed to build embedding provider — using local fallback: ${err instanceof Error ? err.message : String(err)}\n`,
														
 
															+    );
														
 
															+    return undefined;
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															 // =============================================================================
														
 
															 // Types for structured content
														
 
															 // =============================================================================
														
@@ -539,9 +570,11 @@ Intent-aware lex (C++ performance, not sports):
 
															 export async function startMcpServer(): Promise<void> {
														
 
															   const configPath = getConfigPath();
														
 
															+  const embedProvider = buildMcpEmbedProvider();
														
 
															   const store = await createStore({
														
 
															     dbPath: getDefaultDbPath(),
														
 
															     ...(existsSync(configPath) ? { configPath } : {}),
														
 
															+    ...(embedProvider ? { embedProvider } : {}),
														
 
															   });
														
 
															   const server = await createMcpServer(store);
														
 
															   const transport = new StdioServerTransport();
														
@@ -564,9 +597,11 @@ export type HttpServerHandle = {
 
															  */
														
 
															 export async function startMcpHttpServer(port: number, options?: { quiet?: boolean }): Promise<HttpServerHandle> {
														
 
															   const configPath = getConfigPath();
														
 
															+  const embedProvider = buildMcpEmbedProvider();
														
 
															   const store = await createStore({
														
 
															     dbPath: getDefaultDbPath(),
														
 
															     ...(existsSync(configPath) ? { configPath } : {}),
														
 
															+    ...(embedProvider ? { embedProvider } : {}),
														
 
															   });
														
 
															   // Pre-fetch default collection names for REST endpoint
														
@@ -810,6 +845,11 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
 
															     sessions.clear();
														
 
															     httpServer.close();
														
 
															     await store.close();
														
 
															+    // Dispose the query-side embedding provider (if any) — releases
														
 
															+    // HTTP keep-alive sockets in OpenAIEmbeddingsProvider (i-loazq6ze).
														
 
															+    if (embedProvider) {
														
 
															+      try { await embedProvider.dispose(); } catch { /* ignore */ }
														
 
															+    }
														
 
															   };
														
 
															   process.on("SIGTERM", async () => {
														
--- a/src/store.ts
+++ b/src/store.ts
@@ -1120,7 +1120,7 @@ export type Store = {
 
															   // Search
														
 
															   searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
														
 
															-  searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
														
 
															+  searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => Promise<SearchResult[]>;
														
 
															   // Query expansion & reranking
														
 
															   expandQuery: (query: string, model?: string, intent?: string) => Promise<ExpandedQuery[]>;
														
@@ -1565,6 +1565,16 @@ export async function generateEmbeddings(
 
															       return session.embedBatch(texts, { model: modelArg });
														
 
															     };
														
 
															+    // JS-only token estimator for the provider path. Char-based with
														
 
															+    // avgCharsPerToken=3 — matches the heuristic the chunker already
														
 
															+    // uses for its initial char-space pass, so the safety re-split is a
														
 
															+    // near no-op while populating the `tokens` field with a stable
														
 
															+    // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only
														
 
															+    // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1.
														
 
															+    const chunkTokenizer: TokenCounter | undefined = provider
														
 
															+      ? (text: string) => Math.ceil(text.length / 3)
														
 
															+      : undefined;
														
 
															+
														
 
															     for (const batchMeta of batches) {
														
 
															       // Abort early if session has been invalidated
														
 
															       if (!session.isValid) {
														
@@ -1588,6 +1598,7 @@ export async function generateEmbeddings(
 
															           doc.path,
														
 
															           chunkStrategy,
														
 
															           session.signal,
														
 
															+          chunkTokenizer,
														
 
															         );
														
 
															         for (let seq = 0; seq < chunks.length; seq++) {
														
@@ -1764,7 +1775,7 @@ export function createStore(dbPath?: string): Store {
 
															     // Search
														
 
															     searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName),
														
 
															-    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
														
 
															+    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider),
														
 
															     // Query expansion & reranking
														
 
															     expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm),
														
@@ -2453,12 +2464,37 @@ function chunkByFunctionRanges(
 
															   return out;
														
 
															 }
														
 
															+/**
														
 
															+ * Counts the tokens in `text`. Used by `chunkDocumentByTokens` for the
														
 
															+ * safety re-split that splits chunks exceeding `maxTokens`.
														
 
															+ *
														
 
															+ * When `chunkDocumentByTokens` is called WITHOUT a tokenizer (default),
														
 
															+ * it lazily resolves `getDefaultLlamaCpp()` and uses `llm.tokenize` —
														
 
															+ * accurate but expensive (loads the local GGUF embed model + initialises
														
 
															+ * llama.cpp, ~22s on cold cache).
														
 
															+ *
														
 
															+ * Provider-mode callers (HTTP embed providers like the GPU worker on
														
 
															+ * `models` LXC) MUST pass a JS-only approximator to avoid loading the
														
 
															+ * local model entirely. A char-based estimate like
														
 
															+ * `Math.ceil(text.length / 3)` is a reasonable default — it matches the
														
 
															+ * `avgCharsPerToken=3` heuristic used for the initial char-space chunk
														
 
															+ * step, so the safety re-split stays a near no-op while populating the
														
 
															+ * `tokens` field with a stable estimate.
														
 
															+ */
														
 
															+export type TokenCounter = (text: string) => number | Promise<number>;
														
 
															+
														
 
															 /**
														
 
															  * Chunk a document by actual token count using the LLM tokenizer.
														
 
															  * More accurate than character-based chunking but requires async.
														
 
															  *
														
 
															- * When filepath and chunkStrategy are provided, uses AST-aware break points
														
 
															- * for supported code files.
														
 
															+ * When `tokenizer` is supplied, it is used in place of the local
														
 
															+ * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
														
 
															+ * `llm.tokenize(...)` is invoked. This lets remote-only deployments
														
 
															+ * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up
														
 
															+ * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
														
 
															+ *
														
 
															+ * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
														
 
															+ * points for supported code files.
														
 
															  */
														
 
															 export async function chunkDocumentByTokens(
														
 
															   content: string,
														
@@ -2467,9 +2503,18 @@ export async function chunkDocumentByTokens(
 
															   windowTokens: number = CHUNK_WINDOW_TOKENS,
														
 
															   filepath?: string,
														
 
															   chunkStrategy: ChunkStrategy = "regex",
														
 
															-  signal?: AbortSignal
														
 
															+  signal?: AbortSignal,
														
 
															+  tokenizer?: TokenCounter,
														
 
															 ): Promise<{ text: string; pos: number; tokens: number }[]> {
														
 
															-  const llm = getDefaultLlamaCpp();
														
 
															+  // Resolve token counter lazily so callers that supply `tokenizer` never
														
 
															+  // touch the local LlamaCpp instance — `getDefaultLlamaCpp()` is only
														
 
															+  // invoked from inside the default closure when it is actually called
														
 
															+  // (i.e. when no tokenizer is supplied).
														
 
															+  let llm: ReturnType<typeof getDefaultLlamaCpp> | undefined;
														
 
															+  const countTokens: TokenCounter = tokenizer ?? (async (text: string) => {
														
 
															+    if (!llm) llm = getDefaultLlamaCpp();
														
 
															+    return (await llm.tokenize(text)).length;
														
 
															+  });
														
 
															   // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
														
 
															   // If chunks exceed limit, they'll be re-split with actual ratio
														
@@ -2489,25 +2534,25 @@ export async function chunkDocumentByTokens(
 
															     // Respect abort signal to avoid runaway tokenization
														
 
															     if (signal?.aborted) break;
														
 
															-    const tokens = await llm.tokenize(chunk.text);
														
 
															+    const tokenCount = await countTokens(chunk.text);
														
 
															-    if (tokens.length <= maxTokens) {
														
 
															-      results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
														
 
															+    if (tokenCount <= maxTokens) {
														
 
															+      results.push({ text: chunk.text, pos: chunk.pos, tokens: tokenCount });
														
 
															     } else {
														
 
															       // Chunk is still too large - split it further
														
 
															       // Use actual token count to estimate better char limit
														
 
															-      const actualCharsPerToken = chunk.text.length / tokens.length;
														
 
															+      const actualCharsPerToken = chunk.text.length / tokenCount;
														
 
															       const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
														
 
															       const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
														
 
															       for (const subChunk of subChunks) {
														
 
															         if (signal?.aborted) break;
														
 
															-        const subTokens = await llm.tokenize(subChunk.text);
														
 
															+        const subCount = await countTokens(subChunk.text);
														
 
															         results.push({
														
 
															           text: subChunk.text,
														
 
															           pos: chunk.pos + subChunk.pos,
														
 
															-          tokens: subTokens.length,
														
 
															+          tokens: subCount,
														
 
															         });
														
 
															       }
														
 
															     }
														
@@ -3260,11 +3305,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
 
															 // Vector Search
														
 
															 // =============================================================================
														
 
															-export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]> {
														
 
															+export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider): Promise<SearchResult[]> {
														
 
															   const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
														
 
															   if (!tableExists) return [];
														
 
															-  const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
														
 
															+  const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider);
														
 
															   if (!embedding) return [];
														
 
															   // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
														
@@ -3350,7 +3395,24 @@ export async function searchVec(db: Database, query: string, model: string, limi
 
															 // Embeddings
														
 
															 // =============================================================================
														
 
															-async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp): Promise<number[] | null> {
														
 
															+async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp, embedProvider?: EmbeddingProvider): Promise<number[] | null> {
														
 
															+  // When an EmbeddingProvider is supplied, route the encoding through it
														
 
															+  // (HTTP / GPU worker / fallback chain) instead of touching local
														
 
															+  // node-llama-cpp at all. The provider receives the prompt-formatted text
														
 
															+  // plus its own model id; the query/doc prefixes are applied first via
														
 
															+  // formatQueryForEmbedding / formatDocForEmbedding so parity with the index is preserved.
														
 
															+  if (embedProvider) {
														
 
															+    const providerModel = embedProvider.getModelId();
														
 
															+    const formattedText = isQuery
														
 
															+      ? formatQueryForEmbedding(text, providerModel)
														
 
															+      : formatDocForEmbedding(text, undefined, providerModel);
														
 
															+    // Only forward an AbortSignal when the provider is local-backed;
														
 
															+    // remote providers manage their own timeouts and an LLM-session signal
														
 
															+    // would abort their HTTP request prematurely (i-08ovbvtb).
														
 
															+    const sig = embedProvider.kind === "local" ? session?.signal : undefined;
														
 
															+    const result = await embedProvider.embed(formattedText, sig ? { model: providerModel, signal: sig } : { model: providerModel });
														
 
															+    return result?.embedding ?? null;
														
 
															+  }
														
 
															   // Format text using the appropriate prompt template
														
 
															   const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
														
 
															   const result = session
														
@@ -4147,6 +4209,14 @@ export interface HybridQueryOptions {
 
															   skipRerank?: boolean;     // skip LLM reranking, use only RRF scores
														
 
															   chunkStrategy?: ChunkStrategy;
														
 
															   hooks?: SearchHooks;
														
 
															+  /**
														
 
															+   * Optional embedding provider for query-side encoding (i-loazq6ze).
														
 
															+   * When supplied, the original-query vector AND any vec/hyde expansion
														
 
															+   * variants are encoded through this provider (HTTP, GPU worker,
														
 
															+   * AutoFallback chain) instead of `getLlm(store).embedBatch(...)`. Omit
														
 
															+   * to keep pre-patch behavior (uses local LlamaCpp).
														
 
															+   */
														
 
															+  embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 export interface HybridQueryResult {
														
@@ -4194,6 +4264,7 @@ export async function hybridQuery(
 
															   const intent = options?.intent;
														
 
															   const skipRerank = options?.skipRerank ?? false;
														
 
															   const hooks = options?.hooks;
														
 
															+  const embedProvider = options?.embedProvider;
														
 
															   const rankedLists: RankedResult[][] = [];
														
 
															   const rankedListMeta: RankedListMeta[] = [];
														
@@ -4267,12 +4338,19 @@ export async function hybridQuery(
 
															       }
														
 
															     }
														
 
															-    // Batch embed all vector queries in a single call
														
 
															-    const llm = getLlm(store);
														
 
															-    const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, llm.embedModelName));
														
 
															+    // Batch embed all vector queries in a single call.
														
 
															+    // When `embedProvider` is supplied (i-loazq6ze), route the encode through
														
 
															+    // it (HTTP / GPU worker / AutoFallback chain) instead of warming the
														
 
															+    // local llama-cpp model — this is the whole point of the GPU worker.
														
 
															+    const embedModelName = embedProvider
														
 
															+      ? embedProvider.getModelId()
														
 
															+      : getLlm(store).embedModelName;
														
 
															+    const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName));
														
 
															     hooks?.onEmbedStart?.(textsToEmbed.length);
														
 
															     const embedStart = Date.now();
														
 
															-    const embeddings = await llm.embedBatch(textsToEmbed);
														
 
															+    const embeddings = embedProvider
														
 
															+      ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
														
 
															+      : await getLlm(store).embedBatch(textsToEmbed);
														
 
															     hooks?.onEmbedDone?.(Date.now() - embedStart);
														
 
															     // Run sqlite-vec lookups with pre-computed embeddings
														
@@ -4468,6 +4546,12 @@ export interface VectorSearchOptions {
 
															   minScore?: number;        // default 0.3
														
 
															   intent?: string;          // domain intent hint for disambiguation
														
 
															   hooks?: Pick<SearchHooks, 'onExpand'>;
														
 
															+  /**
														
 
															+   * Optional embedding provider for query-side encoding (i-loazq6ze).
														
 
															+   * When supplied, query vectors are encoded via the provider (HTTP /
														
 
															+   * GPU worker / fallback chain) instead of the local llama-cpp model.
														
 
															+   */
														
 
															+  embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 export interface VectorSearchResult {
														
@@ -4498,6 +4582,7 @@ export async function vectorSearchQuery(
 
															   const minScore = options?.minScore ?? 0.3;
														
 
															   const collection = options?.collection;
														
 
															   const intent = options?.intent;
														
 
															+  const embedProvider = options?.embedProvider;
														
 
															   const hasVectors = !!store.db.prepare(
														
 
															     `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
														
@@ -4510,11 +4595,17 @@ export async function vectorSearchQuery(
 
															   const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
														
 
															   options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
														
 
															-  // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
														
 
															+  // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs.
														
 
															+  // When `embedProvider` is supplied (i-loazq6ze), query encoding is routed
														
 
															+  // through it; the per-call signature `searchVec(...)` accepts the provider
														
 
															+  // as the trailing argument so existing tests / callers stay untouched.
														
 
															   const queryTexts = [query, ...vecExpanded.map(q => q.query)];
														
 
															   const allResults = new Map<string, VectorSearchResult>();
														
 
															   for (const q of queryTexts) {
														
 
															-    const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection);
														
 
															+    const vecResults = await store.searchVec(
														
 
															+      q, DEFAULT_EMBED_MODEL, limit, collection,
														
 
															+      undefined, undefined, embedProvider,
														
 
															+    );
														
 
															     for (const r of vecResults) {
														
 
															       const existing = allResults.get(r.filepath);
														
 
															       if (!existing || r.score > existing.score) {
														
@@ -4557,6 +4648,12 @@ export interface StructuredSearchOptions {
 
															   skipRerank?: boolean;
														
 
															   chunkStrategy?: ChunkStrategy;
														
 
															   hooks?: SearchHooks;
														
 
															+  /**
														
 
															+   * Optional embedding provider for query-side encoding (i-loazq6ze).
														
 
															+   * When supplied, vec/hyde sub-queries are batch-encoded via the provider
														
 
															+   * (HTTP / GPU worker / fallback chain) instead of `getLlm(store).embedBatch`.
														
 
															+   */
														
 
															+  embedProvider?: EmbeddingProvider;
														
 
															 }
														
 
															 /**
														
@@ -4589,6 +4686,7 @@ export async function structuredSearch(
 
															   const intent = options?.intent;
														
 
															   const skipRerank = options?.skipRerank ?? false;
														
 
															   const hooks = options?.hooks;
														
 
															+  const embedProvider = options?.embedProvider;
														
 
															   const collections = options?.collections;
														
@@ -4651,11 +4749,19 @@ export async function structuredSearch(
 
															         s.type === 'vec' || s.type === 'hyde'
														
 
															     );
														
 
															     if (vecSearches.length > 0) {
														
 
															-      const llm = getLlm(store);
														
 
															-      const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, llm.embedModelName));
														
 
															+      // Route batch encoding through the supplied EmbeddingProvider when
														
 
															+      // present (i-loazq6ze). Otherwise fall back to the local llama-cpp
														
 
															+      // singleton — preserves pre-patch behavior for callers that don't
														
 
															+      // configure a provider.
														
 
															+      const embedModelName = embedProvider
														
 
															+        ? embedProvider.getModelId()
														
 
															+        : getLlm(store).embedModelName;
														
 
															+      const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModelName));
														
 
															       hooks?.onEmbedStart?.(textsToEmbed.length);
														
 
															       const embedStart = Date.now();
														
 
															-      const embeddings = await llm.embedBatch(textsToEmbed);
														
 
															+      const embeddings = embedProvider
														
 
															+        ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
														
 
															+        : await getLlm(store).embedBatch(textsToEmbed);
														
 
															       hooks?.onEmbedDone?.(Date.now() - embedStart);
														
 
															       for (let i = 0; i < vecSearches.length; i++) {
														
--- a/test/embedding-store-integration.test.ts
+++ b/test/embedding-store-integration.test.ts
@@ -10,10 +10,34 @@
 
															  *   - getDistinctEmbeddingModels reads content_vectors correctly
														
 
															  */
														
 
															-import { describe, test, expect, beforeEach, afterEach } from "vitest";
														
 
															+import { describe, test, expect, beforeEach, afterEach, vi } from "vitest";
														
 
															 import { mkdtempSync, rmSync } from "node:fs";
														
 
															 import { tmpdir } from "node:os";
														
 
															 import { join } from "node:path";
														
 
															+
														
 
															+// Mock the llm.js module so `getDefaultLlamaCpp` (the only function
															+// `chunkDocumentByTokens` reaches into when no `tokenizer` is supplied)
															+// throws on call. This is the strongest possible assertion of DoD #1
															+// for i-1rqixh6m: provider-mode embed runs MUST never load node-llama-cpp.
															+// NOTE(review): this says "DoD #1", but the test that reads the spy is titled "DoD #3" — confirm which number is intended.
															+// Vitest hoists this `vi.mock` above the `import` lines below, and
															+// since the module replacement applies to ALL importers (including
															+// `store.js`), any leaked call from `chunkDocumentByTokens` (or any
															+// sibling code path) into `getDefaultLlamaCpp` will throw a clear
															+// "DoD violation" error and fail the test.
															+vi.mock("../src/llm.js", async (importOriginal) => {
															+  const actual = await importOriginal<typeof import("../src/llm.js")>(); // the real, unmocked module
															+  return {
															+    ...actual, // every other export keeps its real implementation
															+    getDefaultLlamaCpp: vi.fn(() => { // vi.fn (not a bare function) so tests can also assert the call count
															+      throw new Error(
															+        "getDefaultLlamaCpp() invoked when embedProvider was supplied — " +
															+        "DoD #1 violation (i-1rqixh6m). Provider-mode embed must not load node-llama-cpp.",
															+      );
															+    }),
															+  };
															+});
														
 
															+
														
 
															 import {
														
 
															   createStore,
														
 
															   generateEmbeddings,
														
@@ -27,6 +51,7 @@ import {
 
															   type ProviderEmbedding,
														
 
															   type ProviderHealth,
														
 
															 } from "../src/embedding/provider.js";
														
 
															+import * as llmModule from "../src/llm.js";
														
 
															 // ─────────────────────────── Stub provider ───────────────────────────────────
														
@@ -106,6 +131,10 @@ afterEach(() => {
 
															   } catch { /* ignore */ }
														
 
															   delete process.env.INDEX_PATH;
														
 
															   rmSync(workDir, { recursive: true, force: true });
														
 
															+  // Reset call history on the mocked getDefaultLlamaCpp between tests so
														
 
															+  // each test gets a clean ledger to assert against.
														
 
															+  const spy = llmModule.getDefaultLlamaCpp as unknown as ReturnType<typeof vi.fn>;
														
 
															+  spy.mockClear();
														
 
															 });
														
 
															 // ─────────────────────────── getDistinctEmbeddingModels ──────────────────────
														
@@ -177,6 +206,32 @@ describe("generateEmbeddings with EmbeddingProvider", () => {
 
															     expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
														
 
															   });
														
 
															+  test("provider mode does not call getDefaultLlamaCpp (DoD #3 — i-1rqixh6m)", async () => {
															+    // Stronger assertion than the `store.llm` Proxy above: when the
															+    // chunker or any sibling code path falls back to the *global*
															+    // `getDefaultLlamaCpp()` singleton (the previous warm-up source
															+    // inside `chunkDocumentByTokens`), the module-level mock at the top
															+    // of this file would throw — so a successful run is itself proof of
															+    // compliance. We additionally assert call count = 0 for clarity.
															+    const spy = llmModule.getDefaultLlamaCpp as unknown as ReturnType<typeof vi.fn>; // view the mocked export as a vitest spy
															+    expect(spy).not.toHaveBeenCalled(); // precondition: clean ledger (afterEach calls spy.mockClear())
															+
															+    const provider = new StubProvider("embeddinggemma", 4); // presumably (modelId, dimensions) — StubProvider is defined outside this hunk
															+    const result = await generateEmbeddings(store, { embedProvider: provider });
															+
															+    expect(result.docsProcessed).toBe(2); // presumably the two documents seeded by this file's setup — not visible in this hunk
															+    expect(result.chunksEmbedded).toBeGreaterThan(0);
															+    expect(result.errors).toBe(0);
															+    expect(provider.totalTextsEmbedded).toBeGreaterThan(0); // the stub provider actually received texts
															+
															+    // The hard assertion: not a single call to the local LlamaCpp
															+    // singleton during the entire embed run. If `chunkDocumentByTokens`
															+    // (or any sibling) regresses and reaches `getDefaultLlamaCpp()` on
															+    // the provider path, this test fails with a clear DoD-violation
															+    // message — and the run itself would have already thrown.
															+    expect(spy).not.toHaveBeenCalled();
															+  });
														
 
															+
														
 
															   test("model-id guard throws ModelMismatchError on mismatch", async () => {
														
 
															     // Pre-populate content_vectors with a different model id
														
 
															     store.ensureVecTable(4);
														
--- a/test/embedding-vsearch.test.ts
+++ b/test/embedding-vsearch.test.ts
@@ -0,0 +1,391 @@
 
															+/**
														
 
															+ * embedding-vsearch.test.ts — Query-side EmbeddingProvider integration
														
 
															+ * (issue i-loazq6ze).
														
 
															+ *
														
 
															+ * Verifies that `searchVec`, `structuredSearch`, and `vectorSearchQuery`
														
 
															+ * route query encoding through the supplied `EmbeddingProvider` instead
														
 
															+ * of the local `node-llama-cpp` model when one is configured. Also covers
														
 
															+ * the AutoFallback path so a transient remote outage degrades to local
														
 
															+ * instead of throwing.
														
 
															+ *
														
 
															+ * The store is a throwaway on-disk index in a temp directory (sqlite + sqlite-vec); the provider is a stub
														
 
															+ * that records calls and returns deterministic vectors so we can verify
														
 
															+ * routing without standing up real services.
														
 
															+ */
														
 
															+
														
 
															+import { describe, test, expect, beforeEach, afterEach } from "vitest";
														
 
															+import { mkdtempSync, rmSync } from "node:fs";
														
 
															+import { tmpdir } from "node:os";
														
 
															+import { join } from "node:path";
														
 
															+import {
														
 
															+  createStore,
														
 
															+  searchVec,
														
 
															+  structuredSearch,
														
 
															+  vectorSearchQuery,
														
 
															+  type Store,
														
 
															+  type ExpandedQuery,
														
 
															+} from "../src/store.js";
														
 
															+import {
														
 
															+  AutoFallbackEmbeddingProvider,
														
 
															+  CircuitOpenError,
														
 
															+  type EmbeddingProvider,
														
 
															+  type ProviderEmbedding,
														
 
															+  type ProviderHealth,
														
 
															+} from "../src/embedding/index.js";
														
 
															+
														
 
															+// ─────────────────────────── Stub providers ──────────────────────────────────
														
 
															+
														
 
															+/** Deterministic stub — every embed()/embedBatch() call answers with a copy of the one vector given to the constructor (so queries match the seeded index vectors) and is counted for later assertions. */
															+class FixedProvider implements EmbeddingProvider {
															+  readonly kind = "openai" as const; // literal provider kind reported by this stub
															+  embedCalls = 0; // number of embed() invocations
															+  embedBatchCalls = 0; // number of embedBatch() invocations (one per batch, not per text)
															+  lastEmbedTexts: string[] = []; // every text seen by embed()/embedBatch(), in call order — accumulates, never reset
															+  constructor(
															+    private readonly modelId: string, // returned by getModelId() and stamped on every embedding
															+    private readonly embedding: number[], // the vector handed back for any input text
															+  ) {}
															+  getModelId(): string { return this.modelId; }
															+  getDimensions(): number | undefined { return this.embedding.length; } // width of the fixed vector
															+  async healthcheck(): Promise<ProviderHealth> {
															+    return { ok: true, model: this.modelId, dimensions: this.embedding.length }; // always healthy
															+  }
															+  async embed(text: string): Promise<ProviderEmbedding | null> {
															+    this.embedCalls++;
															+    this.lastEmbedTexts.push(text);
															+    return { embedding: this.embedding.slice(), model: this.modelId }; // slice(): fresh copy so callers cannot mutate the shared vector
															+  }
															+  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
															+    this.embedBatchCalls++;
															+    this.lastEmbedTexts.push(...texts);
															+    return texts.map(() => ({ embedding: this.embedding.slice(), model: this.modelId })); // one result per input, same order
															+  }
															+  async dispose(): Promise<void> {} // nothing to release
															+}
														
 
															+
														
 
															+/** Throws CircuitOpenError on every call — simulates "remote down". Each attempt is counted before throwing so tests can assert the primary was tried. */
															+class CircuitOpenProvider implements EmbeddingProvider {
															+  readonly kind = "openai" as const; // literal provider kind reported by this stub
															+  embedCalls = 0; // embed() attempts, incremented before the throw
															+  embedBatchCalls = 0; // embedBatch() attempts, incremented before the throw
															+  constructor(private readonly modelId: string = "embeddinggemma") {}
															+  getModelId(): string { return this.modelId; }
															+  getDimensions(): number | undefined { return undefined; } // this stub never reports a dimension
															+  async healthcheck(): Promise<ProviderHealth> {
															+    return { ok: false, model: this.modelId, detail: "circuit open" }; // always unhealthy
															+  }
															+  async embed(): Promise<ProviderEmbedding | null> {
															+    this.embedCalls++;
															+    throw new CircuitOpenError("remote down");
															+  }
															+  async embedBatch(): Promise<(ProviderEmbedding | null)[]> {
															+    this.embedBatchCalls++;
															+    throw new CircuitOpenError("remote down");
															+  }
															+  async dispose(): Promise<void> {} // nothing to release
															+}
														
 
															+
														
 
															+/** Throws a generic Error("backend unreachable") on every call — simulates total backend failure. Keeps no call counters. */
															+class AlwaysFailProvider implements EmbeddingProvider {
															+  readonly kind = "openai" as const; // literal provider kind reported by this stub
															+  constructor(private readonly modelId: string = "embeddinggemma") {}
															+  getModelId(): string { return this.modelId; }
															+  getDimensions(): number | undefined { return undefined; } // this stub never reports a dimension
															+  async healthcheck(): Promise<ProviderHealth> {
															+    return { ok: false, model: this.modelId, detail: "always fail" }; // always unhealthy
															+  }
															+  async embed(): Promise<ProviderEmbedding | null> {
															+    throw new Error("backend unreachable"); // plain Error, not CircuitOpenError
															+  }
															+  async embedBatch(): Promise<(ProviderEmbedding | null)[]> {
															+    throw new Error("backend unreachable"); // plain Error, not CircuitOpenError
															+  }
															+  async dispose(): Promise<void> {} // nothing to release
															+}
														
 
															+
														
 
															+// ─────────────────────────── Test setup ──────────────────────────────────────
														
 
															+
														
 
															+let workDir: string; // per-test temp directory: created in beforeEach, removed in afterEach
															+let store: Store; // fresh store per test, opened on <workDir>/index.sqlite
															+
															+const DIM = 4; // width passed to store.ensureVecTable(); equals FIXED_VEC.length
															+// Fixed embedding used for both index vectors and query vectors so the
															+// stub provider's response will match the indexed vector exactly (cosine
															+// distance ≈ 0 → similarity ≈ 1).
															+const FIXED_VEC = [0.1, 0.2, 0.3, 0.4];
														
 
															+
														
 
															+beforeEach(() => { // builds a fresh two-document index (alpha.md, beta.md) with identical vectors before every test
															+  workDir = mkdtempSync(join(tmpdir(), "qmd-vsearch-test-"));
															+  process.env.INDEX_PATH = join(workDir, "index.sqlite"); // exported via env as well as passed explicitly below
															+  store = createStore(process.env.INDEX_PATH);
															+
															+  const now = "2026-04-28T00:00:00Z"; // fixed timestamp keeps the fixture deterministic
															+  store.db
															+    .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
															+    .run("hashA", "Alpha document body about query encoding via remote provider.", now);
															+  store.db
															+    .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
															+    .run("hashB", "Beta document body about fallback chain semantics.", now);
															+  store.db
															+    .prepare(`INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`)
															+    .run("hashA", "test", "alpha.md", "Alpha", now, now, 1); // collection "test" + path "alpha.md": the tests expect filepath qmd://test/alpha.md
															+  store.db
															+    .prepare(`INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`)
															+    .run("hashB", "test", "beta.md", "Beta", now, now, 1); // likewise qmd://test/beta.md
															+
															+  // Seed vectors_vec with the same fixed vector so the stub provider's query
															+  // embedding lines up with the index entries (both documents match equally).
															+  store.ensureVecTable(DIM);
															+  store.db
															+    .prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'embeddinggemma', ?)`)
															+    .run("hashA", now); // one chunk (seq 0, pos 0) per document, model id 'embeddinggemma'
															+  store.db
															+    .prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'embeddinggemma', ?)`)
															+    .run("hashB", now);
															+  store.db
															+    .prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`)
															+    .run("hashA_0", new Float32Array(FIXED_VEC)); // key is "<hash>_<seq>", pairing with the content_vectors row above
															+  store.db
															+    .prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`)
															+    .run("hashB_0", new Float32Array(FIXED_VEC));
															+});
														
 
															+
														
 
															+afterEach(() => { // tears down everything beforeEach created
															+  try { store.close(); } catch { /* best-effort: ignore close errors so the cleanup below still runs */ }
															+  delete process.env.INDEX_PATH; // undo the env var set in beforeEach
															+  rmSync(workDir, { recursive: true, force: true }); // force: do not fail if the directory is already gone
															+});
														
 
															+
														
 
															+// ─────────────────────────── searchVec ──────────────────────────────────────
														
 
															+
														
 
															+describe("searchVec with EmbeddingProvider", () => { // single-query path: the tests expect provider.embed(), never embedBatch()
															+  test("encodes the query through the provider when supplied", async () => {
															+    const provider = new FixedProvider("embeddinggemma", FIXED_VEC);
															+
															+    // Sanity: store.llm is not set; if searchVec touched local llama-cpp
															+    // it would fail (no model loaded). Provider routing must be exclusive.
															+    const results = await searchVec(
															+      store.db, "hello", "embeddinggemma", 10,
															+      undefined, undefined, undefined, provider, // three optional slots left at their defaults; the provider is the last positional argument
															+    );
															+
															+    expect(provider.embedCalls).toBe(1); // exactly one single-text encode
															+    expect(provider.embedBatchCalls).toBe(0); // batch API untouched on this path
															+    expect(results.length).toBeGreaterThan(0);
															+    // Both alpha + beta share the same vector — both should be returned.
															+    const filepaths = results.map((r) => r.filepath).sort(); // sort: result order between equal-scoring docs is not asserted
															+    expect(filepaths).toEqual(["qmd://test/alpha.md", "qmd://test/beta.md"]);
															+  });
															+
															+  test("provider mode does not access the local llama-cpp instance", async () => {
															+    const provider = new FixedProvider("embeddinggemma", FIXED_VEC);
															+
															+    // If anything touches `store.llm` while the provider is set, the proxy
															+    // throws — proves the provider path is truly exclusive (mirrors the
															+    // i-08ovbvtb regression guard in embedding-store-integration.test.ts).
															+    store.llm = new Proxy({}, {
															+      get(_target, prop) { // any property read on store.llm fails loudly
															+        throw new Error(
															+          `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
															+        );
															+      },
															+    }) as never;
															+
															+    const results = await searchVec(
															+      store.db, "hello", "embeddinggemma", 10, // NOTE(review): only `store.db` is passed — confirm searchVec can reach `store.llm` at all, otherwise this guard cannot fire
															+      undefined, undefined, undefined, provider,
															+    );
															+    expect(results.length).toBeGreaterThan(0);
															+  });
															+
															+  test("survives transient primary failure via AutoFallback", async () => {
															+    const primary = new CircuitOpenProvider("embeddinggemma"); // always throws CircuitOpenError
															+    const fallback = new FixedProvider("embeddinggemma", FIXED_VEC); // always succeeds
															+    const wrapped = new AutoFallbackEmbeddingProvider({
															+      primary,
															+      fallback,
															+      warn: () => { /* swallow noisy WARN in tests */ },
															+    });
															+
															+    const results = await searchVec(
															+      store.db, "fallback test", "embeddinggemma", 10,
															+      undefined, undefined, undefined, wrapped,
															+    );
															+
															+    expect(primary.embedCalls).toBe(1); // primary was tried once…
															+    expect(fallback.embedCalls).toBe(1); // …then the fallback served the query
															+    expect(results.length).toBeGreaterThan(0);
															+  });
															+
															+  test("surfaces error when both primary AND fallback fail", async () => {
															+    const primary = new AlwaysFailProvider("embeddinggemma");
															+    const fallback = new AlwaysFailProvider("embeddinggemma");
															+    const wrapped = new AutoFallbackEmbeddingProvider({
															+      primary,
															+      fallback,
															+      warn: () => { /* swallow */ },
															+    });
															+
															+    await expect(
															+      searchVec(
															+        store.db, "doomed", "embeddinggemma", 10,
															+        undefined, undefined, undefined, wrapped,
															+      ),
															+    ).rejects.toThrow(/backend unreachable/); // the stub's own error message must reach the caller
															+  });
															+});
														
 
															+
														
 
															+// ─────────────────────────── structuredSearch ───────────────────────────────
														
 
															+
														
 
															+describe("structuredSearch with EmbeddingProvider", () => { // batch path: the tests expect provider.embedBatch()
															+  test("uses provider.embedBatch for vec/hyde sub-queries", async () => {
															+    const provider = new FixedProvider("embeddinggemma", FIXED_VEC);
															+
															+    // Deny access to the local llama-cpp — proves the provider path is exclusive.
															+    store.llm = new Proxy({}, {
															+      get(_target, prop) { // any property read on store.llm fails loudly
															+        throw new Error(
															+          `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
															+        );
															+      },
															+    }) as never;
															+
															+    const queries: ExpandedQuery[] = [
															+      { type: "vec", query: "what is the fallback chain about" },
															+      { type: "hyde", query: "Fallback chains route around primary failure transparently." },
															+    ];
															+
															+    const results = await structuredSearch(store, queries, {
															+      skipRerank: true, // reranker uses local llm — skip in this isolation test
															+      embedProvider: provider,
															+    });
															+
															+    // One batch call covering both vec/hyde queries.
															+    expect(provider.embedBatchCalls).toBe(1);
															+    expect(provider.lastEmbedTexts.length).toBe(2); // two texts recorded: one per sub-query
															+    expect(results.length).toBeGreaterThan(0);
															+  });
															+
															+  test("AutoFallback covers structuredSearch query batch", async () => {
															+    const primary = new CircuitOpenProvider("embeddinggemma"); // always throws CircuitOpenError
															+    const fallback = new FixedProvider("embeddinggemma", FIXED_VEC); // always succeeds
															+    const wrapped = new AutoFallbackEmbeddingProvider({
															+      primary,
															+      fallback,
															+      warn: () => { /* swallow */ },
															+    });
															+
															+    const queries: ExpandedQuery[] = [
															+      { type: "vec", query: "fallback test" },
															+    ];
															+
															+    const results = await structuredSearch(store, queries, {
															+      skipRerank: true,
															+      embedProvider: wrapped,
															+    });
															+
															+    expect(primary.embedBatchCalls).toBe(1); // primary batch was tried once…
															+    expect(fallback.embedBatchCalls).toBe(1); // …then the fallback served the batch
															+    expect(results.length).toBeGreaterThan(0);
															+  });
															+
															+  test("structuredSearch degrades to empty results when both providers fail (batch path)", async () => {
															+    // AutoFallback.embedBatch is contract-bound to return nulls on total
															+    // failure (graceful degradation in batch mode — see autofallback.ts
															+    // onTotalFail). structuredSearch then has no embeddings to query
															+    // sqlite-vec with and returns []. This is the documented behavior;
															+    // searchVec (single-embed path) is the one that surfaces a thrown
															+    // error to the caller, see the test above.
															+    const primary = new AlwaysFailProvider("embeddinggemma");
															+    const fallback = new AlwaysFailProvider("embeddinggemma");
															+    const wrapped = new AutoFallbackEmbeddingProvider({
															+      primary,
															+      fallback,
															+      warn: () => { /* swallow */ },
															+    });
															+
															+    const queries: ExpandedQuery[] = [
															+      { type: "vec", query: "doomed" },
															+    ];
															+
															+    const results = await structuredSearch(store, queries, {
															+      skipRerank: true,
															+      embedProvider: wrapped,
															+    });
															+    expect(results).toEqual([]); // no throw: the batch path degrades to an empty result list
															+  });
															+});
														
 
															+
														
 
															+// ─────────────────────────── vectorSearchQuery ──────────────────────────────
														
 
															+
														
 
															+describe("vectorSearchQuery with EmbeddingProvider", () => { // suite: vectorSearchQuery called with an explicit `embedProvider` option
														
 
															+  test("encodes original query via provider, no local llm access", async () => {
														
 
															+    const provider = new FixedProvider("embeddinggemma", FIXED_VEC); // FixedProvider / FIXED_VEC are declared outside this view; `embedCalls` is asserted on below
														
 
															+
														
 
															+    // Stub expandQuery to return no expansions — this isolates the
														
 
															+    // embedding path from the LLM-driven query expansion path.
														
 
															+    store.expandQuery = async () => [];
														
 
															+
														
 
															+    store.llm = new Proxy({}, { // tripwire: every property read on store.llm throws
														
 
															+      get(_target, prop) {
														
 
															+        throw new Error(
														
 
															+          `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
														
 
															+        );
														
 
															+      },
														
 
															+    }) as never; // cast silences the type checker; the proxy target is a bare object
														
 
															+
														
 
															+    const results = await vectorSearchQuery(store, "vector search test", {
														
 
															+      limit: 5,
														
 
															+      minScore: 0, // NOTE(review): presumably disables score filtering — confirm against vectorSearchQuery
														
 
															+      embedProvider: provider,
														
 
															+    });
														
 
															+
														
 
															+    // vectorSearchQuery sequentializes — at minimum the original query
														
 
															+    // triggers one embed call via the provider.
														
 
															+    expect(provider.embedCalls).toBeGreaterThanOrEqual(1);
														
 
															+    expect(results.length).toBeGreaterThan(0); // assumes the shared `store` fixture already holds indexed vectors — TODO confirm in the setup outside this view
														
 
															+  });
														
 
															+
														
 
															+  test("AutoFallback rescues vectorSearchQuery from primary failure", async () => {
														
 
															+    const primary = new CircuitOpenProvider("embeddinggemma"); // expected to fail, per the test title; class is declared outside this view
														
 
															+    const fallback = new FixedProvider("embeddinggemma", FIXED_VEC);
														
 
															+    const wrapped = new AutoFallbackEmbeddingProvider({
														
 
															+      primary,
														
 
															+      fallback,
														
 
															+      warn: () => { /* swallow */ },
														
 
															+    });
														
 
															+
														
 
															+    store.expandQuery = async () => []; // no query expansions, same stub as the previous test
														
 
															+
														
 
															+    const results = await vectorSearchQuery(store, "fallback path", {
														
 
															+      minScore: 0,
														
 
															+      embedProvider: wrapped,
														
 
															+    });
														
 
															+
														
 
															+    expect(primary.embedCalls).toBeGreaterThanOrEqual(1); // the primary was attempted at least once
														
 
															+    expect(fallback.embedCalls).toBeGreaterThanOrEqual(1); // and the fallback was invoked as well
														
 
															+    expect(results.length).toBeGreaterThan(0); // the search still returned hits
														
 
															+  });
														
 
															+});
														
 
															+
														
 
															+// ─────────────────────────── Backward compat ────────────────────────────────
														
 
															+
														
 
															+describe("backward compat — no provider supplied", () => { // suite: searchVec called without any embedProvider argument
														
 
															+  test("searchVec without provider uses precomputed embedding path (no llm needed)", async () => {
														
 
															+    // When the caller passes `precomputedEmbedding`, searchVec must not
														
 
															+    // touch any embedding backend at all — neither local nor provider.
														
 
															+    // This is the cheapest backward-compat smoke test we can run without
														
 
															+    // loading node-llama-cpp.
														
 
															+    store.llm = new Proxy({}, { // tripwire: every property read on store.llm throws
														
 
															+      get(_target, prop) {
														
 
															+        throw new Error(`store.llm.${String(prop)} accessed unexpectedly`);
														
 
															+      },
														
 
															+    }) as never;
														
 
															+
														
 
															+    const results = await searchVec(
														
 
															+      store.db, "hello", "embeddinggemma", 10, // NOTE(review): the meaning of the 4th positional arg (10) is not visible here — confirm against searchVec's signature
														
 
															+      undefined, undefined, FIXED_VEC, // precomputedEmbedding
														
 
															+    );
														
 
															+    expect(results.length).toBeGreaterThan(0); // assumes the shared `store` fixture already holds indexed vectors — TODO confirm
														
 
															+  });
														
 
															+});
Autor	SHA1 Mensagem	Data
root	99bd369cdc feat(embedding): provider-aware tokenizer for chunkDocumentByTokens (i-1rqixh6m)	há 3 semanas atrás
root	20e44c90b5 feat(embedding): query-side EmbeddingProvider with auto-fallback (i-loazq6ze)	há 3 semanas atrás