2 ヶ月前 · 99bd369cdc
--- a/dist/cli/qmd.js
+++ b/dist/cli/qmd.js
@@ -1,4 +1,3 @@
 
				-#!/usr/bin/env node
			
 
				 import { openDatabase } from "../db.js";
			
 
				 import fastGlob from "fast-glob";
			
 
				 import { execSync, spawn as nodeSpawn } from "child_process";
			
@@ -1438,6 +1437,46 @@ function parseOptionalPositiveInt(name, value) {
 
				     }
			
 
				     return parsed;
			
 
				 }
			
 
				+/**
			
 
				+ * Build an `EmbeddingProvider` for the QUERY-side path (vsearch / query)
			
 
				+ * if and only if the user has opted into a non-local provider via flags or
			
 
				+ * env vars. Returns `undefined` for the zero-config case so the legacy
			
 
				+ * `getDefaultLlamaCpp().embed(...)` path is used unchanged — preserving
			
 
				+ * pre-patch behavior for callers that have not configured remote embedding
			
 
				+ * (i-loazq6ze DoD #5: backward compat).
			
 
				+ *
			
 
				+ * Resolution mirrors `qmd embed` (factory.resolveProviderKind):
			
 
				+ *   1. Explicit `--provider` flag → build provider
			
 
				+ *   2. Any `--embed-*` flag / `QMD_EMBED_*` env / `embedProvider.endpoint`
			
 
				+ *      in `~/.config/qmd/config.json` → build provider
			
 
				+ *   3. Otherwise → return `undefined` (legacy path)
			
 
				+ *
			
 
				+ * On construction failure (e.g. malformed flags) a warning is written to
			
 
				+ * stderr and `undefined` (not `null`) is returned, i.e. the legacy path is used.
			
 
				+ */
			
 
				+function buildQueryEmbedProvider(values) {
			
 
				+    const providerCliKind = parseProviderKind(values["provider"]);
			
 
				+    const opts = buildProviderOpts(values, providerCliKind);
			
 
				+    // Determine whether the user opted into a provider. The factory's resolve
			
 
				+    // step returns "local" by default; without explicit opt-in (flag/env/
			
 
				+    // config), we keep the legacy path with no construction overhead.
			
 
				+    const resolved = resolveProviderKind(opts);
			
 
				+    const hasProviderFlag = providerCliKind !== undefined;
			
 
				+    const hasOpenAiOverride = !!opts.openai && Object.keys(opts.openai).length > 0;
			
 
				+    const envOptIn = !!(process.env.QMD_EMBED_PROVIDER ||
			
 
				+        process.env.QMD_EMBED_ENDPOINT ||
			
 
				+        process.env.QMD_EMBED_AUTO_FALLBACK);
			
 
				+    if (!hasProviderFlag && !hasOpenAiOverride && !envOptIn && resolved === "local") {
			
 
				+        return undefined;
			
 
				+    }
			
 
				+    try {
			
 
				+        return createEmbeddingProvider(opts);
			
 
				+    }
			
 
				+    catch (err) {
			
 
				+        process.stderr.write(`${c.yellow}Warning: failed to build query embedding provider — using local fallback (${err instanceof Error ? err.message : String(err)})${c.reset}\n`);
			
 
				+        return undefined;
			
 
				+    }
			
 
				+}
			
 
				 /**
			
 
				  * Translate `cli.values` into `CreateEmbeddingProviderOptions`. CLI flags
			
 
				  * win over env vars (the factory itself reads env when these are unset).
			
@@ -1974,12 +2013,19 @@ async function vectorSearch(query, opts, _model = DEFAULT_EMBED_MODEL) {
 
				     const collectionNames = resolveCollectionFilter(opts.collection, true);
			
 
				     const singleCollection = collectionNames.length === 1 ? collectionNames[0] : undefined;
			
 
				     checkIndexHealth(store.db);
			
 
				+    // Query-side embedding provider (i-loazq6ze), supplied by the caller —
			
 
				+    // the CLI dispatch builds it with `buildQueryEmbedProvider` (same
			
 
				+    // precedence as `qmd embed`: explicit `--provider` flag → env vars →
			
 
				+    // `~/.config/qmd/config.json`). It is `undefined` for zero-config callers,
			
 
				+    // which stay on the legacy llama-cpp path with no observable change.
			
 
				+    const embedProvider = opts.embedProvider;
			
 
				     await withLLMSession(async () => {
			
 
				         let results = await vectorSearchQuery(store, query, {
			
 
				             collection: singleCollection,
			
 
				             limit: opts.all ? 500 : (opts.limit || 10),
			
 
				             minScore: opts.minScore || 0.3,
			
 
				             intent: opts.intent,
			
 
				+            ...(embedProvider ? { embedProvider } : {}),
			
 
				             hooks: {
			
 
				                 onExpand: (original, expanded) => {
			
 
				                     logExpansionTree(original, expanded);
			
@@ -2048,6 +2094,7 @@ async function querySearch(query, opts, _embedModel = DEFAULT_EMBED_MODEL, _rera
 
				                 explain: !!opts.explain,
			
 
				                 intent,
			
 
				                 chunkStrategy: opts.chunkStrategy,
			
 
				+                ...(opts.embedProvider ? { embedProvider: opts.embedProvider } : {}),
			
 
				                 hooks: {
			
 
				                     onEmbedStart: (count) => {
			
 
				                         process.stderr.write(`${c.dim}Embedding ${count} ${count === 1 ? 'query' : 'queries'}...${c.reset}`);
			
@@ -2077,6 +2124,7 @@ async function querySearch(query, opts, _embedModel = DEFAULT_EMBED_MODEL, _rera
 
				                 explain: !!opts.explain,
			
 
				                 intent,
			
 
				                 chunkStrategy: opts.chunkStrategy,
			
 
				+                ...(opts.embedProvider ? { embedProvider: opts.embedProvider } : {}),
			
 
				                 hooks: {
			
 
				                     onStrongSignal: (score) => {
			
 
				                         process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`);
			
@@ -2811,6 +2859,9 @@ if (isMain) {
 
				             if (!cli.values["min-score"]) {
			
 
				                 cli.opts.minScore = 0.3;
			
 
				             }
			
 
				+            // Build query-side embedding provider (i-loazq6ze).
			
 
				+            // Returns undefined for zero-config callers (legacy local path).
			
 
				+            cli.opts.embedProvider = buildQueryEmbedProvider(cli.values);
			
 
				             await vectorSearch(cli.query, cli.opts);
			
 
				             break;
			
 
				         case "query":
			
@@ -2819,6 +2870,7 @@ if (isMain) {
 
				                 console.error("Usage: qmd query [options] <query>");
			
 
				                 process.exit(1);
			
 
				             }
			
 
				+            cli.opts.embedProvider = buildQueryEmbedProvider(cli.values);
			
 
				             await querySearch(cli.query, cli.opts);
			
 
				             break;
			
 
				         case "bench": {
			
--- a/dist/index.d.ts
+++ b/dist/index.d.ts
@@ -24,6 +24,7 @@ export { extractSnippet, addLineNumbers, DEFAULT_MULTI_GET_MAX_BYTES };
 
				 export type { ChunkStrategy } from "./store.js";
			
 
				 export { getDefaultDbPath } from "./store.js";
			
 
				 export { Maintenance } from "./maintenance.js";
			
 
				+import type { EmbeddingProvider } from "./embedding/index.js";
			
 
				 export { createEmbeddingProvider, resolveProviderKind, LocalLlamaCppProvider, OpenAIEmbeddingsProvider, CircuitBreaker, CircuitOpenError, HttpError, ModelMismatchError, assertModelCompatible, type EmbeddingProvider, type ProviderKind, type ProviderEmbedding, type ProviderEmbedOptions, type ProviderHealth, type CreateEmbeddingProviderOptions, type OpenAIProviderConfig, type LocalLlamaCppProviderConfig, type EmbedProviderConfigFile, DEFAULT_BATCH_SIZE as DEFAULT_PROVIDER_BATCH_SIZE, DEFAULT_TIMEOUT_MS as DEFAULT_PROVIDER_TIMEOUT_MS, RETRY_BACKOFFS_MS as PROVIDER_RETRY_BACKOFFS_MS, } from "./embedding/index.js";
			
 
				 export { getDistinctEmbeddingModels } from "./store.js";
			
 
				 /**
			
@@ -70,6 +71,14 @@ export interface SearchOptions {
 
				     explain?: boolean;
			
 
				     /** Chunk strategy: "auto" (default, uses AST for code files) or "regex" (legacy) */
			
 
				     chunkStrategy?: ChunkStrategy;
			
 
				+    /**
			
 
				+     * Optional embedding provider for query-side encoding (i-loazq6ze).
			
 
				+     * When supplied, vec/hyde sub-queries are encoded through the provider
			
 
				+     * (HTTP / GPU worker / AutoFallback chain) instead of the local llama-cpp
			
 
				+     * model. Omit to keep pre-patch behavior — the SDK store still works
			
 
				+     * unchanged for callers that have not opted into a remote provider.
			
 
				+     */
			
 
				+    embedProvider?: EmbeddingProvider;
			
 
				 }
			
 
				 /**
			
 
				  * Options for searchLex() — BM25 keyword search.
			
@@ -84,6 +93,11 @@ export interface LexSearchOptions {
 
				 export interface VectorSearchOptions {
			
 
				     limit?: number;
			
 
				     collection?: string;
			
 
				+    /**
			
 
				+     * Optional embedding provider for query encoding (i-loazq6ze). Forwarded
			
 
				+     * to `searchVec`; when omitted, the store-level `embedProvider` is used.
			
 
				+     */
			
 
				+    embedProvider?: EmbeddingProvider;
			
 
				 }
			
 
				 /**
			
 
				  * Options for expandQuery() — manual query expansion.
			
@@ -105,6 +119,14 @@ export interface StoreOptions {
 
				     configPath?: string;
			
 
				     /** Inline collection config (mutually exclusive with `configPath`) */
			
 
				     config?: CollectionConfig;
			
 
				+    /**
			
 
				+     * Optional default embedding provider for query encoding (i-loazq6ze).
			
 
				+     * When set, every `store.search(...)` call uses this provider unless the
			
 
				+     * caller passes its own `embedProvider` in `SearchOptions`. MCP / HTTP
			
 
				+     * server constructs the provider once at startup and injects it here so
			
 
				+     * every query routes through the GPU worker.
			
 
				+     */
			
 
				+    embedProvider?: EmbeddingProvider;
			
 
				 }
			
 
				 /**
			
 
				  * The QMD SDK store — provides search, retrieval, collection management,
			
--- a/dist/index.js
+++ b/dist/index.js
@@ -107,6 +107,8 @@ export async function createStore(options) {
 
				                 ...(opts.collections ?? []),
			
 
				             ];
			
 
				             const skipRerank = opts.rerank === false;
			
 
				+            // Per-call provider wins over store-level default.
			
 
				+            const provider = opts.embedProvider ?? options.embedProvider;
			
 
				             if (opts.queries) {
			
 
				                 // Pre-expanded queries — use structuredSearch
			
 
				                 return structuredSearch(internal, opts.queries, {
			
@@ -117,6 +119,7 @@ export async function createStore(options) {
 
				                     intent: opts.intent,
			
 
				                     skipRerank,
			
 
				                     chunkStrategy: opts.chunkStrategy,
			
 
				+                    ...(provider ? { embedProvider: provider } : {}),
			
 
				                 });
			
 
				             }
			
 
				             // Simple query string — use hybridQuery (expand + search + rerank)
			
@@ -128,10 +131,11 @@ export async function createStore(options) {
 
				                 intent: opts.intent,
			
 
				                 skipRerank,
			
 
				                 chunkStrategy: opts.chunkStrategy,
			
 
				+                ...(provider ? { embedProvider: provider } : {}),
			
 
				             });
			
 
				         },
			
 
				         searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection),
			
 
				-        searchVector: async (q, opts) => internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection),
			
 
				+        searchVector: async (q, opts) => internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection, undefined, undefined, opts?.embedProvider ?? options.embedProvider),
			
 
				         expandQuery: async (q, opts) => internal.expandQuery(q, undefined, opts?.intent),
			
 
				         get: async (pathOrDocid, opts) => internal.findDocument(pathOrDocid, opts),
			
 
				         getDocumentBody: async (pathOrDocid, opts) => {
			
--- a/dist/mcp/server.js
+++ b/dist/mcp/server.js
@@ -17,8 +17,33 @@ import { WebStandardStreamableHTTPServerTransport } from "@modelcontextprotocol/
 
				 import { isInitializeRequest } from "@modelcontextprotocol/sdk/types.js";
			
 
				 import { z } from "zod";
			
 
				 import { existsSync } from "fs";
			
 
				-import { createStore, extractSnippet, addLineNumbers, getDefaultDbPath, DEFAULT_MULTI_GET_MAX_BYTES, } from "../index.js";
			
 
				+import { createStore, extractSnippet, addLineNumbers, getDefaultDbPath, DEFAULT_MULTI_GET_MAX_BYTES, createEmbeddingProvider, resolveProviderKind, } from "../index.js";
			
 
				 import { getConfigPath } from "../collections.js";
			
 
				+/**
			
 
				+ * Build a query-side embedding provider (i-loazq6ze) for MCP server start.
			
 
				+ * Counterpart of the CLI's `buildQueryEmbedProvider`, but without flags:
			
 
				+ * empty options are passed to the factory. Returns `undefined` (legacy local
			
 
				+ * llama-cpp path) when no QMD_EMBED_PROVIDER / _ENDPOINT / _AUTO_FALLBACK env
			
 
				+ * var is set and the kind resolves to "local", or when construction throws.
			
 
				+ */
			
 
				+function buildMcpEmbedProvider() {
			
 
				+    const env = process.env;
			
 
				+    const envOptIn = !!(env.QMD_EMBED_PROVIDER ||
			
 
				+        env.QMD_EMBED_ENDPOINT ||
			
 
				+        env.QMD_EMBED_AUTO_FALLBACK);
			
 
				+    // Probe resolved kind via the factory's standard precedence (env + config).
			
 
				+    const resolved = resolveProviderKind({});
			
 
				+    if (!envOptIn && resolved === "local")
			
 
				+        return undefined;
			
 
				+    try {
			
 
				+        return createEmbeddingProvider({});
			
 
				+    }
			
 
				+    catch (err) {
			
 
				+        // Warn on stderr and return undefined so the legacy local path is used.
			
 
				+        process.stderr.write(`[qmd mcp] WARN failed to build embedding provider — using local fallback: ${err instanceof Error ? err.message : String(err)}\n`);
			
 
				+        return undefined;
			
 
				+    }
			
 
				+}
			
 
				 // =============================================================================
			
 
				 // Helper functions
			
 
				 // =============================================================================
			
@@ -417,9 +442,11 @@ Intent-aware lex (C++ performance, not sports):
 
				 // =============================================================================
			
 
				 export async function startMcpServer() {
			
 
				     const configPath = getConfigPath();
			
 
				+    const embedProvider = buildMcpEmbedProvider();
			
 
				     const store = await createStore({
			
 
				         dbPath: getDefaultDbPath(),
			
 
				         ...(existsSync(configPath) ? { configPath } : {}),
			
 
				+        ...(embedProvider ? { embedProvider } : {}),
			
 
				     });
			
 
				     const server = await createMcpServer(store);
			
 
				     const transport = new StdioServerTransport();
			
@@ -431,9 +458,11 @@ export async function startMcpServer() {
 
				  */
			
 
				 export async function startMcpHttpServer(port, options) {
			
 
				     const configPath = getConfigPath();
			
 
				+    const embedProvider = buildMcpEmbedProvider();
			
 
				     const store = await createStore({
			
 
				         dbPath: getDefaultDbPath(),
			
 
				         ...(existsSync(configPath) ? { configPath } : {}),
			
 
				+        ...(embedProvider ? { embedProvider } : {}),
			
 
				     });
			
 
				     // Pre-fetch default collection names for REST endpoint
			
 
				     const defaultCollectionNames = await store.getDefaultCollectionNames();
			
@@ -655,6 +684,14 @@ export async function startMcpHttpServer(port, options) {
 
				         sessions.clear();
			
 
				         httpServer.close();
			
 
				         await store.close();
			
 
				+        // Dispose the query-side embedding provider (if any) — releases
			
 
				+        // HTTP keep-alive sockets in OpenAIEmbeddingsProvider (i-loazq6ze).
			
 
				+        if (embedProvider) {
			
 
				+            try {
			
 
				+                await embedProvider.dispose();
			
 
				+            }
			
 
				+            catch { /* ignore */ }
			
 
				+        }
			
 
				     };
			
 
				     process.on("SIGTERM", async () => {
			
 
				         console.error("Shutting down (SIGTERM)...");
			
--- a/dist/store.d.ts
+++ b/dist/store.d.ts
@@ -246,7 +246,7 @@ export type Store = {
 
				     resolveVirtualPath: (virtualPath: string) => string | null;
			
 
				     toVirtualPath: (absolutePath: string) => string | null;
			
 
				     searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
			
 
				-    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
			
 
				+    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => Promise<SearchResult[]>;
			
 
				     expandQuery: (query: string, model?: string, intent?: string) => Promise<ExpandedQuery[]>;
			
 
				     rerank: (query: string, documents: {
			
 
				         file: string;
			
@@ -562,14 +562,38 @@ export declare function chunkDocumentAsync(content: string, maxChars?: number, o
 
				     text: string;
			
 
				     pos: number;
			
 
				 }[]>;
			
 
				+/**
			
 
				+ * Counts the tokens in `text`. Used by `chunkDocumentByTokens` for the
			
 
				+ * safety re-split that splits chunks exceeding `maxTokens`.
			
 
				+ *
			
 
				+ * When `chunkDocumentByTokens` is called WITHOUT a tokenizer (default),
			
 
				+ * it lazily resolves `getDefaultLlamaCpp()` and uses `llm.tokenize` —
			
 
				+ * accurate but expensive (loads the local GGUF embed model + initialises
			
 
				+ * llama.cpp, ~22s on cold cache).
			
 
				+ *
			
 
				+ * Provider-mode callers (HTTP embed providers like the GPU worker on
			
 
				+ * `models` LXC) MUST pass a JS-only approximator to avoid loading the
			
 
				+ * local model entirely. A char-based estimate like
			
 
				+ * `Math.ceil(text.length / 3)` is a reasonable default — it matches the
			
 
				+ * `avgCharsPerToken=3` heuristic used for the initial char-space chunk
			
 
				+ * step, so the safety re-split stays a near no-op while populating the
			
 
				+ * `tokens` field with a stable estimate.
			
 
				+ */
			
 
				+export type TokenCounter = (text: string) => number | Promise<number>;
			
 
				 /**
			
 
				  * Chunk a document by actual token count using the LLM tokenizer.
			
 
				  * More accurate than character-based chunking but requires async.
			
 
				  *
			
 
				- * When filepath and chunkStrategy are provided, uses AST-aware break points
			
 
				- * for supported code files.
			
 
				+ * When `tokenizer` is supplied, it is used in place of the local
			
 
				+ * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
			
 
				+ * `llm.tokenize(...)` is invoked. This lets remote-only deployments
			
 
				+ * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up
			
 
				+ * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
			
 
				+ *
			
 
				+ * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
			
 
				+ * points for supported code files.
			
 
				  */
			
 
				-export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal): Promise<{
			
 
				+export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal, tokenizer?: TokenCounter): Promise<{
			
 
				     text: string;
			
 
				     pos: number;
			
 
				     tokens: number;
			
@@ -709,7 +733,7 @@ export declare function sanitizeFTS5Term(term: string): string;
 
				 export declare function validateSemanticQuery(query: string): string | null;
			
 
				 export declare function validateLexQuery(query: string): string | null;
			
 
				 export declare function searchFTS(db: Database, query: string, limit?: number, collectionName?: string): SearchResult[];
			
 
				-export declare function searchVec(db: Database, query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]>;
			
 
				+export declare function searchVec(db: Database, query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider): Promise<SearchResult[]>;
			
 
				 /**
			
 
				  * Get all unique content hashes that need embeddings (from active documents).
			
 
				  * Returns hash, document body, and a sample path for display purposes.
			
@@ -844,6 +868,14 @@ export interface HybridQueryOptions {
 
				     skipRerank?: boolean;
			
 
				     chunkStrategy?: ChunkStrategy;
			
 
				     hooks?: SearchHooks;
			
 
				+    /**
			
 
				+     * Optional embedding provider for query-side encoding (i-loazq6ze).
			
 
				+     * When supplied, the original-query vector AND any vec/hyde expansion
			
 
				+     * variants are encoded through this provider (HTTP, GPU worker,
			
 
				+     * AutoFallback chain) instead of `getLlm(store).embedBatch(...)`. Skip
			
 
				+     * to keep pre-patch behavior (uses local LlamaCpp).
			
 
				+     */
			
 
				+    embedProvider?: EmbeddingProvider;
			
 
				 }
			
 
				 export interface HybridQueryResult {
			
 
				     file: string;
			
@@ -882,6 +914,12 @@ export interface VectorSearchOptions {
 
				     minScore?: number;
			
 
				     intent?: string;
			
 
				     hooks?: Pick<SearchHooks, 'onExpand'>;
			
 
				+    /**
			
 
				+     * Optional embedding provider for query-side encoding (i-loazq6ze).
			
 
				+     * When supplied, query vectors are encoded via the provider (HTTP /
			
 
				+     * GPU worker / fallback chain) instead of the local llama-cpp model.
			
 
				+     */
			
 
				+    embedProvider?: EmbeddingProvider;
			
 
				 }
			
 
				 export interface VectorSearchResult {
			
 
				     file: string;
			
@@ -918,6 +956,12 @@ export interface StructuredSearchOptions {
 
				     skipRerank?: boolean;
			
 
				     chunkStrategy?: ChunkStrategy;
			
 
				     hooks?: SearchHooks;
			
 
				+    /**
			
 
				+     * Optional embedding provider for query-side encoding (i-loazq6ze).
			
 
				+     * When supplied, vec/hyde sub-queries are batch-encoded via the provider
			
 
				+     * (HTTP / GPU worker / fallback chain) instead of `getLlm(store).embedBatch`.
			
 
				+     */
			
 
				+    embedProvider?: EmbeddingProvider;
			
 
				 }
			
 
				 /**
			
 
				  * Structured search: execute pre-expanded queries without LLM query expansion.
			
--- a/dist/store.js
+++ b/dist/store.js
@@ -1141,6 +1141,15 @@ export async function generateEmbeddings(store, options) {
 
				             }
			
 
				             return session.embedBatch(texts, { model: modelArg });
			
 
				         };
			
 
				+        // JS-only token estimator for the provider path. Char-based with
			
 
				+        // avgCharsPerToken=3 — matches the heuristic the chunker already
			
 
				+        // uses for its initial char-space pass, so the safety re-split is a
			
 
				+        // near no-op while populating the `tokens` field with a stable
			
 
				+        // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only
			
 
				+        // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1.
			
 
				+        const chunkTokenizer = provider
			
 
				+            ? (text) => Math.ceil(text.length / 3)
			
 
				+            : undefined;
			
 
				         for (const batchMeta of batches) {
			
 
				             // Abort early if session has been invalidated
			
 
				             if (!session.isValid) {
			
@@ -1156,7 +1165,7 @@ export async function generateEmbeddings(store, options) {
 
				                 const title = extractTitle(doc.body, doc.path);
			
 
				                 const perCollectionStrategy = collectionStrategies.get(doc.collection);
			
 
				                 const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
			
 
				-                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal);
			
 
				+                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal, chunkTokenizer);
			
 
				                 for (let seq = 0; seq < chunks.length; seq++) {
			
 
				                     batchChunks.push({
			
 
				                         hash: doc.hash,
			
@@ -1316,7 +1325,7 @@ export function createStore(dbPath) {
 
				         toVirtualPath: (absolutePath) => toVirtualPath(db, absolutePath),
			
 
				         // Search
			
 
				         searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
			
 
				-        searchVec: (query, model, limit, collectionName, session, precomputedEmbedding) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
			
 
				+        searchVec: (query, model, limit, collectionName, session, precomputedEmbedding, embedProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider),
			
 
				         // Query expansion & reranking
			
 
				         expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm),
			
 
				         rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm),
			
@@ -1782,11 +1791,26 @@ function chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChar
 
				  * Chunk a document by actual token count using the LLM tokenizer.
			
 
				  * More accurate than character-based chunking but requires async.
			
 
				  *
			
 
				- * When filepath and chunkStrategy are provided, uses AST-aware break points
			
 
				- * for supported code files.
			
 
				- */
			
 
				-export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal) {
			
 
				-    const llm = getDefaultLlamaCpp();
			
 
				+ * When `tokenizer` is supplied, it is used in place of the local
			
 
				+ * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
			
 
				+ * `llm.tokenize(...)` is invoked. This lets remote-only deployments
			
 
				+ * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up
			
 
				+ * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
			
 
				+ *
			
 
				+ * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
			
 
				+ * points for supported code files.
			
 
				+ */
			
 
				+export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal, tokenizer) {
			
 
				+    // Resolve token counter lazily so callers that supply `tokenizer` never
			
 
				+    // touch the local LlamaCpp instance — `getDefaultLlamaCpp()` is only
			
 
				+    // invoked from inside the default closure when it is actually called
			
 
				+    // (i.e. when no tokenizer is supplied).
			
 
				+    let llm;
			
 
				+    const countTokens = tokenizer ?? (async (text) => {
			
 
				+        if (!llm)
			
 
				+            llm = getDefaultLlamaCpp();
			
 
				+        return (await llm.tokenize(text)).length;
			
 
				+    });
			
 
				     // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
			
 
				     // If chunks exceed limit, they'll be re-split with actual ratio
			
 
				     const avgCharsPerToken = 3;
			
@@ -1802,24 +1826,24 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
 
				         // Respect abort signal to avoid runaway tokenization
			
 
				         if (signal?.aborted)
			
 
				             break;
			
 
				-        const tokens = await llm.tokenize(chunk.text);
			
 
				-        if (tokens.length <= maxTokens) {
			
 
				-            results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
			
 
				+        const tokenCount = await countTokens(chunk.text);
			
 
				+        if (tokenCount <= maxTokens) {
			
 
				+            results.push({ text: chunk.text, pos: chunk.pos, tokens: tokenCount });
			
 
				         }
			
 
				         else {
			
 
				             // Chunk is still too large - split it further
			
 
				             // Use actual token count to estimate better char limit
			
 
				-            const actualCharsPerToken = chunk.text.length / tokens.length;
			
 
				+            const actualCharsPerToken = chunk.text.length / tokenCount;
			
 
				             const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
			
 
				             const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
			
 
				             for (const subChunk of subChunks) {
			
 
				                 if (signal?.aborted)
			
 
				                     break;
			
 
				-                const subTokens = await llm.tokenize(subChunk.text);
			
 
				+                const subCount = await countTokens(subChunk.text);
			
 
				                 results.push({
			
 
				                     text: subChunk.text,
			
 
				                     pos: chunk.pos + subChunk.pos,
			
 
				-                    tokens: subTokens.length,
			
 
				+                    tokens: subCount,
			
 
				                 });
			
 
				             }
			
 
				         }
			
@@ -2493,11 +2517,11 @@ export function searchFTS(db, query, limit = 20, collectionName) {
 
				 // =============================================================================
			
 
				 // Vector Search
			
 
				 // =============================================================================
			
 
				-export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding) {
			
 
				+export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding, embedProvider) {
			
 
				     const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
			
 
				     if (!tableExists)
			
 
				         return [];
			
 
				-    const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
			
 
				+    const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider);
			
 
				     if (!embedding)
			
 
				         return [];
			
 
				     // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
			
@@ -2571,7 +2595,24 @@ export async function searchVec(db, query, model, limit = 20, collectionName, se
 
				 // =============================================================================
			
 
				 // Embeddings
			
 
				 // =============================================================================
			
 
				-async function getEmbedding(text, model, isQuery, session, llmOverride) {
			
 
				+async function getEmbedding(text, model, isQuery, session, llmOverride, embedProvider) {
			
 
				+    // When an EmbeddingProvider is supplied, route the encoding through it
			
 
				+    // (HTTP / GPU worker / fallback chain) instead of touching local
			
 
				+    // node-llama-cpp at all. The provider receives the already-formatted text
			
 
				+    // and its own model id: formatQueryForEmbedding / formatDocForEmbedding run
			
 
				+    // first, so embedding parity with the index is preserved.
			
 
				+    if (embedProvider) {
			
 
				+        const providerModel = embedProvider.getModelId();
			
 
				+        const formattedText = isQuery
			
 
				+            ? formatQueryForEmbedding(text, providerModel)
			
 
				+            : formatDocForEmbedding(text, undefined, providerModel);
			
 
				+        // Only forward an AbortSignal when the provider is local-backed;
			
 
				+        // remote providers manage their own timeouts and an LLM-session signal
			
 
				+        // would abort their HTTP request prematurely (i-08ovbvtb).
			
 
				+        const sig = embedProvider.kind === "local" ? session?.signal : undefined;
			
 
				+        const result = await embedProvider.embed(formattedText, sig ? { model: providerModel, signal: sig } : { model: providerModel });
			
 
				+        return result?.embedding ?? null;
			
 
				+    }
			
 
				     // Format text using the appropriate prompt template
			
 
				     const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
			
 
				     const result = session
			
@@ -3236,6 +3277,7 @@ export async function hybridQuery(store, query, options) {
 
				     const intent = options?.intent;
			
 
				     const skipRerank = options?.skipRerank ?? false;
			
 
				     const hooks = options?.hooks;
			
 
				+    const embedProvider = options?.embedProvider;
			
 
				     const rankedLists = [];
			
 
				     const rankedListMeta = [];
			
 
				     const docidMap = new Map(); // filepath -> docid
			
@@ -3300,12 +3342,19 @@ export async function hybridQuery(store, query, options) {
 
				                 vecQueries.push({ text: q.query, queryType: q.type });
			
 
				             }
			
 
				         }
			
 
				-        // Batch embed all vector queries in a single call
			
 
				-        const llm = getLlm(store);
			
 
				-        const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, llm.embedModelName));
			
 
				+        // Batch embed all vector queries in a single call.
			
 
				+        // When `embedProvider` is supplied (i-loazq6ze), route the encode through
			
 
				+        // it (HTTP / GPU worker / AutoFallback chain) instead of warming the
			
 
				+        // local llama-cpp model — this is the whole point of the GPU worker.
			
 
				+        const embedModelName = embedProvider
			
 
				+            ? embedProvider.getModelId()
			
 
				+            : getLlm(store).embedModelName;
			
 
				+        const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName));
			
 
				         hooks?.onEmbedStart?.(textsToEmbed.length);
			
 
				         const embedStart = Date.now();
			
 
				-        const embeddings = await llm.embedBatch(textsToEmbed);
			
 
				+        const embeddings = embedProvider
			
 
				+            ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
			
 
				+            : await getLlm(store).embedBatch(textsToEmbed);
			
 
				         hooks?.onEmbedDone?.(Date.now() - embedStart);
			
 
				         // Run sqlite-vec lookups with pre-computed embeddings
			
 
				         for (let i = 0; i < vecQueries.length; i++) {
			
@@ -3501,6 +3550,7 @@ export async function vectorSearchQuery(store, query, options) {
 
				     const minScore = options?.minScore ?? 0.3;
			
 
				     const collection = options?.collection;
			
 
				     const intent = options?.intent;
			
 
				+    const embedProvider = options?.embedProvider;
			
 
				     const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
			
 
				     if (!hasVectors)
			
 
				         return [];
			
@@ -3509,11 +3559,14 @@ export async function vectorSearchQuery(store, query, options) {
 
				     const allExpanded = await store.expandQuery(query, undefined, intent);
			
 
				     const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
			
 
				     options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
			
 
				-    // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
			
 
				+    // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs.
			
 
				+    // When `embedProvider` is supplied (i-loazq6ze), query encoding is routed
			
 
				+    // through it; the per-call signature `searchVec(...)` accepts the provider
			
 
				+    // as the trailing argument so existing tests / callers stay untouched.
			
 
				     const queryTexts = [query, ...vecExpanded.map(q => q.query)];
			
 
				     const allResults = new Map();
			
 
				     for (const q of queryTexts) {
			
 
				-        const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection);
			
 
				+        const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection, undefined, undefined, embedProvider);
			
 
				         for (const r of vecResults) {
			
 
				             const existing = allResults.get(r.filepath);
			
 
				             if (!existing || r.score > existing.score) {
			
@@ -3560,6 +3613,7 @@ export async function structuredSearch(store, searches, options) {
 
				     const intent = options?.intent;
			
 
				     const skipRerank = options?.skipRerank ?? false;
			
 
				     const hooks = options?.hooks;
			
 
				+    const embedProvider = options?.embedProvider;
			
 
				     const collections = options?.collections;
			
 
				     if (searches.length === 0)
			
 
				         return [];
			
@@ -3613,11 +3667,19 @@ export async function structuredSearch(store, searches, options) {
 
				     if (hasVectors) {
			
 
				         const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
			
 
				         if (vecSearches.length > 0) {
			
 
				-            const llm = getLlm(store);
			
 
				-            const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, llm.embedModelName));
			
 
				+            // Route batch encoding through the supplied EmbeddingProvider when
			
 
				+            // present (i-loazq6ze). Otherwise fall back to the local llama-cpp
			
 
				+            // singleton — preserves pre-patch behavior for callers that don't
			
 
				+            // configure a provider.
			
 
				+            const embedModelName = embedProvider
			
 
				+                ? embedProvider.getModelId()
			
 
				+                : getLlm(store).embedModelName;
			
 
				+            const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModelName));
			
 
				             hooks?.onEmbedStart?.(textsToEmbed.length);
			
 
				             const embedStart = Date.now();
			
 
				-            const embeddings = await llm.embedBatch(textsToEmbed);
			
 
				+            const embeddings = embedProvider
			
 
				+                ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
			
 
				+                : await getLlm(store).embedBatch(textsToEmbed);
			
 
				             hooks?.onEmbedDone?.(Date.now() - embedStart);
			
 
				             for (let i = 0; i < vecSearches.length; i++) {
			
 
				                 const embedding = embeddings[i]?.embedding;
			
--- a/src/index.ts
+++ b/src/index.ts
@@ -119,6 +119,14 @@ export { getDefaultDbPath } from "./store.js";
 
				 // Re-export Maintenance class for CLI housekeeping operations
			
 
				 export { Maintenance } from "./maintenance.js";
			
 
				 
			
 
				+// Local import so the type is in scope of the SDK option interfaces below
			
 
				+// (e.g. `embedProvider?: EmbeddingProvider`). The `export { ... }` re-export
			
 
				+// directly below is for SDK consumers, but TypeScript does not put names
			
 
				+// from `export { ... } from "..."` into the file's own scope — it needs
			
 
				+// a separate `import` for that. Without this import, the references to
			
 
				+// `EmbeddingProvider` in the option interfaces from sibling commit 20e44c9 do not compile.
			
 
				+import type { EmbeddingProvider } from "./embedding/index.js";
			
 
				+
			
 
				 // Re-export embedding provider abstraction for SDK consumers (i-qkarfffa).
			
 
				 // `createEmbeddingProvider` honors QMD_EMBED_ENDPOINT / config-file / kind
			
 
				 // arg precedence; default fallback is the legacy LocalLlamaCppProvider so
			
--- a/test/embedding-store-integration.test.ts
+++ b/test/embedding-store-integration.test.ts
@@ -10,10 +10,34 @@
 
				  *   - getDistinctEmbeddingModels reads content_vectors correctly
			
 
				  */
			
 
				 
			
 
				-import { describe, test, expect, beforeEach, afterEach } from "vitest";
			
 
				+import { describe, test, expect, beforeEach, afterEach, vi } from "vitest";
			
 
				 import { mkdtempSync, rmSync } from "node:fs";
			
 
				 import { tmpdir } from "node:os";
			
 
				 import { join } from "node:path";
			
 
				+
			
 
				+// Mock the llm.js module so `getDefaultLlamaCpp` (the only function
			
 
				+// `chunkDocumentByTokens` reaches into when no `tokenizer` is supplied)
			
 
				+// throws on call. This is the strongest possible assertion of DoD #1
			
 
				+// for i-1rqixh6m: provider-mode embed runs MUST never load node-llama-cpp.
			
 
				+//
			
 
				+// Vitest hoists this `vi.mock` above the `import` lines below, and
			
 
				+// since the module replacement applies to ALL importers (including
			
 
				+// `store.js`), any leaked call from `chunkDocumentByTokens` (or any
			
 
				+// sibling code path) into `getDefaultLlamaCpp` will throw a clear
			
 
				+// "DoD violation" error and fail the test.
			
 
				+vi.mock("../src/llm.js", async (importOriginal) => {
			
 
				+  const actual = await importOriginal<typeof import("../src/llm.js")>();
			
 
				+  return {
			
 
				+    ...actual,
			
 
				+    getDefaultLlamaCpp: vi.fn(() => {
			
 
				+      throw new Error(
			
 
				+        "getDefaultLlamaCpp() invoked when embedProvider was supplied — " +
			
 
				+        "DoD #1 violation (i-1rqixh6m). Provider-mode embed must not load node-llama-cpp.",
			
 
				+      );
			
 
				+    }),
			
 
				+  };
			
 
				+});
			
 
				+
			
 
				 import {
			
 
				   createStore,
			
 
				   generateEmbeddings,
			
@@ -27,6 +51,7 @@ import {
 
				   type ProviderEmbedding,
			
 
				   type ProviderHealth,
			
 
				 } from "../src/embedding/provider.js";
			
 
				+import * as llmModule from "../src/llm.js";
			
 
				 
			
 
				 // ─────────────────────────── Stub provider ───────────────────────────────────
			
 
				 
			
@@ -106,6 +131,10 @@ afterEach(() => {
 
				   } catch { /* ignore */ }
			
 
				   delete process.env.INDEX_PATH;
			
 
				   rmSync(workDir, { recursive: true, force: true });
			
 
				+  // Reset call history on the mocked getDefaultLlamaCpp between tests so
			
 
				+  // each test gets a clean ledger to assert against.
			
 
				+  const spy = llmModule.getDefaultLlamaCpp as unknown as ReturnType<typeof vi.fn>;
			
 
				+  spy.mockClear();
			
 
				 });
			
 
				 
			
 
				 // ─────────────────────────── getDistinctEmbeddingModels ──────────────────────
			
@@ -177,6 +206,32 @@ describe("generateEmbeddings with EmbeddingProvider", () => {
 
				     expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
			
 
				   });
			
 
				 
			
 
				+  test("provider mode does not call getDefaultLlamaCpp (DoD #3 — i-1rqixh6m)", async () => {
			
 
				+    // Stronger assertion than the `store.llm` Proxy above: when the
			
 
				+    // chunker or any sibling code path falls back to the *global*
			
 
				+    // `getDefaultLlamaCpp()` singleton (the previous warm-up source
			
 
				+    // inside `chunkDocumentByTokens`), the module-level mock at the top
			
 
				+    // of this file would throw — so a successful run is itself proof of
			
 
				+    // compliance. We additionally assert call count = 0 for clarity.
			
 
				+    const spy = llmModule.getDefaultLlamaCpp as unknown as ReturnType<typeof vi.fn>;
			
 
				+    expect(spy).not.toHaveBeenCalled();
			
 
				+
			
 
				+    const provider = new StubProvider("embeddinggemma", 4);
			
 
				+    const result = await generateEmbeddings(store, { embedProvider: provider });
			
 
				+
			
 
				+    expect(result.docsProcessed).toBe(2);
			
 
				+    expect(result.chunksEmbedded).toBeGreaterThan(0);
			
 
				+    expect(result.errors).toBe(0);
			
 
				+    expect(provider.totalTextsEmbedded).toBeGreaterThan(0);
			
 
				+
			
 
				+    // The hard assertion: not a single call to the local LlamaCpp
			
 
				+    // singleton during the entire embed run. If `chunkDocumentByTokens`
			
 
				+    // (or any sibling) regresses and reaches `getDefaultLlamaCpp()` on
			
 
				+    // the provider path, this test fails with a clear DoD-violation
			
 
				+    // message — and the run itself would have already thrown.
			
 
				+    expect(spy).not.toHaveBeenCalled();
			
 
				+  });
			
 
				+
			
 
				   test("model-id guard throws ModelMismatchError on mismatch", async () => {
			
 
				     // Pre-populate content_vectors with a different model id
			
 
				     store.ensureVecTable(4);