Преглед изворни кода

feat(embedding): query-side EmbeddingProvider with auto-fallback (i-loazq6ze)

Routes ALL query-side embedding through EmbeddingProvider — searchVec,
hybridQuery, structuredSearch, vectorSearchQuery, SDK store.search, CLI
qmd vsearch / qmd query, and MCP HTTP /query / stdio query tool. When no
provider is configured (zero env-vars / flags), the legacy local
llama-cpp path is preserved verbatim.

Wraps an OpenAI provider in AutoFallbackEmbeddingProvider (i-pdjn2xx5)
when QMD_EMBED_AUTO_FALLBACK is on so transient ai.mm.mk outages
degrade to local instead of throwing.

Adds test/embedding-vsearch.test.ts (10 cases): stub provider routing,
fallback chain success, both-fail surfaces error (single-embed) and
empty results (batch), backward-compat precomputed-embedding path.

Co-Authored-By: Claude <noreply@anthropic.com>
Session-Id: 40541822
root пре 3 недеља
родитељ
комит
20e44c90b5
5 измењених фајлова са 657 додато и 25 уклоњено
  1. 67 0
      src/cli/qmd.ts
  2. 29 1
      src/index.ts
  3. 40 0
      src/mcp/server.ts
  4. 130 24
      src/store.ts
  5. 391 0
      test/embedding-vsearch.test.ts

+ 67 - 0
src/cli/qmd.ts

@@ -1683,6 +1683,53 @@ function parseOptionalPositiveInt(name: string, value: unknown): number | undefi
   return parsed;
 }
 
+/**
+ * Build an `EmbeddingProvider` for the QUERY-side path (vsearch / query)
+ * only when the user has opted in via flags, env vars, or config. Returns
+ * `undefined` for the zero-config case so the legacy
+ * `getDefaultLlamaCpp().embed(...)` path is used unchanged — preserving
+ * pre-patch behavior for callers that have not configured remote embedding
+ * (i-loazq6ze DoD #5: backward compat).
+ *
+ * Resolution mirrors `qmd embed` (factory.resolveProviderKind):
+ *   1. Explicit `--provider` flag → build provider
+ *   2. Non-empty `opts.openai`, a set `QMD_EMBED_PROVIDER` / `_ENDPOINT` /
+ *      `_AUTO_FALLBACK` env var, or a non-"local" resolved kind → build provider
+ *   3. Otherwise → return `undefined` (legacy path)
+ *
+ * Also returns `undefined` (never `null`) when construction throws: the
+ * warning is written to stderr here and the caller uses the legacy path.
+ */
+function buildQueryEmbedProvider(values: Record<string, unknown>): EmbeddingProvider | undefined {
+  const providerCliKind = parseProviderKind(values["provider"]);
+  const opts = buildProviderOpts(values, providerCliKind);
+
+  // Determine whether the user opted into a provider. The factory's resolve
+  // step returns "local" by default; without explicit opt-in (flag/env/
+  // config), we keep the legacy path with no construction overhead.
+  const resolved = resolveProviderKind(opts);
+  const hasProviderFlag = providerCliKind !== undefined;
+  const hasOpenAiOverride = !!opts.openai && Object.keys(opts.openai).length > 0;
+  const envOptIn = !!(
+    process.env.QMD_EMBED_PROVIDER ||
+    process.env.QMD_EMBED_ENDPOINT ||
+    process.env.QMD_EMBED_AUTO_FALLBACK
+  );
+
+  if (!hasProviderFlag && !hasOpenAiOverride && !envOptIn && resolved === "local") {
+    return undefined;
+  }
+
+  try {
+    return createEmbeddingProvider(opts);
+  } catch (err) {
+    process.stderr.write(
+      `${c.yellow}Warning: failed to build query embedding provider — using local fallback (${err instanceof Error ? err.message : String(err)})${c.reset}\n`,
+    );
+    return undefined;
+  }
+}
+
 /**
  * Translate `cli.values` into `CreateEmbeddingProviderOptions`. CLI flags
  * win over env vars (the factory itself reads env when these are unset).
@@ -1872,6 +1919,12 @@ type OutputOptions = {
   intent?: string;       // Domain intent for disambiguation
   skipRerank?: boolean;  // Skip LLM reranking, use RRF scores only
   chunkStrategy?: ChunkStrategy;  // "auto" (default) or "regex"
+  /**
+   * Optional embedding provider for query-side encoding (i-loazq6ze).
+   * Built per invocation by `buildQueryEmbedProvider` (same flag/env/config
+   * precedence as `qmd embed`) and threaded into the vsearch/query paths.
+   */
+  embedProvider?: EmbeddingProvider;
 };
 
 // Highlight query terms in text (skip short words < 3 chars)
@@ -2341,12 +2394,20 @@ async function vectorSearch(query: string, opts: OutputOptions, _model: string =
 
   checkIndexHealth(store.db);
 
+  // Query-side embedding provider (i-loazq6ze), already built by the caller
+  // via `buildQueryEmbedProvider` (same precedence as `qmd embed`: explicit
+  // `--provider` flag → env vars → `~/.config/qmd/config.json`). It is
+  // `undefined` for zero-config callers, which keeps them on the legacy
+  // llama-cpp path with no observable change.
+  const embedProvider = opts.embedProvider;
+
   await withLLMSession(async () => {
     let results = await vectorSearchQuery(store, query, {
       collection: singleCollection,
       limit: opts.all ? 500 : (opts.limit || 10),
       minScore: opts.minScore || 0.3,
       intent: opts.intent,
+      ...(embedProvider ? { embedProvider } : {}),
       hooks: {
         onExpand: (original, expanded) => {
           logExpansionTree(original, expanded);
@@ -2426,6 +2487,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
         explain: !!opts.explain,
         intent,
         chunkStrategy: opts.chunkStrategy,
+        ...(opts.embedProvider ? { embedProvider: opts.embedProvider } : {}),
         hooks: {
           onEmbedStart: (count) => {
             process.stderr.write(`${c.dim}Embedding ${count} ${count === 1 ? 'query' : 'queries'}...${c.reset}`);
@@ -2454,6 +2516,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
         explain: !!opts.explain,
         intent,
         chunkStrategy: opts.chunkStrategy,
+        ...(opts.embedProvider ? { embedProvider: opts.embedProvider } : {}),
         hooks: {
           onStrongSignal: (score) => {
             process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`);
@@ -3243,6 +3306,9 @@ if (isMain) {
       if (!cli.values["min-score"]) {
         cli.opts.minScore = 0.3;
       }
+      // Build query-side embedding provider (i-loazq6ze).
+      // Returns undefined for zero-config callers (legacy local path).
+      cli.opts.embedProvider = buildQueryEmbedProvider(cli.values);
       await vectorSearch(cli.query, cli.opts);
       break;
 
@@ -3252,6 +3318,7 @@ if (isMain) {
         console.error("Usage: qmd query [options] <query>");
         process.exit(1);
       }
+      cli.opts.embedProvider = buildQueryEmbedProvider(cli.values);
       await querySearch(cli.query, cli.opts);
       break;
 

+ 29 - 1
src/index.ts

@@ -195,6 +195,14 @@ export interface SearchOptions {
   explain?: boolean;
   /** Chunk strategy: "auto" (default, uses AST for code files) or "regex" (legacy) */
   chunkStrategy?: ChunkStrategy;
+  /**
+   * Optional embedding provider for query-side encoding (i-loazq6ze).
+   * When supplied, vec/hyde sub-queries are encoded through the provider
+   * (HTTP / GPU worker / AutoFallback chain) instead of the local llama-cpp
+   * model. When omitted, the store-level `StoreOptions.embedProvider` is
+   * used if set; otherwise the local llama-cpp path is kept unchanged.
+   */
+  embedProvider?: EmbeddingProvider;
 }
 
 /**
@@ -211,6 +219,11 @@ export interface LexSearchOptions {
 export interface VectorSearchOptions {
   limit?: number;
   collection?: string;
+  /**
+   * Optional embedding provider for query encoding (i-loazq6ze). Forwarded
+   * to `searchVec`; falls back to the store-level provider, then llama-cpp.
+   */
+  embedProvider?: EmbeddingProvider;
 }
 
 /**
@@ -234,6 +247,14 @@ export interface StoreOptions {
   configPath?: string;
   /** Inline collection config (mutually exclusive with `configPath`) */
   config?: CollectionConfig;
+  /**
+   * Optional default embedding provider for query encoding (i-loazq6ze).
+   * When set, every `store.search(...)` call uses this provider unless the
+   * caller passes its own `embedProvider` in `SearchOptions`. MCP / HTTP
+   * server constructs the provider once at startup and injects it here so
+   * every query routes through the GPU worker.
+   */
+  embedProvider?: EmbeddingProvider;
 }
 
 /**
@@ -421,6 +442,8 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
         ...(opts.collections ?? []),
       ];
       const skipRerank = opts.rerank === false;
+      // Per-call provider wins over store-level default.
+      const provider = opts.embedProvider ?? options.embedProvider;
 
       if (opts.queries) {
         // Pre-expanded queries — use structuredSearch
@@ -432,6 +455,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
           intent: opts.intent,
           skipRerank,
           chunkStrategy: opts.chunkStrategy,
+          ...(provider ? { embedProvider: provider } : {}),
         });
       }
 
@@ -444,10 +468,14 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
         intent: opts.intent,
         skipRerank,
         chunkStrategy: opts.chunkStrategy,
+        ...(provider ? { embedProvider: provider } : {}),
       });
     },
     searchLex: async (q, opts) => internal.searchFTS(q, opts?.limit, opts?.collection),
-    searchVector: async (q, opts) => internal.searchVec(q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection),
+    searchVector: async (q, opts) => internal.searchVec(
+      q, DEFAULT_EMBED_MODEL, opts?.limit, opts?.collection,
+      undefined, undefined, opts?.embedProvider ?? options.embedProvider,
+    ),
     expandQuery: async (q, opts) => internal.expandQuery(q, undefined, opts?.intent),
     get: async (pathOrDocid, opts) => internal.findDocument(pathOrDocid, opts),
     getDocumentBody: async (pathOrDocid, opts) => {

+ 40 - 0
src/mcp/server.ts

@@ -25,12 +25,43 @@ import {
   addLineNumbers,
   getDefaultDbPath,
   DEFAULT_MULTI_GET_MAX_BYTES,
+  createEmbeddingProvider,
+  resolveProviderKind,
   type QMDStore,
   type ExpandedQuery,
   type IndexStatus,
+  type EmbeddingProvider,
 } from "../index.js";
 import { getConfigPath } from "../collections.js";
 
+/**
+ * Build a query-side embedding provider (i-loazq6ze) for MCP server start.
+ * Env/config-only counterpart of the CLI's `buildQueryEmbedProvider`: returns
+ * `undefined` (legacy local llama-cpp path) unless `QMD_EMBED_PROVIDER`,
+ * `QMD_EMBED_ENDPOINT` or `QMD_EMBED_AUTO_FALLBACK` is non-empty or the
+ * resolved kind is not "local". Construction errors are logged to stderr.
+ */
+function buildMcpEmbedProvider(): EmbeddingProvider | undefined {
+  const env = process.env;
+  const envOptIn = !!(
+    env.QMD_EMBED_PROVIDER ||
+    env.QMD_EMBED_ENDPOINT ||
+    env.QMD_EMBED_AUTO_FALLBACK
+  );
+  // Probe resolved kind via the factory's standard precedence (env + config).
+  const resolved = resolveProviderKind({});
+  if (!envOptIn && resolved === "local") return undefined;
+  try {
+    return createEmbeddingProvider({});
+  } catch (err) {
+    // Log + fall through to undefined so legacy local path is used.
+    process.stderr.write(
+      `[qmd mcp] WARN failed to build embedding provider — using local fallback: ${err instanceof Error ? err.message : String(err)}\n`,
+    );
+    return undefined;
+  }
+}
+
 // =============================================================================
 // Types for structured content
 // =============================================================================
@@ -539,9 +570,11 @@ Intent-aware lex (C++ performance, not sports):
 
 export async function startMcpServer(): Promise<void> {
   const configPath = getConfigPath();
+  const embedProvider = buildMcpEmbedProvider();
   const store = await createStore({
     dbPath: getDefaultDbPath(),
     ...(existsSync(configPath) ? { configPath } : {}),
+    ...(embedProvider ? { embedProvider } : {}),
   });
   const server = await createMcpServer(store);
   const transport = new StdioServerTransport();
@@ -564,9 +597,11 @@ export type HttpServerHandle = {
  */
 export async function startMcpHttpServer(port: number, options?: { quiet?: boolean }): Promise<HttpServerHandle> {
   const configPath = getConfigPath();
+  const embedProvider = buildMcpEmbedProvider();
   const store = await createStore({
     dbPath: getDefaultDbPath(),
     ...(existsSync(configPath) ? { configPath } : {}),
+    ...(embedProvider ? { embedProvider } : {}),
   });
 
   // Pre-fetch default collection names for REST endpoint
@@ -810,6 +845,11 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
     sessions.clear();
     httpServer.close();
     await store.close();
+    // Dispose the query-side embedding provider (if any) — releases
+    // HTTP keep-alive sockets in OpenAIEmbeddingsProvider (i-loazq6ze).
+    if (embedProvider) {
+      try { await embedProvider.dispose(); } catch { /* ignore */ }
+    }
   };
 
   process.on("SIGTERM", async () => {

+ 130 - 24
src/store.ts

@@ -1120,7 +1120,7 @@ export type Store = {
 
   // Search
   searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
-  searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
+  searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => Promise<SearchResult[]>;
 
   // Query expansion & reranking
   expandQuery: (query: string, model?: string, intent?: string) => Promise<ExpandedQuery[]>;
@@ -1565,6 +1565,16 @@ export async function generateEmbeddings(
       return session.embedBatch(texts, { model: modelArg });
     };
 
+    // JS-only token estimator for the provider path. Char-based with
+    // avgCharsPerToken=3 — matches the heuristic the chunker already
+    // uses for its initial char-space pass, so the safety re-split is a
+    // near no-op while populating the `tokens` field with a stable
+    // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only
+    // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1.
+    const chunkTokenizer: TokenCounter | undefined = provider
+      ? (text: string) => Math.ceil(text.length / 3)
+      : undefined;
+
     for (const batchMeta of batches) {
       // Abort early if session has been invalidated
       if (!session.isValid) {
@@ -1588,6 +1598,7 @@ export async function generateEmbeddings(
           doc.path,
           chunkStrategy,
           session.signal,
+          chunkTokenizer,
         );
 
         for (let seq = 0; seq < chunks.length; seq++) {
@@ -1764,7 +1775,7 @@ export function createStore(dbPath?: string): Store {
 
     // Search
     searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName),
-    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
+    searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider),
 
     // Query expansion & reranking
     expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm),
@@ -2453,12 +2464,37 @@ function chunkByFunctionRanges(
   return out;
 }
 
+/**
+ * Counts the tokens in `text`. Used by `chunkDocumentByTokens` for the
+ * safety re-split that splits chunks exceeding `maxTokens`.
+ *
+ * When `chunkDocumentByTokens` is called WITHOUT a tokenizer (default),
+ * it lazily resolves `getDefaultLlamaCpp()` and uses `llm.tokenize` —
+ * accurate but expensive (loads the local GGUF embed model + initialises
+ * llama.cpp, ~22s on cold cache).
+ *
+ * Provider-mode callers (HTTP embed providers like the GPU worker on
+ * `models` LXC) MUST pass a JS-only approximator to avoid loading the
+ * local model entirely. A char-based estimate like
+ * `Math.ceil(text.length / 3)` is a reasonable default — it matches the
+ * `avgCharsPerToken=3` heuristic used for the initial char-space chunk
+ * step, so the safety re-split stays a near no-op while populating the
+ * `tokens` field with a stable estimate.
+ */
+export type TokenCounter = (text: string) => number | Promise<number>;
+
 /**
  * Chunk a document by actual token count using the LLM tokenizer.
  * More accurate than character-based chunking but requires async.
  *
- * When filepath and chunkStrategy are provided, uses AST-aware break points
- * for supported code files.
+ * When `tokenizer` is supplied, it is used in place of the local
+ * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
+ * `llm.tokenize(...)` is invoked. This lets remote-only deployments
+ * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up
+ * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
+ *
+ * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
+ * points for supported code files.
  */
 export async function chunkDocumentByTokens(
   content: string,
@@ -2467,9 +2503,18 @@ export async function chunkDocumentByTokens(
   windowTokens: number = CHUNK_WINDOW_TOKENS,
   filepath?: string,
   chunkStrategy: ChunkStrategy = "regex",
-  signal?: AbortSignal
+  signal?: AbortSignal,
+  tokenizer?: TokenCounter,
 ): Promise<{ text: string; pos: number; tokens: number }[]> {
-  const llm = getDefaultLlamaCpp();
+  // Resolve token counter lazily so callers that supply `tokenizer` never
+  // touch the local LlamaCpp instance — `getDefaultLlamaCpp()` is only
+  // invoked from inside the default closure when it is actually called
+  // (i.e. when no tokenizer is supplied).
+  let llm: ReturnType<typeof getDefaultLlamaCpp> | undefined;
+  const countTokens: TokenCounter = tokenizer ?? (async (text: string) => {
+    if (!llm) llm = getDefaultLlamaCpp();
+    return (await llm.tokenize(text)).length;
+  });
 
   // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
   // If chunks exceed limit, they'll be re-split with actual ratio
@@ -2489,25 +2534,25 @@ export async function chunkDocumentByTokens(
     // Respect abort signal to avoid runaway tokenization
     if (signal?.aborted) break;
 
-    const tokens = await llm.tokenize(chunk.text);
+    const tokenCount = await countTokens(chunk.text);
 
-    if (tokens.length <= maxTokens) {
-      results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
+    if (tokenCount <= maxTokens) {
+      results.push({ text: chunk.text, pos: chunk.pos, tokens: tokenCount });
     } else {
       // Chunk is still too large - split it further
       // Use actual token count to estimate better char limit
-      const actualCharsPerToken = chunk.text.length / tokens.length;
+      const actualCharsPerToken = chunk.text.length / tokenCount;
       const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
 
       const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
 
       for (const subChunk of subChunks) {
         if (signal?.aborted) break;
-        const subTokens = await llm.tokenize(subChunk.text);
+        const subCount = await countTokens(subChunk.text);
         results.push({
           text: subChunk.text,
           pos: chunk.pos + subChunk.pos,
-          tokens: subTokens.length,
+          tokens: subCount,
         });
       }
     }
@@ -3260,11 +3305,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
 // Vector Search
 // =============================================================================
 
-export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]> {
+export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider): Promise<SearchResult[]> {
   const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
   if (!tableExists) return [];
 
-  const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
+  const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider);
   if (!embedding) return [];
 
   // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
@@ -3350,7 +3395,24 @@ export async function searchVec(db: Database, query: string, model: string, limi
 // Embeddings
 // =============================================================================
 
-async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp): Promise<number[] | null> {
+async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp, embedProvider?: EmbeddingProvider): Promise<number[] | null> {
+  // When an EmbeddingProvider is supplied, route the encoding through it
+  // (HTTP / GPU worker / fallback chain) instead of touching local
+  // node-llama-cpp at all. The text is formatted first (query or document
+  // template, keyed on the provider's own model id — the `model` argument
+  // is ignored on this path) so embedding parity with the index is preserved.
+  if (embedProvider) {
+    const providerModel = embedProvider.getModelId();
+    const formattedText = isQuery
+      ? formatQueryForEmbedding(text, providerModel)
+      : formatDocForEmbedding(text, undefined, providerModel);
+    // Only forward an AbortSignal when the provider is local-backed;
+    // remote providers manage their own timeouts and an LLM-session signal
+    // would abort their HTTP request prematurely (i-08ovbvtb).
+    const sig = embedProvider.kind === "local" ? session?.signal : undefined;
+    const result = await embedProvider.embed(formattedText, sig ? { model: providerModel, signal: sig } : { model: providerModel });
+    return result?.embedding ?? null;
+  }
   // Format text using the appropriate prompt template
   const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
   const result = session
@@ -4147,6 +4209,14 @@ export interface HybridQueryOptions {
   skipRerank?: boolean;     // skip LLM reranking, use only RRF scores
   chunkStrategy?: ChunkStrategy;
   hooks?: SearchHooks;
+  /**
+   * Optional embedding provider for query-side encoding (i-loazq6ze).
+   * When supplied, the original-query vector AND any vec/hyde expansion
+   * variants are encoded through this provider (HTTP, GPU worker,
+   * AutoFallback chain) instead of `getLlm(store).embedBatch(...)`. Omit
+   * to keep pre-patch behavior (uses local LlamaCpp).
+   */
+  embedProvider?: EmbeddingProvider;
 }
 
 export interface HybridQueryResult {
@@ -4194,6 +4264,7 @@ export async function hybridQuery(
   const intent = options?.intent;
   const skipRerank = options?.skipRerank ?? false;
   const hooks = options?.hooks;
+  const embedProvider = options?.embedProvider;
 
   const rankedLists: RankedResult[][] = [];
   const rankedListMeta: RankedListMeta[] = [];
@@ -4267,12 +4338,19 @@ export async function hybridQuery(
       }
     }
 
-    // Batch embed all vector queries in a single call
-    const llm = getLlm(store);
-    const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, llm.embedModelName));
+    // Batch embed all vector queries in a single call.
+    // When `embedProvider` is supplied (i-loazq6ze), route the encode through
+    // it (HTTP / GPU worker / AutoFallback chain) instead of warming the
+    // local llama-cpp model — this is the whole point of the GPU worker.
+    const embedModelName = embedProvider
+      ? embedProvider.getModelId()
+      : getLlm(store).embedModelName;
+    const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName));
     hooks?.onEmbedStart?.(textsToEmbed.length);
     const embedStart = Date.now();
-    const embeddings = await llm.embedBatch(textsToEmbed);
+    const embeddings = embedProvider
+      ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
+      : await getLlm(store).embedBatch(textsToEmbed);
     hooks?.onEmbedDone?.(Date.now() - embedStart);
 
     // Run sqlite-vec lookups with pre-computed embeddings
@@ -4468,6 +4546,12 @@ export interface VectorSearchOptions {
   minScore?: number;        // default 0.3
   intent?: string;          // domain intent hint for disambiguation
   hooks?: Pick<SearchHooks, 'onExpand'>;
+  /**
+   * Optional embedding provider for query-side encoding (i-loazq6ze).
+   * When supplied, query vectors are encoded via the provider (HTTP /
+   * GPU worker / fallback chain) instead of the local llama-cpp model.
+   */
+  embedProvider?: EmbeddingProvider;
 }
 
 export interface VectorSearchResult {
@@ -4498,6 +4582,7 @@ export async function vectorSearchQuery(
   const minScore = options?.minScore ?? 0.3;
   const collection = options?.collection;
   const intent = options?.intent;
+  const embedProvider = options?.embedProvider;
 
   const hasVectors = !!store.db.prepare(
     `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
@@ -4510,11 +4595,17 @@ export async function vectorSearchQuery(
   const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
   options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
 
-  // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
+  // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs.
+  // When `embedProvider` is supplied (i-loazq6ze), query encoding is routed
+  // through it; the per-call signature `searchVec(...)` accepts the provider
+  // as the trailing argument so existing tests / callers stay untouched.
   const queryTexts = [query, ...vecExpanded.map(q => q.query)];
   const allResults = new Map<string, VectorSearchResult>();
   for (const q of queryTexts) {
-    const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection);
+    const vecResults = await store.searchVec(
+      q, DEFAULT_EMBED_MODEL, limit, collection,
+      undefined, undefined, embedProvider,
+    );
     for (const r of vecResults) {
       const existing = allResults.get(r.filepath);
       if (!existing || r.score > existing.score) {
@@ -4557,6 +4648,12 @@ export interface StructuredSearchOptions {
   skipRerank?: boolean;
   chunkStrategy?: ChunkStrategy;
   hooks?: SearchHooks;
+  /**
+   * Optional embedding provider for query-side encoding (i-loazq6ze).
+   * When supplied, vec/hyde sub-queries are batch-encoded via the provider
+   * (HTTP / GPU worker / fallback chain) instead of `getLlm(store).embedBatch`.
+   */
+  embedProvider?: EmbeddingProvider;
 }
 
 /**
@@ -4589,6 +4686,7 @@ export async function structuredSearch(
   const intent = options?.intent;
   const skipRerank = options?.skipRerank ?? false;
   const hooks = options?.hooks;
+  const embedProvider = options?.embedProvider;
 
   const collections = options?.collections;
 
@@ -4651,11 +4749,19 @@ export async function structuredSearch(
         s.type === 'vec' || s.type === 'hyde'
     );
     if (vecSearches.length > 0) {
-      const llm = getLlm(store);
-      const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, llm.embedModelName));
+      // Route batch encoding through the supplied EmbeddingProvider when
+      // present (i-loazq6ze). Otherwise fall back to the local llama-cpp
+      // singleton — preserves pre-patch behavior for callers that don't
+      // configure a provider.
+      const embedModelName = embedProvider
+        ? embedProvider.getModelId()
+        : getLlm(store).embedModelName;
+      const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModelName));
       hooks?.onEmbedStart?.(textsToEmbed.length);
       const embedStart = Date.now();
-      const embeddings = await llm.embedBatch(textsToEmbed);
+      const embeddings = embedProvider
+        ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
+        : await getLlm(store).embedBatch(textsToEmbed);
       hooks?.onEmbedDone?.(Date.now() - embedStart);
 
       for (let i = 0; i < vecSearches.length; i++) {

+ 391 - 0
test/embedding-vsearch.test.ts

@@ -0,0 +1,391 @@
+/**
+ * embedding-vsearch.test.ts — Query-side EmbeddingProvider integration
+ * (issue i-loazq6ze).
+ *
+ * Verifies that `searchVec`, `structuredSearch`, and `vectorSearchQuery`
+ * route query encoding through the supplied `EmbeddingProvider` instead
+ * of the local `node-llama-cpp` model when one is configured. Also covers
+ * the AutoFallback path so a transient remote outage degrades to local
+ * instead of throwing.
+ *
+ * The store is a throwaway on-disk index (sqlite + sqlite-vec) in a per-test
+ * temp directory; the provider is a stub that records calls and returns
+ * deterministic vectors so we can verify routing without real services.
+ */
+
+import { describe, test, expect, beforeEach, afterEach } from "vitest";
+import { mkdtempSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import {
+  createStore,
+  searchVec,
+  structuredSearch,
+  vectorSearchQuery,
+  type Store,
+  type ExpandedQuery,
+} from "../src/store.js";
+import {
+  AutoFallbackEmbeddingProvider,
+  CircuitOpenError,
+  type EmbeddingProvider,
+  type ProviderEmbedding,
+  type ProviderHealth,
+} from "../src/embedding/index.js";
+
+// ─────────────────────────── Stub providers ──────────────────────────────────
+
+/** Deterministic stub — embed/embedBatch count calls, record their texts, and return copies of one fixed vector. */
+class FixedProvider implements EmbeddingProvider {
+  readonly kind = "openai" as const;
+  embedCalls = 0; // number of embed() invocations
+  embedBatchCalls = 0; // number of embedBatch() invocations (calls, not texts)
+  lastEmbedTexts: string[] = []; // every text seen by embed/embedBatch, in call order; never reset
+  constructor(
+    private readonly modelId: string,
+    private readonly embedding: number[],
+  ) {}
+  getModelId(): string { return this.modelId; }
+  getDimensions(): number | undefined { return this.embedding.length; }
+  async healthcheck(): Promise<ProviderHealth> {
+    return { ok: true, model: this.modelId, dimensions: this.embedding.length };
+  }
+  async embed(text: string): Promise<ProviderEmbedding | null> {
+    this.embedCalls++;
+    this.lastEmbedTexts.push(text);
+    return { embedding: this.embedding.slice(), model: this.modelId }; // fresh copy of the vector per call
+  }
+  async embedBatch(texts: string[]): Promise<(ProviderEmbedding | null)[]> {
+    this.embedBatchCalls++;
+    this.lastEmbedTexts.push(...texts);
+    return texts.map(() => ({ embedding: this.embedding.slice(), model: this.modelId })); // one result per input text
+  }
+  async dispose(): Promise<void> {}
+}
+
+/** Throws CircuitOpenError from every embed/embedBatch call (after counting it) — simulates "remote down". */
+class CircuitOpenProvider implements EmbeddingProvider {
+  readonly kind = "openai" as const;
+  embedCalls = 0; // embed() attempts, incremented before the throw
+  embedBatchCalls = 0; // embedBatch() attempts, incremented before the throw
+  constructor(private readonly modelId: string = "embeddinggemma") {}
+  getModelId(): string { return this.modelId; }
+  getDimensions(): number | undefined { return undefined; }
+  async healthcheck(): Promise<ProviderHealth> {
+    return { ok: false, model: this.modelId, detail: "circuit open" }; // reports failure rather than throwing
+  }
+  async embed(): Promise<ProviderEmbedding | null> {
+    this.embedCalls++;
+    throw new CircuitOpenError("remote down");
+  }
+  async embedBatch(): Promise<(ProviderEmbedding | null)[]> {
+    this.embedBatchCalls++;
+    throw new CircuitOpenError("remote down");
+  }
+  async dispose(): Promise<void> {}
+}
+
+/** Throws a generic Error from every embed/embedBatch call — simulates total backend failure. */
+class AlwaysFailProvider implements EmbeddingProvider {
+  readonly kind = "openai" as const;
+  constructor(private readonly modelId: string = "embeddinggemma") {}
+  getModelId(): string { return this.modelId; }
+  getDimensions(): number | undefined { return undefined; }
+  async healthcheck(): Promise<ProviderHealth> {
+    return { ok: false, model: this.modelId, detail: "always fail" }; // reports failure rather than throwing
+  }
+  async embed(): Promise<ProviderEmbedding | null> {
+    throw new Error("backend unreachable"); // message is matched by /backend unreachable/ in the searchVec suite
+  }
+  async embedBatch(): Promise<(ProviderEmbedding | null)[]> {
+    throw new Error("backend unreachable");
+  }
+  async dispose(): Promise<void> {}
+}
+
+// ─────────────────────────── Test setup ──────────────────────────────────────
+
+let workDir: string; // per-test temp directory (created in beforeEach, removed in afterEach)
+let store: Store; // fresh store per test (opened in beforeEach, closed in afterEach)
+
+const DIM = 4; // vector width passed to ensureVecTable; equals FIXED_VEC.length
+// Fixed embedding used for both index vectors and query vectors so the
+// stub provider's response will match the indexed vector exactly (cosine
+// distance ≈ 0 → similarity ≈ 1).
+const FIXED_VEC = [0.1, 0.2, 0.3, 0.4];
+
+beforeEach(() => {
+  workDir = mkdtempSync(join(tmpdir(), "qmd-vsearch-test-"));
+  process.env.INDEX_PATH = join(workDir, "index.sqlite"); // overwrites any inherited INDEX_PATH for the test
+  store = createStore(process.env.INDEX_PATH);
+
+  const now = "2026-04-28T00:00:00Z"; // fixed timestamp keeps the fixture rows deterministic
+  store.db
+    .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
+    .run("hashA", "Alpha document body about query encoding via remote provider.", now);
+  store.db
+    .prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
+    .run("hashB", "Beta document body about fallback chain semantics.", now);
+  store.db
+    .prepare(`INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`)
+    .run("hashA", "test", "alpha.md", "Alpha", now, now, 1);
+  store.db
+    .prepare(`INSERT INTO documents (hash, collection, path, title, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, ?)`)
+    .run("hashB", "test", "beta.md", "Beta", now, now, 1);
+
+  // Register one content_vectors row (seq 0, pos 0) per document and seed vectors_vec with
+  // the same fixed vector so the stub provider's query embedding lines up with the index entries.
+  store.ensureVecTable(DIM);
+  store.db
+    .prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'embeddinggemma', ?)`)
+    .run("hashA", now);
+  store.db
+    .prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'embeddinggemma', ?)`)
+    .run("hashB", now);
+  store.db
+    .prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`)
+    .run("hashA_0", new Float32Array(FIXED_VEC));
+  store.db
+    .prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`)
+    .run("hashB_0", new Float32Array(FIXED_VEC));
+});
+
+afterEach(() => {
+  try { store.close(); } catch { /* best-effort: a close failure must not block temp-dir cleanup */ }
+  delete process.env.INDEX_PATH; // NOTE(review): removes the var outright; a pre-existing INDEX_PATH is not restored
+  rmSync(workDir, { recursive: true, force: true });
+});
+
+// ─────────────────────────── searchVec ──────────────────────────────────────
+
+describe("searchVec with EmbeddingProvider", () => {
+  test("encodes the query through the provider when supplied", async () => {
+    const provider = new FixedProvider("embeddinggemma", FIXED_VEC);
+
+    // Sanity: store.llm is not set; if searchVec touched local llama-cpp
+    // it would fail (no model loaded). Provider routing must be exclusive.
+    const results = await searchVec(
+      store.db, "hello", "embeddinggemma", 10,
+      undefined, undefined, undefined, provider,
+    );
+
+    expect(provider.embedCalls).toBe(1);
+    expect(provider.embedBatchCalls).toBe(0);
+    expect(results.length).toBeGreaterThan(0);
+    // Both alpha + beta share the same vector — both should be returned.
+    const filepaths = results.map((r) => r.filepath).sort();
+    expect(filepaths).toEqual(["qmd://test/alpha.md", "qmd://test/beta.md"]);
+  });
+
+  test("provider mode does not access the local llama-cpp instance", async () => {
+    const provider = new FixedProvider("embeddinggemma", FIXED_VEC);
+
+    // If anything touches `store.llm`, the proxy throws (mirrors the i-08ovbvtb
+    // guard in embedding-store-integration.test.ts). NOTE(review): searchVec gets
+    // `store.db`, not `store` — confirm it can reach `store.llm` at all.
+    store.llm = new Proxy({}, {
+      get(_target, prop) {
+        throw new Error(
+          `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
+        );
+      },
+    }) as never;
+
+    const results = await searchVec(
+      store.db, "hello", "embeddinggemma", 10,
+      undefined, undefined, undefined, provider,
+    );
+    expect(results.length).toBeGreaterThan(0);
+  });
+
+  test("survives transient primary failure via AutoFallback", async () => {
+    const primary = new CircuitOpenProvider("embeddinggemma");
+    const fallback = new FixedProvider("embeddinggemma", FIXED_VEC);
+    const wrapped = new AutoFallbackEmbeddingProvider({
+      primary,
+      fallback,
+      warn: () => { /* swallow noisy WARN in tests */ },
+    });
+
+    const results = await searchVec(
+      store.db, "fallback test", "embeddinggemma", 10,
+      undefined, undefined, undefined, wrapped,
+    );
+
+    expect(primary.embedCalls).toBe(1); // primary was tried exactly once
+    expect(fallback.embedCalls).toBe(1); // then the fallback served the query
+    expect(results.length).toBeGreaterThan(0);
+  });
+
+  test("surfaces error when both primary AND fallback fail", async () => {
+    const primary = new AlwaysFailProvider("embeddinggemma");
+    const fallback = new AlwaysFailProvider("embeddinggemma");
+    const wrapped = new AutoFallbackEmbeddingProvider({
+      primary,
+      fallback,
+      warn: () => { /* swallow */ },
+    });
+
+    await expect(
+      searchVec(
+        store.db, "doomed", "embeddinggemma", 10,
+        undefined, undefined, undefined, wrapped,
+      ),
+    ).rejects.toThrow(/backend unreachable/);
+  });
+});
+
+// ─────────────────────────── structuredSearch ───────────────────────────────
+
+describe("structuredSearch with EmbeddingProvider", () => {
+  test("uses provider.embedBatch for vec/hyde sub-queries", async () => {
+    const provider = new FixedProvider("embeddinggemma", FIXED_VEC);
+
+    // Deny access to the local llama-cpp — proves the provider path is exclusive.
+    store.llm = new Proxy({}, {
+      get(_target, prop) {
+        throw new Error(
+          `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
+        );
+      },
+    }) as never;
+
+    const queries: ExpandedQuery[] = [
+      { type: "vec", query: "what is the fallback chain about" },
+      { type: "hyde", query: "Fallback chains route around primary failure transparently." },
+    ];
+
+    const results = await structuredSearch(store, queries, {
+      skipRerank: true, // reranker uses local llm — skip in this isolation test
+      embedProvider: provider,
+    });
+
+    // One batch call covering both vec/hyde queries.
+    expect(provider.embedBatchCalls).toBe(1);
+    expect(provider.lastEmbedTexts.length).toBe(2);
+    expect(results.length).toBeGreaterThan(0);
+  });
+
+  test("AutoFallback covers structuredSearch query batch", async () => {
+    const primary = new CircuitOpenProvider("embeddinggemma");
+    const fallback = new FixedProvider("embeddinggemma", FIXED_VEC);
+    const wrapped = new AutoFallbackEmbeddingProvider({
+      primary,
+      fallback,
+      warn: () => { /* swallow */ },
+    });
+
+    const queries: ExpandedQuery[] = [
+      { type: "vec", query: "fallback test" },
+    ];
+
+    const results = await structuredSearch(store, queries, {
+      skipRerank: true,
+      embedProvider: wrapped,
+    });
+
+    expect(primary.embedBatchCalls).toBe(1); // primary batch was attempted once
+    expect(fallback.embedBatchCalls).toBe(1); // fallback then served the batch
+    expect(results.length).toBeGreaterThan(0);
+  });
+
+  test("structuredSearch degrades to empty results when both providers fail (batch path)", async () => {
+    // AutoFallback.embedBatch is contract-bound to return nulls on total
+    // failure (graceful degradation in batch mode — see autofallback.ts
+    // onTotalFail). structuredSearch then has no embeddings to query
+    // sqlite-vec with and returns []. This is the documented behavior;
+    // searchVec (single-embed path) instead surfaces a thrown error — see
+    // "surfaces error when both primary AND fallback fail" in the searchVec suite.
+    const primary = new AlwaysFailProvider("embeddinggemma");
+    const fallback = new AlwaysFailProvider("embeddinggemma");
+    const wrapped = new AutoFallbackEmbeddingProvider({
+      primary,
+      fallback,
+      warn: () => { /* swallow */ },
+    });
+
+    const queries: ExpandedQuery[] = [
+      { type: "vec", query: "doomed" },
+    ];
+
+    const results = await structuredSearch(store, queries, {
+      skipRerank: true,
+      embedProvider: wrapped,
+    });
+    expect(results).toEqual([]);
+  });
+});
+
+// ─────────────────────────── vectorSearchQuery ──────────────────────────────
+
+describe("vectorSearchQuery with EmbeddingProvider", () => {
+  test("encodes original query via provider, no local llm access", async () => {
+    const provider = new FixedProvider("embeddinggemma", FIXED_VEC);
+
+    // Stub expandQuery to return no expansions — this isolates the
+    // embedding path from the LLM-driven query expansion path.
+    store.expandQuery = async () => [];
+
+    store.llm = new Proxy({}, {
+      get(_target, prop) {
+        throw new Error(
+          `store.llm.${String(prop)} accessed when embedProvider was supplied — DoD violation`,
+        );
+      },
+    }) as never;
+
+    const results = await vectorSearchQuery(store, "vector search test", {
+      limit: 5,
+      minScore: 0,
+      embedProvider: provider,
+    });
+
+    // vectorSearchQuery encodes its queries one at a time via embed(), so we
+    // assert a lower bound: at minimum the original query triggers one embed call.
+    expect(provider.embedCalls).toBeGreaterThanOrEqual(1);
+    expect(results.length).toBeGreaterThan(0);
+  });
+
+  test("AutoFallback rescues vectorSearchQuery from primary failure", async () => {
+    const primary = new CircuitOpenProvider("embeddinggemma");
+    const fallback = new FixedProvider("embeddinggemma", FIXED_VEC);
+    const wrapped = new AutoFallbackEmbeddingProvider({
+      primary,
+      fallback,
+      warn: () => { /* swallow */ },
+    });
+
+    store.expandQuery = async () => []; // no expansions — keeps LLM-driven query expansion out of this test
+
+    const results = await vectorSearchQuery(store, "fallback path", {
+      minScore: 0,
+      embedProvider: wrapped,
+    });
+
+    expect(primary.embedCalls).toBeGreaterThanOrEqual(1);
+    expect(fallback.embedCalls).toBeGreaterThanOrEqual(1);
+    expect(results.length).toBeGreaterThan(0);
+  });
+});
+
+// ─────────────────────────── Backward compat ────────────────────────────────
+
+describe("backward compat — no provider supplied", () => {
+  test("searchVec without provider uses precomputed embedding path (no llm needed)", async () => {
+    // When the caller passes `precomputedEmbedding`, searchVec must not
+    // touch any embedding backend at all — neither local nor provider.
+    // Cheapest backward-compat smoke test that avoids loading node-llama-cpp.
+    // NOTE(review): only `store.db` is passed — confirm searchVec can see `store.llm`.
+    store.llm = new Proxy({}, {
+      get(_target, prop) {
+        throw new Error(`store.llm.${String(prop)} accessed unexpectedly`);
+      },
+    }) as never;
+
+    const results = await searchVec(
+      store.db, "hello", "embeddinggemma", 10,
+      undefined, undefined, FIXED_VEC, // precomputedEmbedding
+    );
+    expect(results.length).toBeGreaterThan(0);
+  });
+});