|
@@ -1120,7 +1120,7 @@ export type Store = {
|
|
|
|
|
|
|
|
// Search
|
|
// Search
|
|
|
searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
|
|
searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
|
|
|
- searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
|
|
|
|
|
|
|
+ searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => Promise<SearchResult[]>;
|
|
|
|
|
|
|
|
// Query expansion & reranking
|
|
// Query expansion & reranking
|
|
|
expandQuery: (query: string, model?: string, intent?: string) => Promise<ExpandedQuery[]>;
|
|
expandQuery: (query: string, model?: string, intent?: string) => Promise<ExpandedQuery[]>;
|
|
@@ -1565,6 +1565,16 @@ export async function generateEmbeddings(
|
|
|
return session.embedBatch(texts, { model: modelArg });
|
|
return session.embedBatch(texts, { model: modelArg });
|
|
|
};
|
|
};
|
|
|
|
|
|
|
|
|
|
+ // JS-only token estimator for the provider path. Char-based with
|
|
|
|
|
+ // avgCharsPerToken=3 — matches the heuristic the chunker already
|
|
|
|
|
+ // uses for its initial char-space pass, so the safety re-split is a
|
|
|
|
|
+ // near no-op while populating the `tokens` field with a stable
|
|
|
|
|
+ // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only
|
|
|
|
|
+ // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1.
|
|
|
|
|
+ const chunkTokenizer: TokenCounter | undefined = provider
|
|
|
|
|
+ ? (text: string) => Math.ceil(text.length / 3)
|
|
|
|
|
+ : undefined;
|
|
|
|
|
+
|
|
|
for (const batchMeta of batches) {
|
|
for (const batchMeta of batches) {
|
|
|
// Abort early if session has been invalidated
|
|
// Abort early if session has been invalidated
|
|
|
if (!session.isValid) {
|
|
if (!session.isValid) {
|
|
@@ -1588,6 +1598,7 @@ export async function generateEmbeddings(
|
|
|
doc.path,
|
|
doc.path,
|
|
|
chunkStrategy,
|
|
chunkStrategy,
|
|
|
session.signal,
|
|
session.signal,
|
|
|
|
|
+ chunkTokenizer,
|
|
|
);
|
|
);
|
|
|
|
|
|
|
|
for (let seq = 0; seq < chunks.length; seq++) {
|
|
for (let seq = 0; seq < chunks.length; seq++) {
|
|
@@ -1764,7 +1775,7 @@ export function createStore(dbPath?: string): Store {
|
|
|
|
|
|
|
|
// Search
|
|
// Search
|
|
|
searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName),
|
|
searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName),
|
|
|
- searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
|
|
|
|
|
|
|
+ searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider),
|
|
|
|
|
|
|
|
// Query expansion & reranking
|
|
// Query expansion & reranking
|
|
|
expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm),
|
|
expandQuery: (query: string, model?: string, intent?: string) => expandQuery(query, model, db, intent, store.llm),
|
|
@@ -2453,12 +2464,37 @@ function chunkByFunctionRanges(
|
|
|
return out;
|
|
return out;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/**
|
|
|
|
|
+ * Counts the tokens in `text`. Used by `chunkDocumentByTokens` for the
|
|
|
|
|
+ * per-chunk size check and the safety re-split of chunks exceeding `maxTokens`.
|
|
|
|
|
+ *
|
|
|
|
|
+ * When `chunkDocumentByTokens` is called WITHOUT a tokenizer (default),
|
|
|
|
|
+ * it lazily resolves `getDefaultLlamaCpp()` and uses `llm.tokenize` —
|
|
|
|
|
+ * accurate but expensive (loads the local GGUF embed model + initialises
|
|
|
|
|
+ * llama.cpp, ~22s on cold cache).
|
|
|
|
|
+ *
|
|
|
|
|
+ * Provider-mode callers (HTTP embed providers like the GPU worker on
|
|
|
|
|
+ * `models` LXC) MUST pass a JS-only approximator to avoid loading the
|
|
|
|
|
+ * local model entirely. A char-based estimate like
|
|
|
|
|
+ * `Math.ceil(text.length / 3)` is a reasonable default — it matches the
|
|
|
|
|
+ * `avgCharsPerToken=3` heuristic used for the initial char-space chunk
|
|
|
|
|
+ * step, so the safety re-split stays a near no-op while populating the
|
|
|
|
|
+ * `tokens` field with a stable estimate.
|
|
|
|
|
+ */
|
|
|
|
|
+export type TokenCounter = (text: string) => number | Promise<number>;
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* Chunk a document by actual token count using the LLM tokenizer.
|
|
* Chunk a document by actual token count using the LLM tokenizer.
|
|
|
* More accurate than character-based chunking but requires async.
|
|
* More accurate than character-based chunking but requires async.
|
|
|
*
|
|
*
|
|
|
- * When filepath and chunkStrategy are provided, uses AST-aware break points
|
|
|
|
|
- * for supported code files.
|
|
|
|
|
|
|
+ * When `tokenizer` is supplied, it is used in place of the local
|
|
|
|
|
+ * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
|
|
|
|
|
+ * `llm.tokenize(...)` is invoked. This lets remote-only deployments
|
|
|
|
|
+ * (`QMD_EMBED_ENDPOINT=...`) chunk documents without warming up
|
|
|
|
|
+ * node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
|
|
|
|
|
+ *
|
|
|
|
|
+ * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
|
|
|
|
|
+ * points for supported code files.
|
|
|
*/
|
|
*/
|
|
|
export async function chunkDocumentByTokens(
|
|
export async function chunkDocumentByTokens(
|
|
|
content: string,
|
|
content: string,
|
|
@@ -2467,9 +2503,18 @@ export async function chunkDocumentByTokens(
|
|
|
windowTokens: number = CHUNK_WINDOW_TOKENS,
|
|
windowTokens: number = CHUNK_WINDOW_TOKENS,
|
|
|
filepath?: string,
|
|
filepath?: string,
|
|
|
chunkStrategy: ChunkStrategy = "regex",
|
|
chunkStrategy: ChunkStrategy = "regex",
|
|
|
- signal?: AbortSignal
|
|
|
|
|
|
|
+ signal?: AbortSignal,
|
|
|
|
|
+ tokenizer?: TokenCounter,
|
|
|
): Promise<{ text: string; pos: number; tokens: number }[]> {
|
|
): Promise<{ text: string; pos: number; tokens: number }[]> {
|
|
|
- const llm = getDefaultLlamaCpp();
|
|
|
|
|
|
|
+ // Resolve token counter lazily so callers that supply `tokenizer` never
|
|
|
|
|
+ // touch the local LlamaCpp instance — `getDefaultLlamaCpp()` is only
|
|
|
|
|
+ // invoked from inside the default closure when it is actually called
|
|
|
|
|
+ // (i.e. when no tokenizer is supplied).
|
|
|
|
|
+ let llm: ReturnType<typeof getDefaultLlamaCpp> | undefined;
|
|
|
|
|
+ const countTokens: TokenCounter = tokenizer ?? (async (text: string) => {
|
|
|
|
|
+ if (!llm) llm = getDefaultLlamaCpp();
|
|
|
|
|
+ return (await llm.tokenize(text)).length;
|
|
|
|
|
+ });
|
|
|
|
|
|
|
|
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
|
|
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
|
|
|
// If chunks exceed limit, they'll be re-split with actual ratio
|
|
// If chunks exceed limit, they'll be re-split with actual ratio
|
|
@@ -2489,25 +2534,25 @@ export async function chunkDocumentByTokens(
|
|
|
// Respect abort signal to avoid runaway tokenization
|
|
// Respect abort signal to avoid runaway tokenization
|
|
|
if (signal?.aborted) break;
|
|
if (signal?.aborted) break;
|
|
|
|
|
|
|
|
- const tokens = await llm.tokenize(chunk.text);
|
|
|
|
|
|
|
+ const tokenCount = await countTokens(chunk.text);
|
|
|
|
|
|
|
|
- if (tokens.length <= maxTokens) {
|
|
|
|
|
- results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
|
|
|
|
|
|
|
+ if (tokenCount <= maxTokens) {
|
|
|
|
|
+ results.push({ text: chunk.text, pos: chunk.pos, tokens: tokenCount });
|
|
|
} else {
|
|
} else {
|
|
|
// Chunk is still too large - split it further
|
|
// Chunk is still too large - split it further
|
|
|
// Use actual token count to estimate better char limit
|
|
// Use actual token count to estimate better char limit
|
|
|
- const actualCharsPerToken = chunk.text.length / tokens.length;
|
|
|
|
|
|
|
+ const actualCharsPerToken = chunk.text.length / tokenCount;
|
|
|
const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
|
|
const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
|
|
|
|
|
|
|
|
const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
|
|
const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
|
|
|
|
|
|
|
|
for (const subChunk of subChunks) {
|
|
for (const subChunk of subChunks) {
|
|
|
if (signal?.aborted) break;
|
|
if (signal?.aborted) break;
|
|
|
- const subTokens = await llm.tokenize(subChunk.text);
|
|
|
|
|
|
|
+ const subCount = await countTokens(subChunk.text);
|
|
|
results.push({
|
|
results.push({
|
|
|
text: subChunk.text,
|
|
text: subChunk.text,
|
|
|
pos: chunk.pos + subChunk.pos,
|
|
pos: chunk.pos + subChunk.pos,
|
|
|
- tokens: subTokens.length,
|
|
|
|
|
|
|
+ tokens: subCount,
|
|
|
});
|
|
});
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
@@ -3260,11 +3305,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
|
|
|
// Vector Search
|
|
// Vector Search
|
|
|
// =============================================================================
|
|
// =============================================================================
|
|
|
|
|
|
|
|
-export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]> {
|
|
|
|
|
|
|
+export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[], embedProvider?: EmbeddingProvider): Promise<SearchResult[]> {
|
|
|
const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
|
if (!tableExists) return [];
|
|
if (!tableExists) return [];
|
|
|
|
|
|
|
|
- const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
|
|
|
|
|
|
|
+ const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider);
|
|
|
if (!embedding) return [];
|
|
if (!embedding) return [];
|
|
|
|
|
|
|
|
// IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
|
|
// IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
|
|
@@ -3350,7 +3395,24 @@ export async function searchVec(db: Database, query: string, model: string, limi
|
|
|
// Embeddings
|
|
// Embeddings
|
|
|
// =============================================================================
|
|
// =============================================================================
|
|
|
|
|
|
|
|
-async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp): Promise<number[] | null> {
|
|
|
|
|
|
|
+async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession, llmOverride?: LlamaCpp, embedProvider?: EmbeddingProvider): Promise<number[] | null> {
|
|
|
|
|
+ // When an EmbeddingProvider is supplied, route the encoding through it
|
|
|
|
|
+ // (HTTP / GPU worker / fallback chain) instead of touching local
|
|
|
|
|
+ // node-llama-cpp at all. The provider receives the already-formatted
|
|
|
|
|
+ // text and its own model id (not the `model` argument); the query/doc
|
|
|
|
|
+ // prompt formatting is keyed on that id so embedding parity with the index is preserved.
|
|
|
|
|
+ if (embedProvider) {
|
|
|
|
|
+ const providerModel = embedProvider.getModelId();
|
|
|
|
|
+ const formattedText = isQuery
|
|
|
|
|
+ ? formatQueryForEmbedding(text, providerModel)
|
|
|
|
|
+ : formatDocForEmbedding(text, undefined, providerModel);
|
|
|
|
|
+ // Only forward an AbortSignal when the provider is local-backed;
|
|
|
|
|
+ // remote providers manage their own timeouts and an LLM-session signal
|
|
|
|
|
+ // would abort their HTTP request prematurely (i-08ovbvtb).
|
|
|
|
|
+ const sig = embedProvider.kind === "local" ? session?.signal : undefined;
|
|
|
|
|
+ const result = await embedProvider.embed(formattedText, sig ? { model: providerModel, signal: sig } : { model: providerModel });
|
|
|
|
|
+ return result?.embedding ?? null;
|
|
|
|
|
+ }
|
|
|
// Format text using the appropriate prompt template
|
|
// Format text using the appropriate prompt template
|
|
|
const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
|
|
const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
|
|
|
const result = session
|
|
const result = session
|
|
@@ -4147,6 +4209,14 @@ export interface HybridQueryOptions {
|
|
|
skipRerank?: boolean; // skip LLM reranking, use only RRF scores
|
|
skipRerank?: boolean; // skip LLM reranking, use only RRF scores
|
|
|
chunkStrategy?: ChunkStrategy;
|
|
chunkStrategy?: ChunkStrategy;
|
|
|
hooks?: SearchHooks;
|
|
hooks?: SearchHooks;
|
|
|
|
|
+ /**
|
|
|
|
|
+ * Optional embedding provider for query-side encoding (i-loazq6ze).
|
|
|
|
|
+ * When supplied, the original-query vector AND any vec/hyde expansion
|
|
|
|
|
+ * variants are encoded through this provider (HTTP, GPU worker,
|
|
|
|
|
+ * AutoFallback chain) instead of `getLlm(store).embedBatch(...)`. Omit
|
|
|
|
|
+ * to keep pre-patch behavior (uses local LlamaCpp).
|
|
|
|
|
+ */
|
|
|
|
|
+ embedProvider?: EmbeddingProvider;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
export interface HybridQueryResult {
|
|
export interface HybridQueryResult {
|
|
@@ -4194,6 +4264,7 @@ export async function hybridQuery(
|
|
|
const intent = options?.intent;
|
|
const intent = options?.intent;
|
|
|
const skipRerank = options?.skipRerank ?? false;
|
|
const skipRerank = options?.skipRerank ?? false;
|
|
|
const hooks = options?.hooks;
|
|
const hooks = options?.hooks;
|
|
|
|
|
+ const embedProvider = options?.embedProvider;
|
|
|
|
|
|
|
|
const rankedLists: RankedResult[][] = [];
|
|
const rankedLists: RankedResult[][] = [];
|
|
|
const rankedListMeta: RankedListMeta[] = [];
|
|
const rankedListMeta: RankedListMeta[] = [];
|
|
@@ -4267,12 +4338,19 @@ export async function hybridQuery(
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- // Batch embed all vector queries in a single call
|
|
|
|
|
- const llm = getLlm(store);
|
|
|
|
|
- const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, llm.embedModelName));
|
|
|
|
|
|
|
+ // Batch embed all vector queries in a single call.
|
|
|
|
|
+ // When `embedProvider` is supplied (i-loazq6ze), route the encode through
|
|
|
|
|
+ // it (HTTP / GPU worker / AutoFallback chain) instead of warming the
|
|
|
|
|
+ // local llama-cpp model — this is the whole point of the GPU worker.
|
|
|
|
|
+ const embedModelName = embedProvider
|
|
|
|
|
+ ? embedProvider.getModelId()
|
|
|
|
|
+ : getLlm(store).embedModelName;
|
|
|
|
|
+ const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName));
|
|
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
|
const embedStart = Date.now();
|
|
const embedStart = Date.now();
|
|
|
- const embeddings = await llm.embedBatch(textsToEmbed);
|
|
|
|
|
|
|
+ const embeddings = embedProvider
|
|
|
|
|
+ ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
|
|
|
|
|
+ : await getLlm(store).embedBatch(textsToEmbed);
|
|
|
hooks?.onEmbedDone?.(Date.now() - embedStart);
|
|
hooks?.onEmbedDone?.(Date.now() - embedStart);
|
|
|
|
|
|
|
|
// Run sqlite-vec lookups with pre-computed embeddings
|
|
// Run sqlite-vec lookups with pre-computed embeddings
|
|
@@ -4468,6 +4546,12 @@ export interface VectorSearchOptions {
|
|
|
minScore?: number; // default 0.3
|
|
minScore?: number; // default 0.3
|
|
|
intent?: string; // domain intent hint for disambiguation
|
|
intent?: string; // domain intent hint for disambiguation
|
|
|
hooks?: Pick<SearchHooks, 'onExpand'>;
|
|
hooks?: Pick<SearchHooks, 'onExpand'>;
|
|
|
|
|
+ /**
|
|
|
|
|
+ * Optional embedding provider for query-side encoding (i-loazq6ze).
|
|
|
|
|
+ * When supplied, query vectors are encoded via the provider (HTTP /
|
|
|
|
|
+ * GPU worker / fallback chain) instead of the local llama-cpp model.
|
|
|
|
|
+ */
|
|
|
|
|
+ embedProvider?: EmbeddingProvider;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
export interface VectorSearchResult {
|
|
export interface VectorSearchResult {
|
|
@@ -4498,6 +4582,7 @@ export async function vectorSearchQuery(
|
|
|
const minScore = options?.minScore ?? 0.3;
|
|
const minScore = options?.minScore ?? 0.3;
|
|
|
const collection = options?.collection;
|
|
const collection = options?.collection;
|
|
|
const intent = options?.intent;
|
|
const intent = options?.intent;
|
|
|
|
|
+ const embedProvider = options?.embedProvider;
|
|
|
|
|
|
|
|
const hasVectors = !!store.db.prepare(
|
|
const hasVectors = !!store.db.prepare(
|
|
|
`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
|
|
`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
|
|
@@ -4510,11 +4595,17 @@ export async function vectorSearchQuery(
|
|
|
const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
|
|
const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
|
|
|
options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
|
|
options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
|
|
|
|
|
|
|
|
- // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
|
|
|
|
|
|
|
+ // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs.
|
|
|
|
|
+ // When `embedProvider` is supplied (i-loazq6ze), query encoding is routed
|
|
|
|
|
+ // through it; the per-call signature `searchVec(...)` accepts the provider
|
|
|
|
|
+ // as an optional trailing argument so existing tests / callers stay untouched.
|
|
|
const queryTexts = [query, ...vecExpanded.map(q => q.query)];
|
|
const queryTexts = [query, ...vecExpanded.map(q => q.query)];
|
|
|
const allResults = new Map<string, VectorSearchResult>();
|
|
const allResults = new Map<string, VectorSearchResult>();
|
|
|
for (const q of queryTexts) {
|
|
for (const q of queryTexts) {
|
|
|
- const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection);
|
|
|
|
|
|
|
+ const vecResults = await store.searchVec(
|
|
|
|
|
+ q, DEFAULT_EMBED_MODEL, limit, collection,
|
|
|
|
|
+ undefined, undefined, embedProvider,
|
|
|
|
|
+ );
|
|
|
for (const r of vecResults) {
|
|
for (const r of vecResults) {
|
|
|
const existing = allResults.get(r.filepath);
|
|
const existing = allResults.get(r.filepath);
|
|
|
if (!existing || r.score > existing.score) {
|
|
if (!existing || r.score > existing.score) {
|
|
@@ -4557,6 +4648,12 @@ export interface StructuredSearchOptions {
|
|
|
skipRerank?: boolean;
|
|
skipRerank?: boolean;
|
|
|
chunkStrategy?: ChunkStrategy;
|
|
chunkStrategy?: ChunkStrategy;
|
|
|
hooks?: SearchHooks;
|
|
hooks?: SearchHooks;
|
|
|
|
|
+ /**
|
|
|
|
|
+ * Optional embedding provider for query-side encoding (i-loazq6ze).
|
|
|
|
|
+ * When supplied, vec/hyde sub-queries are batch-encoded via the provider
|
|
|
|
|
+ * (HTTP / GPU worker / fallback chain) instead of `getLlm(store).embedBatch`.
|
|
|
|
|
+ */
|
|
|
|
|
+ embedProvider?: EmbeddingProvider;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
@@ -4589,6 +4686,7 @@ export async function structuredSearch(
|
|
|
const intent = options?.intent;
|
|
const intent = options?.intent;
|
|
|
const skipRerank = options?.skipRerank ?? false;
|
|
const skipRerank = options?.skipRerank ?? false;
|
|
|
const hooks = options?.hooks;
|
|
const hooks = options?.hooks;
|
|
|
|
|
+ const embedProvider = options?.embedProvider;
|
|
|
|
|
|
|
|
const collections = options?.collections;
|
|
const collections = options?.collections;
|
|
|
|
|
|
|
@@ -4651,11 +4749,19 @@ export async function structuredSearch(
|
|
|
s.type === 'vec' || s.type === 'hyde'
|
|
s.type === 'vec' || s.type === 'hyde'
|
|
|
);
|
|
);
|
|
|
if (vecSearches.length > 0) {
|
|
if (vecSearches.length > 0) {
|
|
|
- const llm = getLlm(store);
|
|
|
|
|
- const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, llm.embedModelName));
|
|
|
|
|
|
|
+ // Route batch encoding through the supplied EmbeddingProvider when
|
|
|
|
|
+ // present (i-loazq6ze). Otherwise fall back to the local llama-cpp
|
|
|
|
|
+ // singleton — preserves pre-patch behavior for callers that don't
|
|
|
|
|
+ // configure a provider.
|
|
|
|
|
+ const embedModelName = embedProvider
|
|
|
|
|
+ ? embedProvider.getModelId()
|
|
|
|
|
+ : getLlm(store).embedModelName;
|
|
|
|
|
+ const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModelName));
|
|
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
|
const embedStart = Date.now();
|
|
const embedStart = Date.now();
|
|
|
- const embeddings = await llm.embedBatch(textsToEmbed);
|
|
|
|
|
|
|
+ const embeddings = embedProvider
|
|
|
|
|
+ ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
|
|
|
|
|
+ : await getLlm(store).embedBatch(textsToEmbed);
|
|
|
hooks?.onEmbedDone?.(Date.now() - embedStart);
|
|
hooks?.onEmbedDone?.(Date.now() - embedStart);
|
|
|
|
|
|
|
|
for (let i = 0; i < vecSearches.length; i++) {
|
|
for (let i = 0; i < vecSearches.length; i++) {
|