|
|
@@ -1141,6 +1141,15 @@ export async function generateEmbeddings(store, options) {
|
|
|
}
|
|
|
return session.embedBatch(texts, { model: modelArg });
|
|
|
};
|
|
|
+ // JS-only token estimator for the provider path. Char-based with
|
|
|
+ // avgCharsPerToken=3 — matches the heuristic the chunker already
|
|
|
+ // uses for its initial char-space pass, so the safety re-split is a
|
|
|
+ // near no-op while populating the `tokens` field with a stable
|
|
|
+ // estimate. CRITICAL: avoids loading node-llama-cpp on remote-only
|
|
|
+ // deployments (`QMD_EMBED_ENDPOINT=...`). i-1rqixh6m DoD #1.
|
|
|
+ const chunkTokenizer = provider
|
|
|
+ ? (text) => Math.ceil(text.length / 3)
|
|
|
+ : undefined;
|
|
|
for (const batchMeta of batches) {
|
|
|
// Abort early if session has been invalidated
|
|
|
if (!session.isValid) {
|
|
|
@@ -1156,7 +1165,7 @@ export async function generateEmbeddings(store, options) {
|
|
|
const title = extractTitle(doc.body, doc.path);
|
|
|
const perCollectionStrategy = collectionStrategies.get(doc.collection);
|
|
|
const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
|
|
|
- const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal);
|
|
|
+ const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal, chunkTokenizer);
|
|
|
for (let seq = 0; seq < chunks.length; seq++) {
|
|
|
batchChunks.push({
|
|
|
hash: doc.hash,
|
|
|
@@ -1316,7 +1325,7 @@ export function createStore(dbPath) {
|
|
|
toVirtualPath: (absolutePath) => toVirtualPath(db, absolutePath),
|
|
|
// Search
|
|
|
searchFTS: (query, limit, collectionName) => searchFTS(db, query, limit, collectionName),
|
|
|
- searchVec: (query, model, limit, collectionName, session, precomputedEmbedding) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
|
|
|
+ searchVec: (query, model, limit, collectionName, session, precomputedEmbedding, embedProvider) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding, embedProvider),
|
|
|
// Query expansion & reranking
|
|
|
expandQuery: (query, model, intent) => expandQuery(query, model, db, intent, store.llm),
|
|
|
rerank: (query, documents, model, intent) => rerank(query, documents, model, db, intent, store.llm),
|
|
|
@@ -1782,11 +1791,26 @@ function chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChar
|
|
|
* Chunk a document by actual token count using the LLM tokenizer.
|
|
|
* More accurate than character-based chunking but requires async.
|
|
|
*
|
|
|
- * When filepath and chunkStrategy are provided, uses AST-aware break points
|
|
|
- * for supported code files.
|
|
|
- */
|
|
|
-export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal) {
|
|
|
- const llm = getDefaultLlamaCpp();
|
|
|
+ * When `tokenizer` is supplied, it must return the token COUNT for a
|
|
|
+ * string (a number or a Promise of one); it is used in place of the local
|
|
|
+ * `llm.tokenize(...)` call — neither `getDefaultLlamaCpp()` nor
|
|
|
+ * `llm.tokenize(...)` is invoked. This lets remote-only deployments (`QMD_EMBED_ENDPOINT=...`)
|
|
|
+ * chunk documents without warming up node-llama-cpp (DoD #1 of i-1rqixh6m / i-qkarfffa).
|
|
|
+ *
|
|
|
+ * When `filepath` and `chunkStrategy` are provided, uses AST-aware break
|
|
|
+ * points for supported code files.
|
|
|
+ */
|
|
|
+export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKENS, overlapTokens = CHUNK_OVERLAP_TOKENS, windowTokens = CHUNK_WINDOW_TOKENS, filepath, chunkStrategy = "regex", signal, tokenizer) {
|
|
|
+ // Resolve token counter lazily so callers that supply `tokenizer` never
|
|
|
+ // touch the local LlamaCpp instance — `getDefaultLlamaCpp()` is only
|
|
|
+ // invoked from inside the default closure when it is actually called
|
|
|
+ // (i.e. when no tokenizer is supplied).
|
|
|
+ let llm;
|
|
|
+ const countTokens = tokenizer ?? (async (text) => {
|
|
|
+ if (!llm)
|
|
|
+ llm = getDefaultLlamaCpp();
|
|
|
+ return (await llm.tokenize(text)).length;
|
|
|
+ });
|
|
|
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
|
|
|
// If chunks exceed limit, they'll be re-split with actual ratio
|
|
|
const avgCharsPerToken = 3;
|
|
|
@@ -1802,24 +1826,24 @@ export async function chunkDocumentByTokens(content, maxTokens = CHUNK_SIZE_TOKE
|
|
|
// Respect abort signal to avoid runaway tokenization
|
|
|
if (signal?.aborted)
|
|
|
break;
|
|
|
- const tokens = await llm.tokenize(chunk.text);
|
|
|
- if (tokens.length <= maxTokens) {
|
|
|
- results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
|
|
|
+ const tokenCount = await countTokens(chunk.text);
|
|
|
+ if (tokenCount <= maxTokens) {
|
|
|
+ results.push({ text: chunk.text, pos: chunk.pos, tokens: tokenCount });
|
|
|
}
|
|
|
else {
|
|
|
// Chunk is still too large - split it further
|
|
|
// Use actual token count to estimate better char limit
|
|
|
- const actualCharsPerToken = chunk.text.length / tokens.length;
|
|
|
+ const actualCharsPerToken = chunk.text.length / tokenCount;
|
|
|
const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
|
|
|
const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
|
|
|
for (const subChunk of subChunks) {
|
|
|
if (signal?.aborted)
|
|
|
break;
|
|
|
- const subTokens = await llm.tokenize(subChunk.text);
|
|
|
+ const subCount = await countTokens(subChunk.text);
|
|
|
results.push({
|
|
|
text: subChunk.text,
|
|
|
pos: chunk.pos + subChunk.pos,
|
|
|
- tokens: subTokens.length,
|
|
|
+ tokens: subCount,
|
|
|
});
|
|
|
}
|
|
|
}
|
|
|
@@ -2493,11 +2517,11 @@ export function searchFTS(db, query, limit = 20, collectionName) {
|
|
|
// =============================================================================
|
|
|
// Vector Search
|
|
|
// =============================================================================
|
|
|
-export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding) {
|
|
|
+export async function searchVec(db, query, model, limit = 20, collectionName, session, precomputedEmbedding, embedProvider) {
|
|
|
const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
|
if (!tableExists)
|
|
|
return [];
|
|
|
- const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
|
|
|
+ const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session, undefined, embedProvider);
|
|
|
if (!embedding)
|
|
|
return [];
|
|
|
// IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
|
|
|
@@ -2571,7 +2595,24 @@ export async function searchVec(db, query, model, limit = 20, collectionName, se
|
|
|
// =============================================================================
|
|
|
// Embeddings
|
|
|
// =============================================================================
|
|
|
-async function getEmbedding(text, model, isQuery, session, llmOverride) {
|
|
|
+async function getEmbedding(text, model, isQuery, session, llmOverride, embedProvider) {
|
|
|
+ // When an EmbeddingProvider is supplied, route the encoding through it
|
|
|
+ // (HTTP / GPU worker / fallback chain) instead of touching local
|
|
|
+ // node-llama-cpp at all. The provider receives the already-formatted text
|
|
|
+ // plus its own model id (the `model` argument is not used on this path);
|
|
|
+ // formatQueryForEmbedding / formatDocForEmbedding still apply, so embedding parity with the index is preserved.
|
|
|
+ if (embedProvider) {
|
|
|
+ const providerModel = embedProvider.getModelId();
|
|
|
+ const formattedText = isQuery
|
|
|
+ ? formatQueryForEmbedding(text, providerModel)
|
|
|
+ : formatDocForEmbedding(text, undefined, providerModel);
|
|
|
+ // Only forward an AbortSignal when the provider is local-backed;
|
|
|
+ // remote providers manage their own timeouts and an LLM-session signal
|
|
|
+ // would abort their HTTP request prematurely (i-08ovbvtb).
|
|
|
+ const sig = embedProvider.kind === "local" ? session?.signal : undefined;
|
|
|
+ const result = await embedProvider.embed(formattedText, sig ? { model: providerModel, signal: sig } : { model: providerModel });
|
|
|
+ return result?.embedding ?? null;
|
|
|
+ }
|
|
|
// Format text using the appropriate prompt template
|
|
|
const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
|
|
|
const result = session
|
|
|
@@ -3236,6 +3277,7 @@ export async function hybridQuery(store, query, options) {
|
|
|
const intent = options?.intent;
|
|
|
const skipRerank = options?.skipRerank ?? false;
|
|
|
const hooks = options?.hooks;
|
|
|
+ const embedProvider = options?.embedProvider;
|
|
|
const rankedLists = [];
|
|
|
const rankedListMeta = [];
|
|
|
const docidMap = new Map(); // filepath -> docid
|
|
|
@@ -3300,12 +3342,19 @@ export async function hybridQuery(store, query, options) {
|
|
|
vecQueries.push({ text: q.query, queryType: q.type });
|
|
|
}
|
|
|
}
|
|
|
- // Batch embed all vector queries in a single call
|
|
|
- const llm = getLlm(store);
|
|
|
- const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, llm.embedModelName));
|
|
|
+ // Batch embed all vector queries in a single call.
|
|
|
+ // When `embedProvider` is supplied (i-loazq6ze), route the encode through
|
|
|
+ // it (HTTP / GPU worker / AutoFallback chain) instead of warming the
|
|
|
+ // local llama-cpp model — this is the whole point of the GPU worker.
|
|
|
+ const embedModelName = embedProvider
|
|
|
+ ? embedProvider.getModelId()
|
|
|
+ : getLlm(store).embedModelName;
|
|
|
+ const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text, embedModelName));
|
|
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
|
const embedStart = Date.now();
|
|
|
- const embeddings = await llm.embedBatch(textsToEmbed);
|
|
|
+ const embeddings = embedProvider
|
|
|
+ ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
|
|
|
+ : await getLlm(store).embedBatch(textsToEmbed);
|
|
|
hooks?.onEmbedDone?.(Date.now() - embedStart);
|
|
|
// Run sqlite-vec lookups with pre-computed embeddings
|
|
|
for (let i = 0; i < vecQueries.length; i++) {
|
|
|
@@ -3501,6 +3550,7 @@ export async function vectorSearchQuery(store, query, options) {
|
|
|
const minScore = options?.minScore ?? 0.3;
|
|
|
const collection = options?.collection;
|
|
|
const intent = options?.intent;
|
|
|
+ const embedProvider = options?.embedProvider;
|
|
|
const hasVectors = !!store.db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
|
|
if (!hasVectors)
|
|
|
return [];
|
|
|
@@ -3509,11 +3559,14 @@ export async function vectorSearchQuery(store, query, options) {
|
|
|
const allExpanded = await store.expandQuery(query, undefined, intent);
|
|
|
const vecExpanded = allExpanded.filter(q => q.type !== 'lex');
|
|
|
options?.hooks?.onExpand?.(query, vecExpanded, Date.now() - expandStart);
|
|
|
- // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs
|
|
|
+ // Run original + vec/hyde expanded through vector, sequentially — concurrent embed() hangs.
|
|
|
+ // When `embedProvider` is supplied (i-loazq6ze), query encoding is routed
|
|
|
+ // through it; the per-call signature `searchVec(...)` accepts the provider
|
|
|
+ // as the trailing argument so existing tests / callers stay untouched.
|
|
|
const queryTexts = [query, ...vecExpanded.map(q => q.query)];
|
|
|
const allResults = new Map();
|
|
|
for (const q of queryTexts) {
|
|
|
- const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection);
|
|
|
+ const vecResults = await store.searchVec(q, DEFAULT_EMBED_MODEL, limit, collection, undefined, undefined, embedProvider);
|
|
|
for (const r of vecResults) {
|
|
|
const existing = allResults.get(r.filepath);
|
|
|
if (!existing || r.score > existing.score) {
|
|
|
@@ -3560,6 +3613,7 @@ export async function structuredSearch(store, searches, options) {
|
|
|
const intent = options?.intent;
|
|
|
const skipRerank = options?.skipRerank ?? false;
|
|
|
const hooks = options?.hooks;
|
|
|
+ const embedProvider = options?.embedProvider;
|
|
|
const collections = options?.collections;
|
|
|
if (searches.length === 0)
|
|
|
return [];
|
|
|
@@ -3613,11 +3667,19 @@ export async function structuredSearch(store, searches, options) {
|
|
|
if (hasVectors) {
|
|
|
const vecSearches = searches.filter((s) => s.type === 'vec' || s.type === 'hyde');
|
|
|
if (vecSearches.length > 0) {
|
|
|
- const llm = getLlm(store);
|
|
|
- const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, llm.embedModelName));
|
|
|
+ // Route batch encoding through the supplied EmbeddingProvider when
|
|
|
+ // present (i-loazq6ze). Otherwise fall back to the local llama-cpp
|
|
|
+ // singleton — preserves pre-patch behavior for callers that don't
|
|
|
+ // configure a provider.
|
|
|
+ const embedModelName = embedProvider
|
|
|
+ ? embedProvider.getModelId()
|
|
|
+ : getLlm(store).embedModelName;
|
|
|
+ const textsToEmbed = vecSearches.map(s => formatQueryForEmbedding(s.query, embedModelName));
|
|
|
hooks?.onEmbedStart?.(textsToEmbed.length);
|
|
|
const embedStart = Date.now();
|
|
|
- const embeddings = await llm.embedBatch(textsToEmbed);
|
|
|
+ const embeddings = embedProvider
|
|
|
+ ? await embedProvider.embedBatch(textsToEmbed, { model: embedModelName })
|
|
|
+ : await getLlm(store).embedBatch(textsToEmbed);
|
|
|
hooks?.onEmbedDone?.(Date.now() - embedStart);
|
|
|
for (let i = 0; i < vecSearches.length; i++) {
|
|
|
const embedding = embeddings[i]?.embedding;
|