|
|
@@ -956,8 +956,13 @@ function resolveEmbedOptions(options) {
|
|
|
};
|
|
|
}
|
|
|
function getPendingEmbeddingDocs(db) {
|
|
|
+ // `MIN(d.collection)` deterministically picks one collection per hash when
|
|
|
+ // the same content is indexed in multiple collections (SQLite tie-breaks
|
|
|
+ // by the column's collation, BINARY by default). The identical bytes produce identical chunks regardless
|
|
|
+ // of which collection wins; the chunkStrategy lookup still resolves via
|
|
|
+ // that collection's YAML config. See Phase 2 design notes (i-bud0h8vu).
|
|
|
return db.prepare(`
|
|
|
- SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
|
|
|
+ SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
|
|
|
FROM documents d
|
|
|
JOIN content c ON d.hash = c.hash
|
|
|
LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
|
|
|
@@ -1023,6 +1028,24 @@ export async function generateEmbeddings(store, options) {
|
|
|
const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
|
|
|
const totalDocs = docsToEmbed.length;
|
|
|
const startTime = Date.now();
|
|
|
+ // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML
|
|
|
+ // `chunkStrategy` on a collection wins over `options.chunkStrategy`
|
|
|
+ // (global CLI flag); falls back to the global option, then to
|
|
|
+ // chunkDocumentByTokens' own "regex" default when neither is set.
|
|
|
+ // Opt-in per collection — collections without the field are untouched.
|
|
|
+ const collectionStrategies = new Map();
|
|
|
+ try {
|
|
|
+ const { listCollections: listYamlCollections } = await import("./collections.js");
|
|
|
+ for (const c of listYamlCollections()) {
|
|
|
+ if (c.chunkStrategy)
|
|
|
+ collectionStrategies.set(c.name, c.chunkStrategy);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ catch {
|
|
|
+ // If YAML config is missing/unreadable, fall back silently to the
|
|
|
+ // global strategy — no collection overrides. Keeps SDK/inline
|
|
|
+ // callers that never touch ~/.config/qmd working.
|
|
|
+ }
|
|
|
// Use store's LlamaCpp or global singleton, wrapped in a session
|
|
|
const llm = getLlm(store);
|
|
|
const embedModelUri = llm.embedModelName;
|
|
|
@@ -1048,7 +1071,9 @@ export async function generateEmbeddings(store, options) {
|
|
|
if (!doc.body.trim())
|
|
|
continue;
|
|
|
const title = extractTitle(doc.body, doc.path);
|
|
|
- const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, options?.chunkStrategy, session.signal);
|
|
|
+ const perCollectionStrategy = collectionStrategies.get(doc.collection);
|
|
|
+ const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
|
|
|
+ const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal);
|
|
|
for (let seq = 0; seq < chunks.length; seq++) {
|
|
|
batchChunks.push({
|
|
|
hash: doc.hash,
|
|
|
@@ -1557,12 +1582,30 @@ export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars
|
|
|
* break points for supported code files, merges with regex break points,
|
|
|
* and delegates to the shared chunk algorithm.
|
|
|
*
|
|
|
- * Falls back to regex-only when strategy is "regex", filepath is absent,
|
|
|
- * or language is unsupported.
|
|
|
+ * Strategies:
|
|
|
+ * - "regex" (default) — char-based chunking with regex break points only.
|
|
|
+ * - "auto" — regex break points merged with AST break points (soft hints).
|
|
|
+ * - "function" — one chunk per AST function range (Phase 2); inter-range
|
|
|
+ * gaps (imports, top-level code) are char-chunked with regex
|
|
|
+ * hints. Falls back to "auto" when zero ranges are detected.
|
|
|
*/
|
|
|
export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
|
|
|
const regexPoints = scanBreakPoints(content);
|
|
|
const codeFences = findCodeFences(content);
|
|
|
+ // "function" strategy: delegate to the function-level chunker. If no
|
|
|
+ // ranges are detected (markdown, unsupported lang, parse failure), fall
|
|
|
+ // back to "auto" behavior (AST-break-point-assisted char chunking).
|
|
|
+ if (chunkStrategy === "function" && filepath) {
|
|
|
+ const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js");
|
|
|
+ const ranges = await getASTFunctionRanges(content, filepath);
|
|
|
+ if (ranges.length > 0) {
|
|
|
+ return chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars);
|
|
|
+ }
|
|
|
+ // Zero ranges — fall through to auto behavior so break points still help.
|
|
|
+ const astPoints = await getASTBreakPoints(content, filepath);
|
|
|
+ const merged = astPoints.length > 0 ? mergeBreakPoints(regexPoints, astPoints) : regexPoints;
|
|
|
+ return chunkDocumentWithBreakPoints(content, merged, codeFences, maxChars, overlapChars, windowChars);
|
|
|
+ }
|
|
|
let breakPoints = regexPoints;
|
|
|
if (chunkStrategy === "auto" && filepath) {
|
|
|
const { getASTBreakPoints } = await import("./ast.js");
|
|
|
@@ -1573,6 +1616,85 @@ export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, o
|
|
|
}
|
|
|
return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
|
|
|
}
|
|
|
/**
 * Produce one chunk per AST function range, plus chunks for the text between
 * ranges (imports, top-level code). Any span longer than `maxChars` — gap or
 * range body — is further split by `chunkDocumentWithBreakPoints` so a single
 * oversized chunk is never emitted.
 *
 * Expected input: `ranges` is non-empty, sorted by `startIndex`, and
 * non-overlapping. Ranges that violate this (nested, overlapping or
 * out-of-order) are tolerated: text that has already been emitted is never
 * emitted a second time.
 *
 * @param {string} content - Full document text.
 * @param {Array<{startIndex: number, endIndex: number}>} ranges - Function
 *   ranges as offsets into `content` (`endIndex` exclusive).
 * @param {Array<{pos: number}>} regexPoints - Break points in document
 *   coordinates; only those inside an oversized span are used.
 * @param {Array<{start: number, end: number}>} codeFences - Code-fence
 *   regions in document coordinates.
 * @param {number} maxChars - Largest span emitted without further splitting.
 * @param {number} overlapChars - Forwarded to `chunkDocumentWithBreakPoints`.
 * @param {number} windowChars - Forwarded to `chunkDocumentWithBreakPoints`.
 * @returns {Array<{text: string, pos: number}>} Chunks with `pos` expressed
 *   in document coordinates.
 */
function chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars) {
    const out = [];

    // Emit content[start, end) as a single chunk, or split it when oversized.
    // `dropBlank` discards whitespace-only spans (used for gaps, which carry
    // no embeddable signal); function bodies are kept even when blank.
    const emitSpan = (start, end, dropBlank) => {
        if (start >= end) {
            return;
        }
        const text = content.slice(start, end);
        if (text.length === 0 || (dropBlank && !text.trim())) {
            return;
        }
        if (text.length <= maxChars) {
            out.push({ text, pos: start });
            return;
        }
        // Oversized span: restrict break points and code fences to the span
        // and rebase them so the splitter operates on a standalone slice.
        const subPoints = regexPoints
            .filter((p) => p.pos >= start && p.pos < end)
            .map((p) => ({ ...p, pos: p.pos - start }));
        const subFences = codeFences
            .filter((f) => f.end > start && f.start < end)
            .map((f) => ({
                start: Math.max(0, f.start - start),
                end: Math.max(0, Math.min(end, f.end) - start),
            }));
        const pieces = chunkDocumentWithBreakPoints(text, subPoints, subFences, maxChars, overlapChars, windowChars);
        for (const piece of pieces) {
            // Translate slice-relative positions back to document coordinates.
            out.push({ text: piece.text, pos: start + piece.pos });
        }
    };

    let cursor = 0;
    for (const { startIndex, endIndex } of ranges) {
        // Defensive: a range lying entirely inside text we already emitted
        // (nested or out-of-order) must not rewind the cursor.
        if (endIndex <= cursor) {
            continue;
        }
        // Leading / inter-range gap (imports, top-level code).
        emitSpan(cursor, startIndex, true);
        // The function body itself; clamp so a partial overlap with the
        // previous range is not emitted twice.
        emitSpan(Math.max(cursor, startIndex), endIndex, false);
        cursor = endIndex;
    }
    // Trailing gap after the last range.
    emitSpan(cursor, content.length, true);

    // Preserve the invariant that non-empty content yields at least one
    // chunk, even when every span was empty or whitespace-only.
    if (out.length === 0 && content.length > 0) {
        return [{ text: content, pos: 0 }];
    }
    return out;
}
|
|
|
/**
|
|
|
* Chunk a document by actual token count using the LLM tokenizer.
|
|
|
* More accurate than character-based chunking but requires async.
|
|
|
@@ -2197,10 +2319,15 @@ function buildFTS5Query(query) {
|
|
|
/**
|
|
|
* Validate that a vec/hyde query doesn't use lex-only syntax.
|
|
|
* Returns error message if invalid, null if valid.
|
|
|
+ *
|
|
|
+ * Negation is detected ONLY when `-` is preceded by whitespace or sits at
|
|
|
+ * the start of the query. Hyphens inside words (e.g. `auto-archived`,
|
|
|
+ * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation
|
|
|
+ * semantics in natural English and must pass through unchanged.
|
|
|
*/
|
|
|
export function validateSemanticQuery(query) {
|
|
|
- // Check for negation syntax
|
|
|
- if (/-\w/.test(query) || /-"/.test(query)) {
|
|
|
+ // `-term` or `-"phrase"` only counts as negation at the start of the query or after whitespace.
|
|
|
+ if (/(?:^|\s)-\w/.test(query) || /(?:^|\s)-"/.test(query)) {
|
|
|
return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
|
|
|
}
|
|
|
return null;
|