|
@@ -227,7 +227,7 @@ export function findBestCutoff(
|
|
|
// Chunk Strategy
|
|
// Chunk Strategy
|
|
|
// =============================================================================
|
|
// =============================================================================
|
|
|
|
|
|
|
|
-export type ChunkStrategy = "auto" | "regex";
|
|
|
|
|
|
|
+export type ChunkStrategy = "auto" | "regex" | "function";
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
* Merge two sets of break points (e.g. regex + AST), keeping the highest
|
|
* Merge two sets of break points (e.g. regex + AST), keeping the highest
|
|
@@ -1298,6 +1298,7 @@ type PendingEmbeddingDoc = {
|
|
|
hash: string;
|
|
hash: string;
|
|
|
path: string;
|
|
path: string;
|
|
|
bytes: number;
|
|
bytes: number;
|
|
|
|
|
+ collection: string;
|
|
|
};
|
|
};
|
|
|
|
|
|
|
|
type EmbeddingDoc = PendingEmbeddingDoc & {
|
|
type EmbeddingDoc = PendingEmbeddingDoc & {
|
|
@@ -1330,8 +1331,13 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
function getPendingEmbeddingDocs(db: Database): PendingEmbeddingDoc[] {
|
|
function getPendingEmbeddingDocs(db: Database): PendingEmbeddingDoc[] {
|
|
|
|
|
+ // `MIN(d.collection)` deterministically picks one collection per hash when
|
|
|
|
|
+ // the same content is indexed in multiple collections (MIN() returns the
|
|
|
|
|
+ // smallest collection name). The identical bytes produce identical chunks
|
|
|
|
|
+ // only when those collections share a chunkStrategy; the chunkStrategy lookup resolves via
|
|
|
|
|
+ // the winning collection's YAML config. See Phase 2 design notes (i-bud0h8vu).
|
|
|
return db.prepare(`
|
|
return db.prepare(`
|
|
|
- SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
|
|
|
|
|
|
|
+ SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
|
|
|
FROM documents d
|
|
FROM documents d
|
|
|
JOIN content c ON d.hash = c.hash
|
|
JOIN content c ON d.hash = c.hash
|
|
|
LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
|
|
LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
|
|
@@ -1417,6 +1423,23 @@ export async function generateEmbeddings(
|
|
|
const totalDocs = docsToEmbed.length;
|
|
const totalDocs = docsToEmbed.length;
|
|
|
const startTime = Date.now();
|
|
const startTime = Date.now();
|
|
|
|
|
|
|
|
|
|
+ // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML
|
|
|
|
|
+ // `chunkStrategy` on a collection wins over `options.chunkStrategy`
|
|
|
|
|
+ // (global CLI flag); falls back to the global option, then to
|
|
|
|
|
+ // chunkDocumentByTokens' own "regex" default when neither is set.
|
|
|
|
|
+ // Opt-in per collection — collections without the field are untouched.
|
|
|
|
|
+ const collectionStrategies = new Map<string, ChunkStrategy>();
|
|
|
|
|
+ try {
|
|
|
|
|
+ const { listCollections: listYamlCollections } = await import("./collections.js");
|
|
|
|
|
+ for (const c of listYamlCollections()) {
|
|
|
|
|
+ if (c.chunkStrategy) collectionStrategies.set(c.name, c.chunkStrategy);
|
|
|
|
|
+ }
|
|
|
|
|
+ } catch {
|
|
|
|
|
+ // If YAML config is missing/unreadable, fall back silently to the
|
|
|
|
|
+ // global strategy — no collection overrides. Keeps SDK/inline
|
|
|
|
|
+ // callers that never touch ~/.config/qmd working.
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
// Use store's LlamaCpp or global singleton, wrapped in a session
|
|
// Use store's LlamaCpp or global singleton, wrapped in a session
|
|
|
const llm = getLlm(store);
|
|
const llm = getLlm(store);
|
|
|
const embedModelUri = llm.embedModelName;
|
|
const embedModelUri = llm.embedModelName;
|
|
@@ -1446,11 +1469,13 @@ export async function generateEmbeddings(
|
|
|
if (!doc.body.trim()) continue;
|
|
if (!doc.body.trim()) continue;
|
|
|
|
|
|
|
|
const title = extractTitle(doc.body, doc.path);
|
|
const title = extractTitle(doc.body, doc.path);
|
|
|
|
|
+ const perCollectionStrategy = collectionStrategies.get(doc.collection);
|
|
|
|
|
+ const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
|
|
|
const chunks = await chunkDocumentByTokens(
|
|
const chunks = await chunkDocumentByTokens(
|
|
|
doc.body,
|
|
doc.body,
|
|
|
undefined, undefined, undefined,
|
|
undefined, undefined, undefined,
|
|
|
doc.path,
|
|
doc.path,
|
|
|
- options?.chunkStrategy,
|
|
|
|
|
|
|
+ chunkStrategy,
|
|
|
session.signal,
|
|
session.signal,
|
|
|
);
|
|
);
|
|
|
|
|
|
|
@@ -2171,8 +2196,12 @@ export function chunkDocument(
|
|
|
* break points for supported code files, merges with regex break points,
|
|
* break points for supported code files, merges with regex break points,
|
|
|
* and delegates to the shared chunk algorithm.
|
|
* and delegates to the shared chunk algorithm.
|
|
|
*
|
|
*
|
|
|
- * Falls back to regex-only when strategy is "regex", filepath is absent,
|
|
|
|
|
- * or language is unsupported.
|
|
|
|
|
|
|
+ * Strategies:
|
|
|
|
|
+ * - "regex" (default) — char-based chunking with regex break points only.
|
|
|
|
|
+ * - "auto" — regex break points merged with AST break points (soft hints).
|
|
|
|
|
+ * - "function" — one chunk per AST function range (Phase 2); inter-range
|
|
|
|
|
+ * gaps (imports, top-level code) are char-chunked with regex
|
|
|
|
|
+ * break-point hints. Falls back to "auto" when zero ranges are detected.
|
|
|
*/
|
|
*/
|
|
|
export async function chunkDocumentAsync(
|
|
export async function chunkDocumentAsync(
|
|
|
content: string,
|
|
content: string,
|
|
@@ -2185,6 +2214,29 @@ export async function chunkDocumentAsync(
|
|
|
const regexPoints = scanBreakPoints(content);
|
|
const regexPoints = scanBreakPoints(content);
|
|
|
const codeFences = findCodeFences(content);
|
|
const codeFences = findCodeFences(content);
|
|
|
|
|
|
|
|
|
|
+ // "function" strategy: delegate to the function-level chunker. If no
|
|
|
|
|
+ // ranges are detected (markdown, unsupported lang, parse failure), fall
|
|
|
|
|
+ // back to "auto" behavior (AST-break-point-assisted char chunking).
|
|
|
|
|
+ if (chunkStrategy === "function" && filepath) {
|
|
|
|
|
+ const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js");
|
|
|
|
|
+ const ranges = await getASTFunctionRanges(content, filepath);
|
|
|
|
|
+ if (ranges.length > 0) {
|
|
|
|
|
+ return chunkByFunctionRanges(
|
|
|
|
|
+ content,
|
|
|
|
|
+ ranges,
|
|
|
|
|
+ regexPoints,
|
|
|
|
|
+ codeFences,
|
|
|
|
|
+ maxChars,
|
|
|
|
|
+ overlapChars,
|
|
|
|
|
+ windowChars,
|
|
|
|
|
+ );
|
|
|
|
|
+ }
|
|
|
|
|
+ // Zero ranges — fall through to auto behavior so break points still help.
|
|
|
|
|
+ const astPoints = await getASTBreakPoints(content, filepath);
|
|
|
|
|
+ const merged = astPoints.length > 0 ? mergeBreakPoints(regexPoints, astPoints) : regexPoints;
|
|
|
|
|
+ return chunkDocumentWithBreakPoints(content, merged, codeFences, maxChars, overlapChars, windowChars);
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
let breakPoints = regexPoints;
|
|
let breakPoints = regexPoints;
|
|
|
if (chunkStrategy === "auto" && filepath) {
|
|
if (chunkStrategy === "auto" && filepath) {
|
|
|
const { getASTBreakPoints } = await import("./ast.js");
|
|
const { getASTBreakPoints } = await import("./ast.js");
|
|
@@ -2197,6 +2249,99 @@ export async function chunkDocumentAsync(
|
|
|
return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
|
|
return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Produce one chunk per AST function range, plus chunks for the gaps
 * between ranges (imports, top-level code).
 *
 * Any span (function range or gap) longer than `maxChars` is split further
 * with `chunkDocumentWithBreakPoints`, using the regex break points and code
 * fences that fall inside the span (rebased to span-local offsets), so a
 * single oversized chunk is never emitted.
 *
 * Expected input: `ranges` is non-empty, sorted by `startIndex`, and
 * non-overlapping. The function is nevertheless defensive: if a range starts
 * before the end of the previous one (overlap / nesting / unsorted input) its
 * start is clamped to the current cursor, and the cursor never moves
 * backwards, so no part of `content` is emitted twice.
 *
 * @param content      Full document text.
 * @param ranges       Function ranges as half-open `[startIndex, endIndex)`
 *                     offsets into `content`. Only these two fields are read,
 *                     so any `FunctionRange`-shaped value is accepted.
 * @param regexPoints  Break points for the whole document (absolute offsets).
 * @param codeFences   Code fence regions for the whole document (absolute offsets).
 * @param maxChars     Maximum chunk length before a span is split further.
 * @param overlapChars Forwarded to `chunkDocumentWithBreakPoints` for oversized spans.
 * @param windowChars  Forwarded to `chunkDocumentWithBreakPoints` for oversized spans.
 * @returns Chunks in document order; `pos` is the absolute offset of each
 *          chunk's text within `content`. Non-empty content always yields at
 *          least one chunk.
 */
function chunkByFunctionRanges(
  content: string,
  ranges: ReadonlyArray<{ startIndex: number; endIndex: number }>,
  regexPoints: BreakPoint[],
  codeFences: CodeFenceRegion[],
  maxChars: number,
  overlapChars: number,
  windowChars: number,
): { text: string; pos: number }[] {
  const out: { text: string; pos: number }[] = [];

  // Emit content[start, end) as one chunk, or as several when it exceeds maxChars.
  const pushSpan = (start: number, end: number): void => {
    if (start >= end) return;
    const text = content.slice(start, end);
    if (text.length === 0) return;

    if (text.length <= maxChars) {
      out.push({ text, pos: start });
      return;
    }

    // Oversized span: reuse the char-based algorithm on a standalone slice.
    // Break points and fences are restricted to the span and rebased so that
    // offset 0 is the start of the slice; fences are clipped to the span end.
    const subPoints = regexPoints
      .filter(p => p.pos >= start && p.pos < end)
      .map(p => ({ ...p, pos: p.pos - start }));
    const subFences = codeFences
      .filter(f => f.end > start && f.start < end)
      .map(f => ({
        start: Math.max(0, f.start - start),
        end: Math.max(0, Math.min(end, f.end) - start),
      }));
    const pieces = chunkDocumentWithBreakPoints(text, subPoints, subFences, maxChars, overlapChars, windowChars);
    // Translate slice-local positions back to absolute document offsets.
    for (const piece of pieces) out.push({ text: piece.text, pos: start + piece.pos });
  };

  // Emit a gap between ranges; whitespace-only gaps carry no embeddable signal.
  const pushGap = (start: number, end: number): void => {
    if (start >= end) return;
    if (!content.slice(start, end).trim()) return;
    pushSpan(start, end);
  };

  let cursor = 0;
  for (const range of ranges) {
    // Clamp to the cursor so overlapping/nested ranges never re-emit text.
    const rangeStart = Math.max(cursor, range.startIndex);

    // Leading / inter-range gap (imports, top-level code).
    pushGap(cursor, rangeStart);
    // The function/class body itself (kept even if it is only whitespace).
    pushSpan(rangeStart, range.endIndex);

    // Never move backwards, even if this range ends before the previous one.
    cursor = Math.max(cursor, range.endIndex);
  }

  // Trailing gap after the last range.
  pushGap(cursor, content.length);

  // Edge case: every range was empty and every gap was whitespace-only.
  // Preserve the invariant that non-empty content yields at least one chunk.
  if (out.length === 0 && content.length > 0) {
    return [{ text: content, pos: 0 }];
  }

  return out;
}
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* Chunk a document by actual token count using the LLM tokenizer.
|
|
* Chunk a document by actual token count using the LLM tokenizer.
|
|
|
* More accurate than character-based chunking but requires async.
|
|
* More accurate than character-based chunking but requires async.
|