hace 2 meses · 56a77d5769
--- a/dist/ast.d.ts
+++ b/dist/ast.d.ts
@@ -47,6 +47,33 @@ export declare function getASTStatus(): Promise<{
 
				         error?: string;
			
 
				     }[];
			
 
				 }>;
			
 
				/**
				 * Span of a single code unit (function, method, class, interface, struct,
				 * impl, trait, type, ...) as located by the AST pass.
				 *
				 * The `"function"` chunk strategy in store.ts emits one chunk per range
				 * instead of cutting the file into character windows.
				 */
				export interface FunctionRange {
				    /** Offset where the unit begins (the captured node's `startIndex`). */
				    startIndex: number;
				    /** Offset where the unit ends (the captured node's `endIndex`). */
				    endIndex: number;
				    /** Capture kind prefixed with `ast:` (for example `ast:func`). */
				    type: string;
				    /** Best-effort display name; absent when none could be extracted. */
				    name?: string;
				}
				/**
				 * Parse a source file and report the range of every code unit that should
				 * become its own chunk under the `"function"` chunk strategy.
				 *
				 * Unsupported languages, grammar loading failures and parse failures all
				 * produce an empty array rather than an exception. The parser/grammar/query
				 * caches populated by `getASTBreakPoints` are reused.
				 *
				 * @param content - The file content to parse.
				 * @param filepath - The file path (used for language detection).
				 * @returns Non-overlapping FunctionRange objects, sorted by startIndex.
				 */
				export declare function getASTFunctionRanges(content: string, filepath: string): Promise<FunctionRange[]>;
			
 
				 /**
			
 
				  * Metadata about a code symbol within a chunk.
			
 
				  * Stubbed for Phase 2 — always returns empty array in Phase 1.
			
--- a/dist/ast.js
+++ b/dist/ast.js
@@ -337,6 +337,144 @@ export async function getASTStatus() {
 
				         languages,
			
 
				     };
			
 
				 }
			
 
				/**
				 * Query capture names treated as a "function-like unit", i.e. a span of
				 * code that should stay together in one chunk. One shared set serves every
				 * language because capture names are normalized in `LANGUAGE_QUERIES`, so
				 * `getASTFunctionRanges` needs no per-language table. Any capture whose name
				 * is not listed here (e.g. `import`) is ignored and ends up in the gaps
				 * between ranges, which are char-chunked.
				 *
				 * Members, in insertion order:
				 *   - export    — TS/JS: export_statement wrapping a decl (preserves outer start)
				 *   - class     — TS/JS/Py/Java/Kotlin/etc.
				 *   - iface     — TS: interface_declaration; Java: interface / annotation_type_declaration
				 *   - func      — function_declaration + arrow/function-expression lexical_declaration
				 *   - method    — method_definition / method_declaration / constructor_declaration
				 *   - type      — TS: type_alias_declaration; Go: type_declaration; Rust: type_item; Kotlin: type_alias
				 *   - enum      — TS/Rust/Java: enum declarations
				 *   - decorated — Python: decorated_definition (preserves decorators)
				 *   - struct, impl, trait, mod — Rust
				 */
				const FUNCTION_CAPTURE_NAMES = new Set(
				    "export class iface func method type enum decorated struct impl trait mod".split(" "));
			
 
				+/**
			
 
				+ * Try to pull a human-readable name out of an AST node. Best-effort —
			
 
				+ * returns `undefined` when the node shape doesn't expose a simple name
			
 
				+ * child. Used for debugging / display and not for correctness.
			
 
				+ */
			
 
				+function extractNodeName(node) {
			
 
				+    // Common shape: `(function_declaration name: (identifier))` etc.
			
 
				+    const nameChild = node.childForFieldName?.("name");
			
 
				+    if (nameChild && nameChild.text)
			
 
				+        return nameChild.text;
			
 
				+    // TS lexical_declaration: `const foo = () => ...` — first declarator's identifier.
			
 
				+    const declarator = node.namedChildren?.find(c => c?.type === "variable_declarator");
			
 
				+    if (declarator) {
			
 
				+        const id = declarator.childForFieldName?.("name");
			
 
				+        if (id && id.text)
			
 
				+            return id.text;
			
 
				+    }
			
 
				+    // export_statement / decorated_definition — recurse into the wrapped decl.
			
 
				+    const inner = node.namedChildren?.find(c => c != null && (c.type === "class_declaration" ||
			
 
				+        c.type === "function_declaration" ||
			
 
				+        c.type === "interface_declaration" ||
			
 
				+        c.type === "type_alias_declaration" ||
			
 
				+        c.type === "enum_declaration" ||
			
 
				+        c.type === "lexical_declaration" ||
			
 
				+        c.type === "function_definition" ||
			
 
				+        c.type === "class_definition"));
			
 
				+    if (inner)
			
 
				+        return extractNodeName(inner);
			
 
				+    return undefined;
			
 
				+}
			
 
				+/**
			
 
				+ * Deduplicate overlapping ranges produced by the same AST pass.
			
 
				+ *
			
 
				+ * Tree-sitter emits multiple captures for the same region — e.g. an
			
 
				+ * `export class Foo {}` matches both `export` and `class`. We want ONE
			
 
				+ * range per region, preferring the outermost (earliest startIndex, largest
			
 
				+ * endIndex). When two captures start at the same position we keep the
			
 
				+ * one with the larger end (typically the wrapper — export/decorated).
			
 
				+ *
			
 
				+ * After this pass no two ranges overlap (strictly: for any a, b either
			
 
				+ * a.endIndex <= b.startIndex or b.endIndex <= a.startIndex).
			
 
				+ */
			
 
				+function dedupeFunctionRanges(ranges) {
			
 
				+    if (ranges.length === 0)
			
 
				+        return ranges;
			
 
				+    const sorted = [...ranges].sort((a, b) => {
			
 
				+        if (a.startIndex !== b.startIndex)
			
 
				+            return a.startIndex - b.startIndex;
			
 
				+        return b.endIndex - a.endIndex; // larger wrapper wins at same start
			
 
				+    });
			
 
				+    const result = [];
			
 
				+    for (const r of sorted) {
			
 
				+        const last = result[result.length - 1];
			
 
				+        if (last && r.startIndex < last.endIndex) {
			
 
				+            // r is contained in or overlaps last — last is the outer/earlier range; drop r.
			
 
				+            continue;
			
 
				+        }
			
 
				+        result.push(r);
			
 
				+    }
			
 
				+    return result;
			
 
				+}
			
 
				/**
				 * Parse a source file and return the range of every code unit that should
				 * be its own chunk under the `"function"` chunk strategy.
				 *
				 * Returns an empty array when the language cannot be detected or the grammar
				 * cannot be loaded. Any error raised while initializing, parsing or querying
				 * is logged with `console.warn` and also reported as an empty array. Reuses
				 * the parser/grammar/query caches already populated by `getASTBreakPoints`.
				 *
				 * The parser and syntax tree are released in a `finally` block, so they are
				 * freed even when query compilation or capture extraction throws.
				 *
				 * @param content - The file content to parse.
				 * @param filepath - The file path (used for language detection).
				 * @returns Array of non-overlapping FunctionRange objects, sorted by startIndex.
				 */
				export async function getASTFunctionRanges(content, filepath) {
				    const language = detectLanguage(filepath);
				    if (!language)
				        return [];
				    let parser;
				    let tree;
				    try {
				        await ensureInit();
				        const grammar = await loadGrammar(language);
				        if (!grammar)
				            return [];
				        parser = new ParserClass();
				        parser.setLanguage(grammar);
				        tree = parser.parse(content);
				        if (!tree)
				            return [];
				        const query = getQuery(language, grammar);
				        const ranges = [];
				        for (const cap of query.captures(tree.rootNode)) {
				            // Captures such as `import` are not function-like units; skip them.
				            if (!FUNCTION_CAPTURE_NAMES.has(cap.name))
				                continue;
				            ranges.push({
				                startIndex: cap.node.startIndex,
				                endIndex: cap.node.endIndex,
				                type: `ast:${cap.name}`,
				                name: extractNodeName(cap.node),
				            });
				        }
				        return dedupeFunctionRanges(ranges);
				    }
				    catch (err) {
				        console.warn(`[qmd] AST function-range extraction failed for ${filepath}, returning empty: ${err instanceof Error ? err.message : err}`);
				        return [];
				    }
				    finally {
				        // Release the tree first, then the parser. A failing delete() must
				        // not replace the value being returned, so each call is guarded.
				        for (const handle of [tree, parser]) {
				            try {
				                handle?.delete();
				            }
				            catch {
				                // Nothing useful can be done if cleanup itself fails.
				            }
				        }
				    }
				}
			
 
				 /**
			
 
				  * Extract symbol metadata for code within a byte range.
			
 
				  * Stubbed for Phase 2 — returns empty array.
			
--- a/dist/cli/qmd.js
+++ b/dist/cli/qmd.js
@@ -1415,9 +1415,9 @@ function parseChunkStrategy(value) {
 
				     if (value === undefined)
			
 
				         return undefined;
			
 
				     const s = String(value);
			
 
				-    if (s === "auto" || s === "regex")
			
 
				+    if (s === "auto" || s === "regex" || s === "function")
			
 
				         return s;
			
 
				-    throw new Error(`--chunk-strategy must be "auto" or "regex" (got "${s}")`);
			
 
				+    throw new Error(`--chunk-strategy must be "auto", "regex", or "function" (got "${s}")`);
			
 
				 }
			
 
				 async function vectorIndex(model = DEFAULT_EMBED_MODEL_URI, force = false, batchOptions) {
			
 
				     const storeInstance = getStore();
			
--- a/dist/collections.d.ts
+++ b/dist/collections.d.ts
@@ -4,6 +4,7 @@
 
				  * This module manages the YAML-based collection configuration at ~/.config/qmd/index.yml.
			
 
				  * Collections define which directories to index and their associated contexts.
			
 
				  */
			
 
				+import type { ChunkStrategy } from "./store.js";
			
 
				 /**
			
 
				  * Context definitions for a collection
			
 
				  * Key is path prefix (e.g., "/", "/2024", "/Board of Directors")
			
@@ -20,6 +21,21 @@ export interface Collection {
 
				     context?: ContextMap;
			
 
				     update?: string;
			
 
				     includeByDefault?: boolean;
			
 
				+    /**
			
 
				+     * Chunking strategy for this collection (Phase 2 — i-bud0h8vu). When
			
 
				+     * unset, qmd falls back to the global CLI `--chunk-strategy` flag.
			
 
				+     *
			
 
				+     *   - "auto"     — char-based chunks with AST break points as hints.
			
 
				+     *   - "regex"    — char-based chunks without AST hints (legacy).
			
 
				+     *   - "function" — one chunk per AST function/class/method range for
			
 
				+     *                  supported code files. Opt-in per collection; files
			
 
				+     *                  with zero detected ranges fall back to "auto".
			
 
				+     *
			
 
				+     * Changing this value requires a per-collection force-reindex
			
 
				+     * (`qmd update --force <collection>`). The `content_hash`-keyed rows
			
 
				+     * replace in-place, so other collections are unaffected.
			
 
				+     */
			
 
				+    chunkStrategy?: ChunkStrategy;
			
 
				 }
			
 
				 /**
			
 
				  * Model configuration for embedding, reranking, and generation
			
--- a/dist/store.d.ts
+++ b/dist/store.d.ts
@@ -78,7 +78,7 @@ export declare function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]
 
				  * @returns The best position to cut at
			
 
				  */
			
 
				 export declare function findBestCutoff(breakPoints: BreakPoint[], targetCharPos: number, windowChars?: number, decayFactor?: number, codeFences?: CodeFenceRegion[]): number;
			
 
				-export type ChunkStrategy = "auto" | "regex";
			
 
				+export type ChunkStrategy = "auto" | "regex" | "function";
			
 
				 /**
			
 
				  * Merge two sets of break points (e.g. regex + AST), keeping the highest
			
 
				  * score at each position. Result is sorted by position.
			
@@ -540,8 +540,12 @@ export declare function chunkDocument(content: string, maxChars?: number, overla
 
				  * break points for supported code files, merges with regex break points,
			
 
				  * and delegates to the shared chunk algorithm.
			
 
				  *
			
 
				- * Falls back to regex-only when strategy is "regex", filepath is absent,
			
 
				- * or language is unsupported.
			
 
				+ * Strategies:
			
 
				+ *   - "regex"    (default) — char-based chunking with regex break points only.
			
 
				+ *   - "auto"     — regex break points merged with AST break points (soft hints).
			
 
				+ *   - "function" — one chunk per AST function range (Phase 2); inter-range
			
 
				+ *                  gaps (imports, top-level code) are char-chunked using regex
			
 
				+ *                  break points. Falls back to "auto" when zero ranges are detected.
			
 
				  */
			
 
				 export declare function chunkDocumentAsync(content: string, maxChars?: number, overlapChars?: number, windowChars?: number, filepath?: string, chunkStrategy?: ChunkStrategy): Promise<{
			
 
				     text: string;
			
@@ -685,6 +689,11 @@ export declare function sanitizeFTS5Term(term: string): string;
 
				 /**
			
 
				  * Validate that a vec/hyde query doesn't use lex-only syntax.
			
 
				  * Returns error message if invalid, null if valid.
			
 
				+ *
			
 
				+ * Negation is detected ONLY when `-` is preceded by whitespace or sits at
			
 
				+ * the start of the query. Hyphens inside words (e.g. `auto-archived`,
			
 
				+ * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation
			
 
				+ * semantics in natural English and must pass through unchanged.
			
 
				  */
			
 
				 export declare function validateSemanticQuery(query: string): string | null;
			
 
				 export declare function validateLexQuery(query: string): string | null;
			
--- a/dist/store.js
+++ b/dist/store.js
@@ -956,8 +956,13 @@ function resolveEmbedOptions(options) {
 
				     };
			
 
				 }
			
 
				 function getPendingEmbeddingDocs(db) {
			
 
				+    // `MIN(d.collection)` deterministically picks one collection per hash when
			
 
				+    // the same content is indexed in multiple collections (SQLite tie-breaks
			
 
				+    // alphabetically). The identical bytes produce identical chunks regardless
			
 
				+    // of which collection wins; the chunkStrategy lookup still resolves via
			
 
				+    // that collection's YAML config. See Phase 2 design notes (i-bud0h8vu).
			
 
				     return db.prepare(`
			
 
				-    SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
			
 
				+    SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
			
 
				     FROM documents d
			
 
				     JOIN content c ON d.hash = c.hash
			
 
				     LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
			
@@ -1023,6 +1028,24 @@ export async function generateEmbeddings(store, options) {
 
				     const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
			
 
				     const totalDocs = docsToEmbed.length;
			
 
				     const startTime = Date.now();
			
 
				+    // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML
			
 
				+    // `chunkStrategy` on a collection wins over `options.chunkStrategy`
			
 
				+    // (global CLI flag); falls back to the global option, then to
			
 
				+    // chunkDocumentByTokens' own "regex" default when neither is set.
			
 
				+    // Opt-in per collection — collections without the field are untouched.
			
 
				+    const collectionStrategies = new Map();
			
 
				+    try {
			
 
				+        const { listCollections: listYamlCollections } = await import("./collections.js");
			
 
				+        for (const c of listYamlCollections()) {
			
 
				+            if (c.chunkStrategy)
			
 
				+                collectionStrategies.set(c.name, c.chunkStrategy);
			
 
				+        }
			
 
				+    }
			
 
				+    catch {
			
 
				+        // If YAML config is missing/unreadable, fall back silently to the
			
 
				+        // global strategy — no collection overrides. Keeps SDK/inline
			
 
				+        // callers that never touch ~/.config/qmd working.
			
 
				+    }
			
 
				     // Use store's LlamaCpp or global singleton, wrapped in a session
			
 
				     const llm = getLlm(store);
			
 
				     const embedModelUri = llm.embedModelName;
			
@@ -1048,7 +1071,9 @@ export async function generateEmbeddings(store, options) {
 
				                 if (!doc.body.trim())
			
 
				                     continue;
			
 
				                 const title = extractTitle(doc.body, doc.path);
			
 
				-                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, options?.chunkStrategy, session.signal);
			
 
				+                const perCollectionStrategy = collectionStrategies.get(doc.collection);
			
 
				+                const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
			
 
				+                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal);
			
 
				                 for (let seq = 0; seq < chunks.length; seq++) {
			
 
				                     batchChunks.push({
			
 
				                         hash: doc.hash,
			
@@ -1557,12 +1582,30 @@ export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars
 
				  * break points for supported code files, merges with regex break points,
			
 
				  * and delegates to the shared chunk algorithm.
			
 
				  *
			
 
				- * Falls back to regex-only when strategy is "regex", filepath is absent,
			
 
				- * or language is unsupported.
			
 
				+ * Strategies:
			
 
				+ *   - "regex"    (default) — char-based chunking with regex break points only.
			
 
				+ *   - "auto"     — regex break points merged with AST break points (soft hints).
			
 
				+ *   - "function" — one chunk per AST function range (Phase 2); inter-range
			
 
				+ *                  gaps (imports, top-level code) are char-chunked using regex
			
 
				+ *                  break points. Falls back to "auto" when zero ranges are detected.
			
 
				  */
			
 
				 export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
			
 
				     const regexPoints = scanBreakPoints(content);
			
 
				     const codeFences = findCodeFences(content);
			
 
				+    // "function" strategy: delegate to the function-level chunker. If no
			
 
				+    // ranges are detected (markdown, unsupported lang, parse failure), fall
			
 
				+    // back to "auto" behavior (AST-break-point-assisted char chunking).
			
 
				+    if (chunkStrategy === "function" && filepath) {
			
 
				+        const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js");
			
 
				+        const ranges = await getASTFunctionRanges(content, filepath);
			
 
				+        if (ranges.length > 0) {
			
 
				+            return chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars);
			
 
				+        }
			
 
				+        // Zero ranges — fall through to auto behavior so break points still help.
			
 
				+        const astPoints = await getASTBreakPoints(content, filepath);
			
 
				+        const merged = astPoints.length > 0 ? mergeBreakPoints(regexPoints, astPoints) : regexPoints;
			
 
				+        return chunkDocumentWithBreakPoints(content, merged, codeFences, maxChars, overlapChars, windowChars);
			
 
				+    }
			
 
				     let breakPoints = regexPoints;
			
 
				     if (chunkStrategy === "auto" && filepath) {
			
 
				         const { getASTBreakPoints } = await import("./ast.js");
			
@@ -1573,6 +1616,85 @@ export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, o
 
				     }
			
 
				     return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
			
 
				 }
			
 
				+/**
			
 
				+ * Produce one chunk per AST function range, plus char-chunks for the gaps
			
 
				+ * between ranges (imports, top-level code). Ranges that exceed `maxChars`
			
 
				+ * are further split using the existing char-based algorithm so we never
			
 
				+ * emit a single oversized chunk.
			
 
				+ *
			
 
				+ * Preconditions: `ranges` is non-empty, sorted by `startIndex`, and the
			
 
				+ * ranges are non-overlapping (as produced by `getASTFunctionRanges`).
			
 
				+ */
			
 
				+function chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars) {
			
 
				+    const out = [];
			
 
				+    let cursor = 0;
			
 
				+    const emitGap = (start, end) => {
			
 
				+        if (start >= end)
			
 
				+            return;
			
 
				+        const gap = content.slice(start, end);
			
 
				+        // Whitespace-only gaps are dropped — they carry no embeddable signal.
			
 
				+        if (!gap.trim())
			
 
				+            return;
			
 
				+        if (gap.length <= maxChars) {
			
 
				+            out.push({ text: gap, pos: start });
			
 
				+            return;
			
 
				+        }
			
 
				+        // Reuse char-based algorithm for oversized gaps. Restrict break
			
 
				+        // points and code fences to the gap window and rebase positions so
			
 
				+        // chunkDocumentWithBreakPoints operates on a standalone slice.
			
 
				+        const subPoints = regexPoints
			
 
				+            .filter(p => p.pos >= start && p.pos < end)
			
 
				+            .map(p => ({ ...p, pos: p.pos - start }));
			
 
				+        const subFences = codeFences
			
 
				+            .filter(f => f.end > start && f.start < end)
			
 
				+            .map(f => ({
			
 
				+            start: Math.max(0, f.start - start),
			
 
				+            end: Math.max(0, Math.min(end, f.end) - start),
			
 
				+        }));
			
 
				+        const sub = chunkDocumentWithBreakPoints(gap, subPoints, subFences, maxChars, overlapChars, windowChars);
			
 
				+        for (const c of sub)
			
 
				+            out.push({ text: c.text, pos: start + c.pos });
			
 
				+    };
			
 
				+    for (const range of ranges) {
			
 
				+        // Emit any leading / inter-range gap (imports, top-level code).
			
 
				+        emitGap(cursor, range.startIndex);
			
 
				+        const body = content.slice(range.startIndex, range.endIndex);
			
 
				+        if (body.length === 0) {
			
 
				+            cursor = range.endIndex;
			
 
				+            continue;
			
 
				+        }
			
 
				+        if (body.length <= maxChars) {
			
 
				+            out.push({ text: body, pos: range.startIndex });
			
 
				+        }
			
 
				+        else {
			
 
				+            // Oversized function/class — split with char algorithm so we stay
			
 
				+            // under the embed token budget. Break points inside the range are
			
 
				+            // reused to keep splits at syntactically-sensible positions.
			
 
				+            const subPoints = regexPoints
			
 
				+                .filter(p => p.pos >= range.startIndex && p.pos < range.endIndex)
			
 
				+                .map(p => ({ ...p, pos: p.pos - range.startIndex }));
			
 
				+            const subFences = codeFences
			
 
				+                .filter(f => f.end > range.startIndex && f.start < range.endIndex)
			
 
				+                .map(f => ({
			
 
				+                start: Math.max(0, f.start - range.startIndex),
			
 
				+                end: Math.max(0, Math.min(range.endIndex, f.end) - range.startIndex),
			
 
				+            }));
			
 
				+            const sub = chunkDocumentWithBreakPoints(body, subPoints, subFences, maxChars, overlapChars, windowChars);
			
 
				+            for (const c of sub)
			
 
				+                out.push({ text: c.text, pos: range.startIndex + c.pos });
			
 
				+        }
			
 
				+        cursor = range.endIndex;
			
 
				+    }
			
 
				+    // Trailing gap after the last range.
			
 
				+    emitGap(cursor, content.length);
			
 
				+    // Edge case: content consisted entirely of whitespace-only gaps (zero
			
 
				+    // emitted chunks). Preserve the invariant that non-empty content yields
			
 
				+    // at least one chunk.
			
 
				+    if (out.length === 0 && content.length > 0) {
			
 
				+        return [{ text: content, pos: 0 }];
			
 
				+    }
			
 
				+    return out;
			
 
				+}
			
 
				 /**
			
 
				  * Chunk a document by actual token count using the LLM tokenizer.
			
 
				  * More accurate than character-based chunking but requires async.
			
@@ -2197,10 +2319,15 @@ function buildFTS5Query(query) {
 
				 /**
			
 
				  * Validate that a vec/hyde query doesn't use lex-only syntax.
			
 
				  * Returns error message if invalid, null if valid.
			
 
				+ *
			
 
				+ * Negation is detected ONLY when `-` is preceded by whitespace or sits at
			
 
				+ * the start of the query. Hyphens inside words (e.g. `auto-archived`,
			
 
				+ * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation
			
 
				+ * semantics in natural English and must pass through unchanged.
			
 
				  */
			
 
				 export function validateSemanticQuery(query) {
			
 
				-    // Check for negation syntax
			
 
				-    if (/-\w/.test(query) || /-"/.test(query)) {
			
 
				+    // `-term` or `-"phrase"` only counts as negation at SOS or after whitespace.
			
 
				+    if (/(?:^|\s)-\w/.test(query) || /(?:^|\s)-"/.test(query)) {
			
 
				         return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
			
 
				     }
			
 
				     return null;
			
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 
				 {
			
 
				   "name": "@oivo/qmd",
			
 
				-  "version": "2.1.0",
			
 
				+  "version": "2.1.1-oivo.0",
			
 
				   "description": "Query Markup Documents - On-device hybrid search for markdown files with BM25, vector search, and LLM reranking",
			
 
				   "type": "module",
			
 
				   "main": "dist/index.js",
			
--- a/src/store.ts
+++ b/src/store.ts
@@ -3049,10 +3049,15 @@ function buildFTS5Query(query: string): string | null {
 
				 /**
			
 
				  * Validate that a vec/hyde query doesn't use lex-only syntax.
			
 
				  * Returns error message if invalid, null if valid.
			
 
				+ *
			
 
				+ * Negation is detected ONLY when `-` is preceded by whitespace or sits at
			
 
				+ * the start of the query. Hyphens inside words (e.g. `auto-archived`,
			
 
				+ * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation
			
 
				+ * semantics in natural English and must pass through unchanged.
			
 
				  */
			
 
				 export function validateSemanticQuery(query: string): string | null {
			
 
				-  // Check for negation syntax
			
 
				-  if (/-\w/.test(query) || /-"/.test(query)) {
			
 
				+  // `-term` or `-"phrase"` only counts as negation at SOS or after whitespace.
			
 
				+  if (/(?:^|\s)-\w/.test(query) || /(?:^|\s)-"/.test(query)) {
			
 
				     return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
			
 
				   }
			
 
				   return null;
			
--- a/test/structured-search.test.ts
+++ b/test/structured-search.test.ts
@@ -366,6 +366,29 @@ describe("lex query syntax", () => {
 
				       expect(validateSemanticQuery('-"exact phrase"')).toContain("Negation");
			
 
				     });
			
 
				 
			
 
				+    test("rejects mid-query quoted negation", () => {
			
 
				+      expect(validateSemanticQuery('foo -"exact phrase"')).toContain("Negation");
			
 
				+    });
			
 
				+
			
 
				+    test("accepts hyphenated words (no negation)", () => {
			
 
				+      // Regression for the hyphen-parsing UX bug: hyphens inside words must
			
 
				+      // NOT be read as negation operators. See `validateSemanticQuery` doc.
			
 
				+      expect(validateSemanticQuery("when does a completed session get auto-archived")).toBeNull();
			
 
				+      expect(validateSemanticQuery("pre-commit hook")).toBeNull();
			
 
				+      expect(validateSemanticQuery("multi-session coordination")).toBeNull();
			
 
				+      expect(validateSemanticQuery("cross-machine file ops")).toBeNull();
			
 
				+      expect(validateSemanticQuery("long-running process")).toBeNull();
			
 
				+      expect(validateSemanticQuery("well-known endpoint")).toBeNull();
			
 
				+      expect(validateSemanticQuery("out-of-scope edits")).toBeNull();
			
 
				+      expect(validateSemanticQuery("state-of-the-art model")).toBeNull();
			
 
				+    });
			
 
				+
			
 
				+    test("accepts hyphenated word at start of query", () => {
			
 
				+      // Leading hyphenated word starts with a letter, not `-`, so the SOS
			
 
				+      // rule does not fire — confirm via an explicit start-of-string case.
			
 
				+      expect(validateSemanticQuery("auto-archived session")).toBeNull();
			
 
				+    });
			
 
				+
			
 
				 
			
 
				     test("accepts hyde-style hypothetical answers", () => {
			
 
				       expect(validateSemanticQuery(