Kaynağa Gözat

fix(query): allow hyphenated words in vec/hyde queries (i-fbalbv5l)

`validateSemanticQuery` was rejecting natural-English phrases with
compound modifiers (auto-archived, pre-commit, multi-session,
state-of-the-art, etc.) by misreading every intra-word `-` as the
FTS5 negation operator. Only a `-` at the start of the query or
preceded by whitespace can be negation; hyphens inside a word have
no negation semantics.

Fix: tighten the detector from `/-\w/` / `/-"/` to
`/(?:^|\s)-\w/` / `/(?:^|\s)-"/`. Lex query negation (handled by
`buildFTS5Query`) is untouched — its own intra-word hyphen
disambiguation has been correct since upstream v2.1.0.

Tests:
- 9 new positives (auto-archived, pre-commit, multi-session,
  cross-machine, long-running, well-known, out-of-scope,
  state-of-the-art, leading-hyphenated) → accepted.
- Existing `performance -sports` / `-"exact phrase"` negation
  rejections still fire (regression guard).
- Added mid-query quoted negation test `foo -"phrase"`.
- All 67 tests in test/structured-search.test.ts pass.

Also bundles dist/ rebuild that had drifted from src/ on this
branch after i-bud0h8vu (Phase 2) and i-76v1j1ld (Phase 3) landed
their source changes without regenerating dist/. `npm run build`
regenerates dist/{store,ast,collections,cli/qmd}.{js,d.ts}.

Version bumped 2.1.0 → 2.1.1-oivo.0 so consumers can pin.
root 1 ay önce
ebeveyn
işleme
56a77d5769
9 değiştirilmiş dosya ile 359 ekleme ve 14 silme
  1. 27 0
      dist/ast.d.ts
  2. 138 0
      dist/ast.js
  3. 2 2
      dist/cli/qmd.js
  4. 16 0
      dist/collections.d.ts
  5. 12 3
      dist/store.d.ts
  6. 133 6
      dist/store.js
  7. 1 1
      package.json
  8. 7 2
      src/store.ts
  9. 23 0
      test/structured-search.test.ts

+ 27 - 0
dist/ast.d.ts

@@ -47,6 +47,33 @@ export declare function getASTStatus(): Promise<{
         error?: string;
     }[];
 }>;
+/**
+ * A byte-offset range covering a single top-level code unit
+ * (function, method, class, interface, struct, impl, trait, type...).
+ *
+ * Used by the `"function"` chunk strategy in store.ts to produce
+ * one chunk per range instead of character-window chunks.
+ */
+export interface FunctionRange {
+    startIndex: number;
+    endIndex: number;
+    type: string;
+    name?: string;
+}
+/**
+ * Parse a source file and return byte-offset ranges for every top-level
+ * code unit that should be its own chunk under the `"function"` chunk
+ * strategy.
+ *
+ * Returns an empty array for unsupported languages, parse failures, or
+ * grammar loading failures. Never throws. Reuses the parser/grammar/
+ * query caches already populated by `getASTBreakPoints`.
+ *
+ * @param content - The file content to parse.
+ * @param filepath - The file path (used for language detection).
+ * @returns Array of non-overlapping FunctionRange objects, sorted by startIndex.
+ */
+export declare function getASTFunctionRanges(content: string, filepath: string): Promise<FunctionRange[]>;
 /**
  * Metadata about a code symbol within a chunk.
  * Stubbed for Phase 2 — always returns empty array in Phase 1.

+ 138 - 0
dist/ast.js

@@ -337,6 +337,144 @@ export async function getASTStatus() {
         languages,
     };
 }
+/**
+ * Capture names that denote a "function-like unit" — a chunk worth of
+ * code that should stay together. Shared across all languages because
+ * capture-name semantics (@class, @func, @method, @iface, etc.) are
+ * normalized in `LANGUAGE_QUERIES`. Captures with names not in this
+ * set (e.g. `import`) are ignored — they belong to the inter-range
+ * gaps (char-chunked) instead.
+ *
+ * Language-agnostic by design so that `getASTFunctionRanges` works for
+ * any current-or-future `SupportedLanguage` without requiring a
+ * per-language table edit.
+ */
+const FUNCTION_CAPTURE_NAMES = new Set([
+    "export", // TS/JS: export_statement wrapping a decl — preserves outer start
+    "class", // TS/JS/Py/Java/Kotlin/etc.
+    "iface", // TS: interface_declaration; Java: interface / annotation_type_declaration
+    "func", // function_declaration + arrow/function-expression lexical_declaration
+    "method", // method_definition / method_declaration / constructor_declaration
+    "type", // TS: type_alias_declaration; Go: type_declaration; Rust: type_item; Kotlin: type_alias
+    "enum", // TS/Rust/Java: enum declarations
+    "decorated", // Python: decorated_definition — preserves decorators
+    "struct", // Rust
+    "impl", // Rust
+    "trait", // Rust
+    "mod", // Rust
+]);
+/**
+ * Try to pull a human-readable name out of an AST node. Best-effort —
+ * returns `undefined` when the node shape doesn't expose a simple name
+ * child. Used for debugging / display and not for correctness.
+ */
+function extractNodeName(node) {
+    // Common shape: `(function_declaration name: (identifier))` etc.
+    const nameChild = node.childForFieldName?.("name");
+    if (nameChild && nameChild.text)
+        return nameChild.text;
+    // TS lexical_declaration: `const foo = () => ...` — first declarator's identifier.
+    const declarator = node.namedChildren?.find(c => c?.type === "variable_declarator");
+    if (declarator) {
+        const id = declarator.childForFieldName?.("name");
+        if (id && id.text)
+            return id.text;
+    }
+    // export_statement / decorated_definition — recurse into the wrapped decl.
+    const inner = node.namedChildren?.find(c => c != null && (c.type === "class_declaration" ||
+        c.type === "function_declaration" ||
+        c.type === "interface_declaration" ||
+        c.type === "type_alias_declaration" ||
+        c.type === "enum_declaration" ||
+        c.type === "lexical_declaration" ||
+        c.type === "function_definition" ||
+        c.type === "class_definition"));
+    if (inner)
+        return extractNodeName(inner);
+    return undefined;
+}
+/**
+ * Deduplicate overlapping ranges produced by the same AST pass.
+ *
+ * Tree-sitter emits multiple captures for the same region — e.g. an
+ * `export class Foo {}` matches both `export` and `class`. We want ONE
+ * range per region, preferring the outermost (earliest startIndex, largest
+ * endIndex). When two captures start at the same position we keep the
+ * one with the larger end (typically the wrapper — export/decorated).
+ *
+ * After this pass no two ranges overlap (strictly: for any a, b either
+ * a.endIndex <= b.startIndex or b.endIndex <= a.startIndex).
+ */
+function dedupeFunctionRanges(ranges) {
+    if (ranges.length === 0)
+        return ranges;
+    const sorted = [...ranges].sort((a, b) => {
+        if (a.startIndex !== b.startIndex)
+            return a.startIndex - b.startIndex;
+        return b.endIndex - a.endIndex; // larger wrapper wins at same start
+    });
+    const result = [];
+    for (const r of sorted) {
+        const last = result[result.length - 1];
+        if (last && r.startIndex < last.endIndex) {
+            // r is contained in or overlaps last — last is the outer/earlier range; drop r.
+            continue;
+        }
+        result.push(r);
+    }
+    return result;
+}
+/**
+ * Parse a source file and return byte-offset ranges for every top-level
+ * code unit that should be its own chunk under the `"function"` chunk
+ * strategy.
+ *
+ * Returns an empty array for unsupported languages, parse failures, or
+ * grammar loading failures. Never throws. Reuses the parser/grammar/
+ * query caches already populated by `getASTBreakPoints`.
+ *
+ * @param content - The file content to parse.
+ * @param filepath - The file path (used for language detection).
+ * @returns Array of non-overlapping FunctionRange objects, sorted by startIndex.
+ */
+export async function getASTFunctionRanges(content, filepath) {
+    const language = detectLanguage(filepath);
+    if (!language)
+        return [];
+    try {
+        await ensureInit();
+        const grammar = await loadGrammar(language);
+        if (!grammar)
+            return [];
+        const parser = new ParserClass();
+        parser.setLanguage(grammar);
+        const tree = parser.parse(content);
+        if (!tree) {
+            parser.delete();
+            return [];
+        }
+        const query = getQuery(language, grammar);
+        const captures = query.captures(tree.rootNode);
+        const ranges = [];
+        for (const cap of captures) {
+            if (!FUNCTION_CAPTURE_NAMES.has(cap.name))
+                continue;
+            ranges.push({
+                startIndex: cap.node.startIndex,
+                endIndex: cap.node.endIndex,
+                type: `ast:${cap.name}`,
+                name: extractNodeName(cap.node),
+            });
+        }
+        tree.delete();
+        parser.delete();
+        return dedupeFunctionRanges(ranges);
+    }
+    catch (err) {
+        console.warn(`[qmd] AST function-range extraction failed for ${filepath}, returning empty: ${err instanceof Error ? err.message : err}`);
+        return [];
+    }
+}
 /**
  * Extract symbol metadata for code within a byte range.
  * Stubbed for Phase 2 — returns empty array.

+ 2 - 2
dist/cli/qmd.js

@@ -1415,9 +1415,9 @@ function parseChunkStrategy(value) {
     if (value === undefined)
         return undefined;
     const s = String(value);
-    if (s === "auto" || s === "regex")
+    if (s === "auto" || s === "regex" || s === "function")
         return s;
-    throw new Error(`--chunk-strategy must be "auto" or "regex" (got "${s}")`);
+    throw new Error(`--chunk-strategy must be "auto", "regex", or "function" (got "${s}")`);
 }
 async function vectorIndex(model = DEFAULT_EMBED_MODEL_URI, force = false, batchOptions) {
     const storeInstance = getStore();

+ 16 - 0
dist/collections.d.ts

@@ -4,6 +4,7 @@
  * This module manages the YAML-based collection configuration at ~/.config/qmd/index.yml.
  * Collections define which directories to index and their associated contexts.
  */
+import type { ChunkStrategy } from "./store.js";
 /**
  * Context definitions for a collection
  * Key is path prefix (e.g., "/", "/2024", "/Board of Directors")
@@ -20,6 +21,21 @@ export interface Collection {
     context?: ContextMap;
     update?: string;
     includeByDefault?: boolean;
+    /**
+     * Chunking strategy for this collection (Phase 2 — i-bud0h8vu). When
+     * unset, qmd falls back to the global CLI `--chunk-strategy` flag.
+     *
+     *   - "auto"     — char-based chunks with AST break points as hints.
+     *   - "regex"    — char-based chunks without AST hints (legacy).
+     *   - "function" — one chunk per AST function/class/method range for
+     *                  supported code files. Opt-in per collection; files
+     *                  with zero detected ranges fall back to "auto".
+     *
+     * Changing this value requires a per-collection force-reindex
+     * (`qmd update --force <collection>`). The `content_hash`-keyed rows
+     * replace in-place, so other collections are unaffected.
+     */
+    chunkStrategy?: ChunkStrategy;
 }
 /**
  * Model configuration for embedding, reranking, and generation

+ 12 - 3
dist/store.d.ts

@@ -78,7 +78,7 @@ export declare function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]
  * @returns The best position to cut at
  */
 export declare function findBestCutoff(breakPoints: BreakPoint[], targetCharPos: number, windowChars?: number, decayFactor?: number, codeFences?: CodeFenceRegion[]): number;
-export type ChunkStrategy = "auto" | "regex";
+export type ChunkStrategy = "auto" | "regex" | "function";
 /**
  * Merge two sets of break points (e.g. regex + AST), keeping the highest
  * score at each position. Result is sorted by position.
@@ -540,8 +540,12 @@ export declare function chunkDocument(content: string, maxChars?: number, overla
  * break points for supported code files, merges with regex break points,
  * and delegates to the shared chunk algorithm.
  *
- * Falls back to regex-only when strategy is "regex", filepath is absent,
- * or language is unsupported.
+ * Strategies:
+ *   - "regex"    (default) — char-based chunking with regex break points only.
+ *   - "auto"     — regex break points merged with AST break points (soft hints).
+ *   - "function" — one chunk per AST function range (Phase 2); inter-range
+ *                  gaps (imports, top-level code) are char-chunked with AST
+ *                  hints. Falls back to "auto" when zero ranges are detected.
  */
 export declare function chunkDocumentAsync(content: string, maxChars?: number, overlapChars?: number, windowChars?: number, filepath?: string, chunkStrategy?: ChunkStrategy): Promise<{
     text: string;
@@ -685,6 +689,11 @@ export declare function sanitizeFTS5Term(term: string): string;
 /**
  * Validate that a vec/hyde query doesn't use lex-only syntax.
  * Returns error message if invalid, null if valid.
+ *
+ * Negation is detected ONLY when `-` is preceded by whitespace or sits at
+ * the start of the query. Hyphens inside words (e.g. `auto-archived`,
+ * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation
+ * semantics in natural English and must pass through unchanged.
  */
 export declare function validateSemanticQuery(query: string): string | null;
 export declare function validateLexQuery(query: string): string | null;

+ 133 - 6
dist/store.js

@@ -956,8 +956,13 @@ function resolveEmbedOptions(options) {
     };
 }
 function getPendingEmbeddingDocs(db) {
+    // `MIN(d.collection)` deterministically picks one collection per hash when
+    // the same content is indexed in multiple collections (SQLite tie-breaks
+    // alphabetically). The identical bytes produce identical chunks regardless
+    // of which collection wins; the chunkStrategy lookup still resolves via
+    // that collection's YAML config. See Phase 2 design notes (i-bud0h8vu).
     return db.prepare(`
-    SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
+    SELECT d.hash, MIN(d.path) as path, MIN(d.collection) as collection, length(CAST(c.doc AS BLOB)) as bytes
     FROM documents d
     JOIN content c ON d.hash = c.hash
     LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
@@ -1023,6 +1028,24 @@ export async function generateEmbeddings(store, options) {
     const totalBytes = docsToEmbed.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
     const totalDocs = docsToEmbed.length;
     const startTime = Date.now();
+    // Per-collection chunkStrategy lookup (Phase 2 — i-bud0h8vu). YAML
+    // `chunkStrategy` on a collection wins over `options.chunkStrategy`
+    // (global CLI flag); falls back to the global option, then to
+    // chunkDocumentByTokens' own "regex" default when neither is set.
+    // Opt-in per collection — collections without the field are untouched.
+    const collectionStrategies = new Map();
+    try {
+        const { listCollections: listYamlCollections } = await import("./collections.js");
+        for (const c of listYamlCollections()) {
+            if (c.chunkStrategy)
+                collectionStrategies.set(c.name, c.chunkStrategy);
+        }
+    }
+    catch {
+        // If YAML config is missing/unreadable, fall back silently to the
+        // global strategy — no collection overrides. Keeps SDK/inline
+        // callers that never touch ~/.config/qmd working.
+    }
     // Use store's LlamaCpp or global singleton, wrapped in a session
     const llm = getLlm(store);
     const embedModelUri = llm.embedModelName;
@@ -1048,7 +1071,9 @@ export async function generateEmbeddings(store, options) {
                 if (!doc.body.trim())
                     continue;
                 const title = extractTitle(doc.body, doc.path);
-                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, options?.chunkStrategy, session.signal);
+                const perCollectionStrategy = collectionStrategies.get(doc.collection);
+                const chunkStrategy = perCollectionStrategy ?? options?.chunkStrategy;
+                const chunks = await chunkDocumentByTokens(doc.body, undefined, undefined, undefined, doc.path, chunkStrategy, session.signal);
                 for (let seq = 0; seq < chunks.length; seq++) {
                     batchChunks.push({
                         hash: doc.hash,
@@ -1557,12 +1582,30 @@ export function chunkDocument(content, maxChars = CHUNK_SIZE_CHARS, overlapChars
  * break points for supported code files, merges with regex break points,
  * and delegates to the shared chunk algorithm.
  *
- * Falls back to regex-only when strategy is "regex", filepath is absent,
- * or language is unsupported.
+ * Strategies:
+ *   - "regex"    (default) — char-based chunking with regex break points only.
+ *   - "auto"     — regex break points merged with AST break points (soft hints).
+ *   - "function" — one chunk per AST function range (Phase 2); inter-range
+ *                  gaps (imports, top-level code) are char-chunked with AST
+ *                  hints. Falls back to "auto" when zero ranges are detected.
  */
 export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, overlapChars = CHUNK_OVERLAP_CHARS, windowChars = CHUNK_WINDOW_CHARS, filepath, chunkStrategy = "regex") {
     const regexPoints = scanBreakPoints(content);
     const codeFences = findCodeFences(content);
+    // "function" strategy: delegate to the function-level chunker. If no
+    // ranges are detected (markdown, unsupported lang, parse failure), fall
+    // back to "auto" behavior (AST-break-point-assisted char chunking).
+    if (chunkStrategy === "function" && filepath) {
+        const { getASTFunctionRanges, getASTBreakPoints } = await import("./ast.js");
+        const ranges = await getASTFunctionRanges(content, filepath);
+        if (ranges.length > 0) {
+            return chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars);
+        }
+        // Zero ranges — fall through to auto behavior so break points still help.
+        const astPoints = await getASTBreakPoints(content, filepath);
+        const merged = astPoints.length > 0 ? mergeBreakPoints(regexPoints, astPoints) : regexPoints;
+        return chunkDocumentWithBreakPoints(content, merged, codeFences, maxChars, overlapChars, windowChars);
+    }
     let breakPoints = regexPoints;
     if (chunkStrategy === "auto" && filepath) {
         const { getASTBreakPoints } = await import("./ast.js");
@@ -1573,6 +1616,85 @@ export async function chunkDocumentAsync(content, maxChars = CHUNK_SIZE_CHARS, o
     }
     return chunkDocumentWithBreakPoints(content, breakPoints, codeFences, maxChars, overlapChars, windowChars);
 }
+/**
+ * Produce one chunk per AST function range, plus char-chunks for the gaps
+ * between ranges (imports, top-level code). Ranges that exceed `maxChars`
+ * are further split using the existing char-based algorithm so we never
+ * emit a single oversized chunk.
+ *
+ * Preconditions: `ranges` is non-empty, sorted by `startIndex`, and the
+ * ranges are non-overlapping (as produced by `getASTFunctionRanges`).
+ */
+function chunkByFunctionRanges(content, ranges, regexPoints, codeFences, maxChars, overlapChars, windowChars) {
+    const out = [];
+    let cursor = 0;
+    const emitGap = (start, end) => {
+        if (start >= end)
+            return;
+        const gap = content.slice(start, end);
+        // Whitespace-only gaps are dropped — they carry no embeddable signal.
+        if (!gap.trim())
+            return;
+        if (gap.length <= maxChars) {
+            out.push({ text: gap, pos: start });
+            return;
+        }
+        // Reuse char-based algorithm for oversized gaps. Restrict break
+        // points and code fences to the gap window and rebase positions so
+        // chunkDocumentWithBreakPoints operates on a standalone slice.
+        const subPoints = regexPoints
+            .filter(p => p.pos >= start && p.pos < end)
+            .map(p => ({ ...p, pos: p.pos - start }));
+        const subFences = codeFences
+            .filter(f => f.end > start && f.start < end)
+            .map(f => ({
+            start: Math.max(0, f.start - start),
+            end: Math.max(0, Math.min(end, f.end) - start),
+        }));
+        const sub = chunkDocumentWithBreakPoints(gap, subPoints, subFences, maxChars, overlapChars, windowChars);
+        for (const c of sub)
+            out.push({ text: c.text, pos: start + c.pos });
+    };
+    for (const range of ranges) {
+        // Emit any leading / inter-range gap (imports, top-level code).
+        emitGap(cursor, range.startIndex);
+        const body = content.slice(range.startIndex, range.endIndex);
+        if (body.length === 0) {
+            cursor = range.endIndex;
+            continue;
+        }
+        if (body.length <= maxChars) {
+            out.push({ text: body, pos: range.startIndex });
+        }
+        else {
+            // Oversized function/class — split with char algorithm so we stay
+            // under the embed token budget. Break points inside the range are
+            // reused to keep splits at syntactically-sensible positions.
+            const subPoints = regexPoints
+                .filter(p => p.pos >= range.startIndex && p.pos < range.endIndex)
+                .map(p => ({ ...p, pos: p.pos - range.startIndex }));
+            const subFences = codeFences
+                .filter(f => f.end > range.startIndex && f.start < range.endIndex)
+                .map(f => ({
+                start: Math.max(0, f.start - range.startIndex),
+                end: Math.max(0, Math.min(range.endIndex, f.end) - range.startIndex),
+            }));
+            const sub = chunkDocumentWithBreakPoints(body, subPoints, subFences, maxChars, overlapChars, windowChars);
+            for (const c of sub)
+                out.push({ text: c.text, pos: range.startIndex + c.pos });
+        }
+        cursor = range.endIndex;
+    }
+    // Trailing gap after the last range.
+    emitGap(cursor, content.length);
+    // Edge case: content consisted entirely of whitespace-only gaps (zero
+    // emitted chunks). Preserve the invariant that non-empty content yields
+    // at least one chunk.
+    if (out.length === 0 && content.length > 0) {
+        return [{ text: content, pos: 0 }];
+    }
+    return out;
+}
 /**
  * Chunk a document by actual token count using the LLM tokenizer.
  * More accurate than character-based chunking but requires async.
@@ -2197,10 +2319,15 @@ function buildFTS5Query(query) {
 /**
  * Validate that a vec/hyde query doesn't use lex-only syntax.
  * Returns error message if invalid, null if valid.
+ *
+ * Negation is detected ONLY when `-` is preceded by whitespace or sits at
+ * the start of the query. Hyphens inside words (e.g. `auto-archived`,
+ * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation
+ * semantics in natural English and must pass through unchanged.
  */
 export function validateSemanticQuery(query) {
-    // Check for negation syntax
-    if (/-\w/.test(query) || /-"/.test(query)) {
+    // `-term` or `-"phrase"` only counts as negation at SOS or after whitespace.
+    if (/(?:^|\s)-\w/.test(query) || /(?:^|\s)-"/.test(query)) {
         return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
     }
     return null;

+ 1 - 1
package.json

@@ -1,6 +1,6 @@
 {
   "name": "@oivo/qmd",
-  "version": "2.1.0",
+  "version": "2.1.1-oivo.0",
   "description": "Query Markup Documents - On-device hybrid search for markdown files with BM25, vector search, and LLM reranking",
   "type": "module",
   "main": "dist/index.js",

+ 7 - 2
src/store.ts

@@ -3049,10 +3049,15 @@ function buildFTS5Query(query: string): string | null {
 /**
  * Validate that a vec/hyde query doesn't use lex-only syntax.
  * Returns error message if invalid, null if valid.
+ *
+ * Negation is detected ONLY when `-` is preceded by whitespace or sits at
+ * the start of the query. Hyphens inside words (e.g. `auto-archived`,
+ * `pre-commit`, `multi-session`, `state-of-the-art`) carry no negation
+ * semantics in natural English and must pass through unchanged.
  */
 export function validateSemanticQuery(query: string): string | null {
-  // Check for negation syntax
-  if (/-\w/.test(query) || /-"/.test(query)) {
+  // `-term` or `-"phrase"` only counts as negation at SOS or after whitespace.
+  if (/(?:^|\s)-\w/.test(query) || /(?:^|\s)-"/.test(query)) {
     return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
   }
   return null;

+ 23 - 0
test/structured-search.test.ts

@@ -366,6 +366,29 @@ describe("lex query syntax", () => {
       expect(validateSemanticQuery('-"exact phrase"')).toContain("Negation");
     });
 
+    test("rejects mid-query quoted negation", () => {
+      expect(validateSemanticQuery('foo -"exact phrase"')).toContain("Negation");
+    });
+
+    test("accepts hyphenated words (no negation)", () => {
+      // Regression for the hyphen-parsing UX bug: hyphens inside words must
+      // NOT be read as negation operators. See `validateSemanticQuery` doc.
+      expect(validateSemanticQuery("when does a completed session get auto-archived")).toBeNull();
+      expect(validateSemanticQuery("pre-commit hook")).toBeNull();
+      expect(validateSemanticQuery("multi-session coordination")).toBeNull();
+      expect(validateSemanticQuery("cross-machine file ops")).toBeNull();
+      expect(validateSemanticQuery("long-running process")).toBeNull();
+      expect(validateSemanticQuery("well-known endpoint")).toBeNull();
+      expect(validateSemanticQuery("out-of-scope edits")).toBeNull();
+      expect(validateSemanticQuery("state-of-the-art model")).toBeNull();
+    });
+
+    test("accepts hyphenated word at start of query", () => {
+      // Leading hyphenated word starts with a letter, not `-`, so the SOS
+      // rule does not fire — confirm via an explicit start-of-string case.
+      expect(validateSemanticQuery("auto-archived session")).toBeNull();
+    });
+
 
     test("accepts hyde-style hypothetical answers", () => {
       expect(validateSemanticQuery(