Forráskód Böngészése

feat(lex): add query syntax for exact phrases, negation, and OR

Lex queries now support:
- "exact phrase" - quoted exact matching (no prefix)
- -term or -"phrase" - exclude from results
- term1 OR term2 - match either term

Semantic queries (vec/hyde) validate and reject these operators
with helpful error messages.

Examples:
  performance -sports     → matches "performance" excluding "sports"
  "machine learning"      → exact phrase match
  auth OR authentication  → matches either term
Tobi Lütke 3 hónapja
szülő
commit
efb39616e6
3 módosított fájl, 191 hozzáadás és 9 törlés
  1. 15 3
      skills/qmd/SKILL.md
  2. 143 6
      src/store.ts
  3. 33 0
      test/structured-search.test.ts

+ 15 - 3
skills/qmd/SKILL.md

@@ -42,9 +42,10 @@ Local search engine for markdown content.
 
 **lex (keyword)**
 - 2-5 terms, no filler words
-- Include synonyms: `"auth authentication login"`
-- Use exact names: `"PostgreSQL connection pool"`
-- Code identifiers work: `"handleError async"`
+- Include synonyms: `auth OR authentication`
+- Exact phrase: `"connection pool"` (quoted)
+- Exclude terms: `performance -sports` (minus prefix)
+- Code identifiers work: `handleError async`
 
 **vec (semantic)**
 - Full natural language question
@@ -67,6 +68,17 @@ Local search engine for markdown content.
 
 First query gets 2x weight in fusion — put your best guess first.
 
+### Lex Query Syntax
+
+| Syntax | Meaning | Example |
+|--------|---------|---------|
+| `term` | Prefix match | `perf` matches "performance" |
+| `"phrase"` | Exact phrase | `"rate limiter"` |
+| `-term` | Exclude | `performance -sports` |
+| `OR` | Either term | `auth OR authentication` |
+
+Note: `-term` and `OR` only work in lex queries, not vec/hyde.
+
 ### Collection Filtering
 
 ```json

+ 143 - 6
src/store.ts

@@ -1987,13 +1987,140 @@ function sanitizeFTS5Term(term: string): string {
   return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
 }
 
/**
 * Parse lex query syntax into an FTS5 query string.
 *
 * Supported syntax:
 * - Quoted phrases: "exact phrase" → "exact phrase" (exact match, no prefix)
 * - Negation: -term or -"phrase" → appended with the FTS5 NOT operator
 * - OR: term1 OR term2 (keyword is case-insensitive) → (term1 OR term2)
 * - Plain terms: term → "term"* (prefix match)
 *
 * Whitespace between positive terms means AND. `OR` binds tighter than that
 * implicit AND: it attaches the next positive term to the one before it, so
 * `cache redis OR memcached` means `cache AND (redis OR memcached)`.
 * A leading or trailing `OR` has nothing to join and is ignored, as are the
 * bare words `AND` (already implicit) and `NOT` (use the `-` prefix instead).
 *
 * FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not
 * term2". A query made only of negated terms therefore cannot be expressed
 * and yields null.
 *
 * Examples:
 *   performance -sports       → "performance"* NOT "sports"*
 *   "machine learning"        → "machine learning"
 *   auth OR authentication    → ("auth"* OR "authentication"*)
 *   cache redis OR memcached  → "cache"* AND ("redis"* OR "memcached"*)
 *
 * @param query Raw lex query as typed by the user.
 * @returns The FTS5 MATCH expression, or null when the query contains no
 *          usable positive term.
 */
function buildFTS5Query(query: string): string | null {
  // Hoisted so the scanner loop does not re-create them per character.
  const whitespace = /\s/;
  const termEnd = /[\s"]/;

  // Each inner array holds OR-ed alternatives; the groups are AND-ed together.
  const groups: string[][] = [];
  const negative: string[] = [];
  // True when an OR keyword was seen and the next positive term should join
  // the previous group instead of opening a new one.
  let joinWithOr = false;

  const addTerm = (fts: string, negated: boolean): void => {
    if (negated) {
      negative.push(fts);
    } else {
      const last = groups[groups.length - 1];
      if (joinWithOr && last !== undefined) {
        last.push(fts);
      } else {
        groups.push([fts]);
      }
    }
    joinWithOr = false;
  };

  const s = query.trim();
  let i = 0;

  while (i < s.length) {
    while (i < s.length && whitespace.test(s.charAt(i))) i++;
    if (i >= s.length) break;

    // A leading '-' marks the token that follows as excluded.
    const negated = s.charAt(i) === '-';
    if (negated) i++;

    if (s.charAt(i) === '"') {
      // Quoted phrase; an unterminated quote runs to the end of the input.
      const close = s.indexOf('"', i + 1);
      const end = close === -1 ? s.length : close;
      const words = s
        .slice(i + 1, end)
        .split(/\s+/)
        .map(w => sanitizeFTS5Term(w))
        .filter(w => w.length > 0);
      i = end + 1; // step past the closing quote
      if (words.length > 0) {
        addTerm(`"${words.join(' ')}"`, negated); // exact phrase, no prefix match
      }
      continue;
    }

    // Plain term: runs until whitespace or the start of a quoted phrase.
    const start = i;
    while (i < s.length && !termEnd.test(s.charAt(i))) i++;
    const raw = s.slice(start, i);
    const keyword = raw.toUpperCase();

    if (keyword === 'OR') {
      // Only meaningful when there is a preceding positive term to join to.
      joinWithOr = groups.length > 0;
      continue;
    }
    if (keyword === 'AND' || keyword === 'NOT') {
      // AND is implicit; exclusion is spelled with the '-' prefix.
      continue;
    }

    const sanitized = sanitizeFTS5Term(raw);
    if (sanitized) {
      addTerm(`"${sanitized}"*`, negated); // prefix match
    }
  }

  // Nothing positive to match against (empty query, or negations only).
  if (groups.length === 0) return null;

  // Parenthesize OR groups so they stay together once AND / NOT are added
  // around them.
  const clauses = groups.map(group => {
    const joined = group.join(' OR ');
    return group.length > 1 ? `(${joined})` : joined;
  });

  // Chain every exclusion onto the positive expression: `positive NOT a NOT b`.
  return [clauses.join(' AND '), ...negative].join(' NOT ');
}
+
/**
 * Validate that a vec/hyde query doesn't use lex-only syntax.
 *
 * Semantic queries are natural language, so the checks are deliberately
 * narrow to avoid rejecting ordinary prose:
 * - Negation is flagged only when `-` starts a token (start of input or after
 *   whitespace) and is directly followed by a letter, underscore, or quote.
 *   Hyphenated words ("real-time") and negative numbers ("-1") pass.
 * - `OR` is flagged only in uppercase; the lowercase English word "or" passes.
 * - Quoted text is accepted as-is.
 *
 * @param query The vec/hyde query text.
 * @returns An error message if the query uses lex-only operators, otherwise null.
 */
export function validateSemanticQuery(query: string): string | null {
  // Lex exclusion syntax: -term or -"phrase" at the start of a token.
  if (/(?:^|\s)-["_\p{L}]/u.test(query)) {
    return 'Negation (-term) is not supported in vec/hyde queries. Use lex for exclusions.';
  }
  // Boolean OR keyword (semantic search doesn't support boolean logic).
  if (/\bOR\b/.test(query)) {
    return 'OR operator is not supported in vec/hyde queries. Use multiple lex queries or rephrase.';
  }
  return null;
}
 
 export function searchFTS(db: Database, query: string, limit: number = 20, collectionName?: string): SearchResult[] {
@@ -3116,6 +3243,16 @@ export async function structuredSearch(
 
   if (searches.length === 0) return [];
 
+  // Validate semantic queries don't use lex-only syntax
+  for (const search of searches) {
+    if (search.type === 'vec' || search.type === 'hyde') {
+      const error = validateSemanticQuery(search.query);
+      if (error) {
+        throw new Error(`Invalid ${search.type} query: ${error}`);
+      }
+    }
+  }
+
   const rankedLists: RankedResult[][] = [];
   const docidMap = new Map<string, string>(); // filepath -> docid
   const hasVectors = !!store.db.prepare(

+ 33 - 0
test/structured-search.test.ts

@@ -318,3 +318,36 @@ describe("structuredSearch", () => {
     }
   });
 });
+
+// =============================================================================
+// FTS Query Syntax Tests
+// =============================================================================
+
+describe("lex query syntax", () => {
+  // Note: buildFTS5Query is not exported, so it is not exercised directly here; this block only covers validateSemanticQuery
+
+  describe("validateSemanticQuery", () => {
+    // Load validateSemanticQuery from the store module under test
+    const { validateSemanticQuery } = require("../src/store.js");
+
+    test("accepts plain natural language", () => {
+      expect(validateSemanticQuery("how does error handling work")).toBeNull();
+      expect(validateSemanticQuery("what is the CAP theorem")).toBeNull();
+    });
+
+    test("rejects negation syntax", () => {
+      expect(validateSemanticQuery("performance -sports")).toContain("Negation");
+      expect(validateSemanticQuery('-"exact phrase"')).toContain("Negation");
+    });
+
+    test("rejects OR operator", () => {
+      expect(validateSemanticQuery("auth OR authentication")).toContain("OR");
+    });
+
+    test("accepts hyde-style hypothetical answers", () => {
+      expect(validateSemanticQuery(
+        "The CAP theorem states that a distributed system cannot simultaneously provide consistency, availability, and partition tolerance."
+      )).toBeNull();
+    });
+  });
+});