瀏覽代碼

fix: handle hyphenated tokens in FTS5 lex queries

Hyphenated terms like multi-agent, DEC-0054, gpt-4 were being stripped
of hyphens and concatenated (e.g., "multiagent") which missed matches.
Now they're split into FTS5 phrase queries ("multi agent") so the porter
tokenizer matches them correctly.
Ryan 2 月之前
父節點
當前提交
7b9bd01226
共有 2 個文件被更改,包括 96 次插入 和 9 次删除
  1. 47 7
      src/store.ts
  2. 49 2
      test/structured-search.test.ts

+ 47 - 7
src/store.ts

@@ -2654,20 +2654,46 @@ function sanitizeFTS5Term(term: string): string {
   return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
 }
 
+/**
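+ * Check if a token is a hyphenated compound word (e.g., multi-agent, DEC-0054, gpt-4).
+ * Returns true if the token starts with a letter or digit and contains at least one
+ * hyphen immediately followed by a letter or digit; the remaining characters may be
+ * letters, digits, apostrophes, or further hyphens (so a trailing hyphen is accepted).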
+ */
+function isHyphenatedToken(token: string): boolean {
+  return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token);
+}
+
+/**
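+ * Sanitize a hyphenated term into the body of an FTS5 phrase by splitting on hyphens
+ * and sanitizing each part; parts that sanitize to an empty string are dropped. Returns
+ * the parts joined by spaces for use inside FTS5 quotes, e.g. multi-agent → multi agent,
+ * so the quoted phrase "multi agent" matches "multi-agent" under the porter tokenizer.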
+ */
+function sanitizeHyphenatedTerm(term: string): string {
+  return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
+}
+
 /**
  * Parse lex query syntax into FTS5 query.
  *
  * Supports:
  * - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
  * - Negation: -term or -"phrase" → uses FTS5 NOT operator
+ * - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases
  * - Plain terms: term → "term"* (prefix match)
  *
  * FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
  * So `-term` only works when there are also positive terms.
  *
+ * Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent`
+ * (where `-` is between word characters) is treated as a hyphenated phrase.
+ * When a leading `-` is followed by what looks like a hyphenated compound word
+ * (e.g., `-multi-agent`), the entire token is treated as a negated phrase.
+ *
  * Examples:
  *   performance -sports     → "performance"* NOT "sports"*
  *   "machine learning"      → "machine learning"
+ *   multi-agent memory      → "multi agent" AND "memory"*
+ *   DEC-0054               → "dec 0054"
+ *   performance -multi-agent → "performance"* NOT "multi agent"
  */
 function buildFTS5Query(query: string): string | null {
   const positive: string[] = [];
@@ -2709,13 +2735,27 @@ function buildFTS5Query(query: string): string | null {
       while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
       const term = s.slice(start, i);
 
-      const sanitized = sanitizeFTS5Term(term);
-      if (sanitized) {
-        const ftsTerm = `"${sanitized}"*`;  // Prefix match
-        if (negated) {
-          negative.push(ftsTerm);
-        } else {
-          positive.push(ftsTerm);
+      // Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4
+      // These get split into phrase queries so FTS5 porter tokenizer matches them.
+      if (isHyphenatedToken(term)) {
+        const sanitized = sanitizeHyphenatedTerm(term);
+        if (sanitized) {
+          const ftsPhrase = `"${sanitized}"`;  // Phrase match (no prefix)
+          if (negated) {
+            negative.push(ftsPhrase);
+          } else {
+            positive.push(ftsPhrase);
+          }
+        }
+      } else {
+        const sanitized = sanitizeFTS5Term(term);
+        if (sanitized) {
+          const ftsTerm = `"${sanitized}"*`;  // Prefix match
+          if (negated) {
+            negative.push(ftsTerm);
+          } else {
+            positive.push(ftsTerm);
+          }
         }
       }
     }

+ 49 - 2
test/structured-search.test.ts

@@ -399,6 +399,14 @@ describe("buildFTS5Query (lex parser)", () => {
     return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
   }
 
+  function isHyphenatedToken(token: string): boolean {
+    return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token);
+  }
+
+  function sanitizeHyphenatedTerm(term: string): string {
+    return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
+  }
+
   function buildFTS5Query(query: string): string | null {
     const positive: string[] = [];
     const negative: string[] = [];
@@ -424,8 +432,14 @@ describe("buildFTS5Query (lex parser)", () => {
         const start = i;
         while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
         const term = s.slice(start, i);
-        const sanitized = sanitizeFTS5Term(term);
-        if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`);
+
+        if (isHyphenatedToken(term)) {
+          const sanitized = sanitizeHyphenatedTerm(term);
+          if (sanitized) (negated ? negative : positive).push(`"${sanitized}"`);
+        } else {
+          const sanitized = sanitizeFTS5Term(term);
+          if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`);
+        }
       }
     }
 
@@ -488,4 +502,37 @@ describe("buildFTS5Query (lex parser)", () => {
   test("special chars in terms stripped", () => {
     expect(buildFTS5Query("hello!world")).toBe('"helloworld"*');
   });
+
+  // Hyphenated token tests
+  test("hyphenated term → phrase match", () => {
+    expect(buildFTS5Query("multi-agent")).toBe('"multi agent"');
+  });
+
+  test("hyphenated identifier → phrase match", () => {
+    expect(buildFTS5Query("DEC-0054")).toBe('"dec 0054"');
+  });
+
+  test("hyphenated model name → phrase match", () => {
+    expect(buildFTS5Query("gpt-4")).toBe('"gpt 4"');
+  });
+
+  test("multi-hyphen term → phrase match", () => {
+    expect(buildFTS5Query("foo-bar-baz")).toBe('"foo bar baz"');
+  });
+
+  test("hyphenated term mixed with plain terms", () => {
+    expect(buildFTS5Query("multi-agent memory")).toBe('"multi agent" AND "memory"*');
+  });
+
+  test("negation still works alongside hyphenated terms", () => {
+    expect(buildFTS5Query("multi-agent -sports")).toBe('"multi agent" NOT "sports"*');
+  });
+
+  test("negated hyphenated term", () => {
+    expect(buildFTS5Query("performance -multi-agent")).toBe('"performance"* NOT "multi agent"');
+  });
+
+  test("plain negation still works (not confused with hyphen)", () => {
+    expect(buildFTS5Query("performance -sports")).toBe('"performance"* NOT "sports"*');
+  });
 });