|
@@ -2654,20 +2654,46 @@ function sanitizeFTS5Term(term: string): string {
|
|
|
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
|
|
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+/**
|
|
|
|
|
+ * Check if a token is a hyphenated compound word (e.g., multi-agent, DEC-0054, gpt-4).
|
|
|
|
|
+ * Returns true if the token contains internal hyphens between word/digit characters.
+ *
+ * More precisely, the whole token must start with a letter or digit, contain at
+ * least one hyphen that is immediately followed by a letter or digit, and otherwise
+ * consist only of letters, digits, apostrophes and hyphens. Any other character
+ * anywhere in the token (for example a trailing comma or period) makes this return
+ * false. A token with a leading hyphen never matches, whereas a trailing hyphen
+ * (foo-bar-) or a doubled hyphen (foo--bar) still matches.
|
|
|
|
|
+ */
|
|
|
|
|
+function isHyphenatedToken(token: string): boolean {
|
|
|
|
|
+ return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token);
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+/**
|
|
|
|
|
+ * Sanitize a hyphenated term into an FTS5 phrase by splitting on hyphens
|
|
|
|
|
+ * and sanitizing each part. Returns the parts joined by spaces for use
|
|
|
|
|
+ * inside FTS5 quotes: "multi agent" matches "multi-agent" in porter tokenizer.
+ *
+ * Each part is passed through sanitizeFTS5Term. Parts that come back as an empty
+ * string (for example the gap produced by a doubled hyphen) are dropped, so the
+ * result never contains empty segments; if every part is dropped the result is
+ * the empty string. The surrounding double quotes are NOT added here; the caller
+ * is expected to wrap the returned text itself.
|
|
|
|
|
+ */
|
|
|
|
|
+function sanitizeHyphenatedTerm(term: string): string {
|
|
|
|
|
+ return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* Parse lex query syntax into FTS5 query.
|
|
* Parse lex query syntax into FTS5 query.
|
|
|
*
|
|
*
|
|
|
* Supports:
|
|
* Supports:
|
|
|
* - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
|
|
* - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
|
|
|
* - Negation: -term or -"phrase" → uses FTS5 NOT operator
|
|
* - Negation: -term or -"phrase" → uses FTS5 NOT operator
|
|
|
|
|
+ * - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases
|
|
|
* - Plain terms: term → "term"* (prefix match)
|
|
* - Plain terms: term → "term"* (prefix match)
|
|
|
*
|
|
*
|
|
|
* FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
|
|
* FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
|
|
|
* So `-term` only works when there are also positive terms.
|
|
* So `-term` only works when there are also positive terms.
|
|
|
*
|
|
*
|
|
|
|
|
+ * Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent`
|
|
|
|
|
+ * (where `-` is between word characters) is treated as a hyphenated phrase.
|
|
|
|
|
+ * When a leading `-` is followed by what looks like a hyphenated compound word
|
|
|
|
|
+ * (e.g., `-multi-agent`), the entire token is treated as a negated phrase.
|
|
|
|
|
+ *
|
|
|
* Examples:
|
|
* Examples:
|
|
|
* performance -sports → "performance"* NOT "sports"*
|
|
* performance -sports → "performance"* NOT "sports"*
|
|
|
* "machine learning" → "machine learning"
|
|
* "machine learning" → "machine learning"
|
|
|
|
|
+ * multi-agent memory → "multi agent" AND "memory"*
|
|
|
|
|
+ * DEC-0054 → "dec 0054"
|
|
|
|
|
+ * performance -multi-agent → "performance"* NOT "multi agent"
|
|
|
*/
|
|
*/
|
|
|
function buildFTS5Query(query: string): string | null {
|
|
function buildFTS5Query(query: string): string | null {
|
|
|
const positive: string[] = [];
|
|
const positive: string[] = [];
|
|
@@ -2709,13 +2735,27 @@ function buildFTS5Query(query: string): string | null {
|
|
|
while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
|
|
while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
|
|
|
const term = s.slice(start, i);
|
|
const term = s.slice(start, i);
|
|
|
|
|
|
|
|
- const sanitized = sanitizeFTS5Term(term);
|
|
|
|
|
- if (sanitized) {
|
|
|
|
|
- const ftsTerm = `"${sanitized}"*`; // Prefix match
|
|
|
|
|
- if (negated) {
|
|
|
|
|
- negative.push(ftsTerm);
|
|
|
|
|
- } else {
|
|
|
|
|
- positive.push(ftsTerm);
|
|
|
|
|
|
|
+ // Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4
|
|
|
|
|
+ // These get split into phrase queries so FTS5 porter tokenizer matches them.
|
|
|
|
|
+ if (isHyphenatedToken(term)) {
|
|
|
|
|
+ const sanitized = sanitizeHyphenatedTerm(term);
|
|
|
|
|
+ if (sanitized) {
|
|
|
|
|
+ const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix)
|
|
|
|
|
+ if (negated) {
|
|
|
|
|
+ negative.push(ftsPhrase);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ positive.push(ftsPhrase);
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ } else {
|
|
|
|
|
+ const sanitized = sanitizeFTS5Term(term);
|
|
|
|
|
+ if (sanitized) {
|
|
|
|
|
+ const ftsTerm = `"${sanitized}"*`; // Prefix match
|
|
|
|
|
+ if (negated) {
|
|
|
|
|
+ negative.push(ftsTerm);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ positive.push(ftsTerm);
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|