Procházet zdrojové kódy

fix(store): preserve underscores in BM25 search terms (#404)

sanitizeFTS5Term stripped every character other than letters, numbers, and
apostrophes. That included underscores, so snake_case identifiers like
`my_variable` became `myvariable` and silently failed to match in BM25 search.

Add underscore to the preserved character set in the Unicode regex.
Export the function and add unit tests covering snake_case, contractions,
punctuation stripping, and unicode.

Fixes #305

Co-authored-by: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Matt Van Horn před 1 měsícem
rodič
revize
1ad3388132
Změněny 2 soubory: 41 přidání a 2 odebrání
  1. 2 2
      src/store.ts
  2. 39 0
      test/store.helpers.unit.test.ts

+ 2 - 2
src/store.ts

@@ -2762,8 +2762,8 @@ export function getTopLevelPathsWithoutContext(db: Database, collectionName: str
 // FTS Search
 // =============================================================================
 
-function sanitizeFTS5Term(term: string): string {
-  return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
+export function sanitizeFTS5Term(term: string): string {
+  return term.replace(/[^\p{L}\p{N}'_]/gu, '').toLowerCase();
 }
 
 /**

+ 39 - 0
test/store.helpers.unit.test.ts

@@ -16,6 +16,7 @@ import {
   isDocid,
   handelize,
   cleanupOrphanedVectors,
+  sanitizeFTS5Term,
 } from "../src/store";
 
 // =============================================================================
@@ -244,3 +245,41 @@ describe("handelize", () => {
     expect(isDocid("12345")).toBe(false);
   });
 });
+
+// =============================================================================
+// sanitizeFTS5Term Tests
+// =============================================================================
+
+describe("sanitizeFTS5Term", () => {
+  test("preserves underscores in snake_case identifiers", () => {
+    expect(sanitizeFTS5Term("my_variable")).toBe("my_variable");
+    expect(sanitizeFTS5Term("MAX_RETRIES")).toBe("max_retries");
+    expect(sanitizeFTS5Term("__init__")).toBe("__init__");
+  });
+
+  test("preserves alphanumeric characters", () => {
+    expect(sanitizeFTS5Term("hello123")).toBe("hello123");
+    expect(sanitizeFTS5Term("test")).toBe("test");
+  });
+
+  test("preserves apostrophes for contractions", () => {
+    expect(sanitizeFTS5Term("don't")).toBe("don't");
+    expect(sanitizeFTS5Term("it's")).toBe("it's");
+  });
+
+  test("strips other punctuation", () => {
+    expect(sanitizeFTS5Term("hello!")).toBe("hello");
+    expect(sanitizeFTS5Term("test@value")).toBe("testvalue");
+    expect(sanitizeFTS5Term("a.b")).toBe("ab");
+  });
+
+  test("lowercases output", () => {
+    expect(sanitizeFTS5Term("Hello")).toBe("hello");
+    expect(sanitizeFTS5Term("MY_VAR")).toBe("my_var");
+  });
+
+  test("handles unicode letters and numbers", () => {
+    expect(sanitizeFTS5Term("café")).toBe("café");
+    expect(sanitizeFTS5Term("日本語")).toBe("日本語");
+  });
+});