il y a 6 mois · 4d21c5ab2b
--- a/src/llm.test.ts
+++ b/src/llm.test.ts
@@ -302,20 +302,25 @@ describe("LlamaCpp Integration", () => {
 
				   });
			
 
				 
			
 
				   describe("expandQuery", () => {
			
 
				-    test("returns at least the original query", async () => {
			
 
				+    test("returns query expansions with correct types", async () => {
			
 
				       const result = await llm.expandQuery("test query");
			
 
				 
			
 
				-      // Result is now Queryable[] - check that at least one has the original query text
			
 
				-      const texts = result.map(q => q.text);
			
 
				-      expect(texts).toContain("test query");
			
 
				+      // Result is Queryable[] containing lex, vec, and/or hyde entries
			
 
				       expect(result.length).toBeGreaterThanOrEqual(1);
			
 
				+
			
 
				+      // Each result should have a valid type
			
 
				+      for (const q of result) {
			
 
				+        expect(["lex", "vec", "hyde"]).toContain(q.type);
			
 
				+        expect(q.text.length).toBeGreaterThan(0);
			
 
				+      }
			
 
				     }, 30000); // 30s timeout for model loading
			
 
				 
			
 
				-    test("returns original query first", async () => {
			
 
				-      const result = await llm.expandQuery("authentication setup");
			
 
				+    test("can exclude lexical queries", async () => {
			
 
				+      const result = await llm.expandQuery("authentication setup", { includeLexical: false });
			
 
				 
			
 
				-      // First result should have the original query text
			
 
				-      expect(result[0]?.text).toBe("authentication setup");
			
 
				+      // Should not contain any 'lex' type entries
			
 
				+      const lexEntries = result.filter(q => q.type === "lex");
			
 
				+      expect(lexEntries).toHaveLength(0);
			
 
				     });
			
 
				   });
			
 
				 });
			
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -700,6 +700,10 @@ Final Output:`;
 
				         return { type: type as QueryType, text };
			
 
				       }).filter((q): q is Queryable => q !== null);
			
 
				 
			
 
				+      // Filter out lex entries if not requested
			
 
				+      if (!includeLexical) {
			
 
				+        return queryables.filter(q => q.type !== 'lex');
			
 
				+      }
			
 
				       return queryables;
			
 
				     } catch (error) {
			
 
				       console.error("Structured query expansion failed:", error);
			
--- a/src/store.test.ts
+++ b/src/store.test.ts
@@ -353,12 +353,13 @@ describe("handelize", () => {
 
				   });
			
 
				 
			
 
				   test("handles unicode characters", () => {
			
 
				-    // Pure unicode with no alphanumerics throws error
			
 
				-    expect(() => handelize("日本語.md")).toThrow("no valid filename content");
			
 
				-    // Mixed unicode/ascii preserves the ascii parts
			
 
				-    expect(handelize("café-notes.md")).toBe("caf-notes.md");
			
 
				-    expect(handelize("naïve.md")).toBe("na-ve.md");
			
 
				-    expect(handelize("日本語-notes.md")).toBe("notes.md");
			
 
				+    // Pure unicode filenames are now supported (fixes GitHub issue #10)
			
 
				+    expect(handelize("日本語.md")).toBe("日本語.md");
			
 
				+    expect(handelize("Зоны и проекты.md")).toBe("зоны-и-проекты.md");
			
 
				+    // Mixed unicode/ascii preserves both
			
 
				+    expect(handelize("café-notes.md")).toBe("café-notes.md");
			
 
				+    expect(handelize("naïve.md")).toBe("naïve.md");
			
 
				+    expect(handelize("日本語-notes.md")).toBe("日本語-notes.md");
			
 
				   });
			
 
				 
			
 
				   test("handles dates and times in filenames", () => {
			
@@ -1813,6 +1814,47 @@ describe("LlamaCpp Integration", () => {
 
				     await cleanupTestDb(store);
			
 
				   });
			
 
				 
			
 
				+  test("searchVec filters by collection name", async () => {
			
 
				+    const store = await createTestStore();
			
 
				+    const collection1 = await createTestCollection({ name: "coll1", pwd: "/test/coll1" });
			
 
				+    const collection2 = await createTestCollection({ name: "coll2", pwd: "/test/coll2" });
			
 
				+
			
 
				+    const hash1 = "hash1abc";
			
 
				+    const hash2 = "hash2xyz";
			
 
				+
			
 
				+    await insertTestDocument(store.db, collection1, {
			
 
				+      name: "doc1",
			
 
				+      hash: hash1,
			
 
				+      body: "Content in collection one",
			
 
				+    });
			
 
				+
			
 
				+    await insertTestDocument(store.db, collection2, {
			
 
				+      name: "doc2",
			
 
				+      hash: hash2,
			
 
				+      body: "Content in collection two",
			
 
				+    });
			
 
				+
			
 
				+    // Create vectors_vec table with correct dimensions (768 for embeddinggemma)
			
 
				+    store.ensureVecTable(768);
			
 
				+    const embedding1 = Array(768).fill(0).map(() => Math.random());
			
 
				+    const embedding2 = Array(768).fill(0).map(() => Math.random());
			
 
				+    store.db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'test', ?)`).run(hash1, new Date().toISOString());
			
 
				+    store.db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, 'test', ?)`).run(hash2, new Date().toISOString());
			
 
				+    store.db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${hash1}_0`, new Float32Array(embedding1));
			
 
				+    store.db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${hash2}_0`, new Float32Array(embedding2));
			
 
				+
			
 
				+    // Search without filter - should return both
			
 
				+    const allResults = await store.searchVec("content", "embeddinggemma", 10);
			
 
				+    expect(allResults).toHaveLength(2);
			
 
				+
			
 
				+    // Search with collection filter - should return only from collection1
			
 
				+    const filtered = await store.searchVec("content", "embeddinggemma", 10, collection1 as unknown as number);
			
 
				+    expect(filtered).toHaveLength(1);
			
 
				+    expect(filtered[0]!.collectionName).toBe(collection1);
			
 
				+
			
 
				+    await cleanupTestDb(store);
			
 
				+  });
			
 
				+
			
 
				   test("expandQuery returns original plus expanded queries", async () => {
			
 
				     const store = await createTestStore();
			
 
				 
			
--- a/src/store.ts
+++ b/src/store.ts
@@ -603,11 +603,11 @@ export function handelize(path: string): string {
 
				   }
			
 
				 
			
 
				   // Check for paths that are just extensions or only dots/special chars
			
 
				-  // A valid path must have at least one alphanumeric character before processing
			
 
				+  // A valid path must have at least one letter or digit (including Unicode)
			
 
				   const segments = path.split('/').filter(Boolean);
			
 
				   const lastSegment = segments[segments.length - 1] || '';
			
 
				   const filenameWithoutExt = lastSegment.replace(/\.[^.]+$/, '');
			
 
				-  const hasValidContent = /[a-zA-Z0-9]/.test(filenameWithoutExt);
			
 
				+  const hasValidContent = /[\p{L}\p{N}]/u.test(filenameWithoutExt);
			
 
				   if (!hasValidContent) {
			
 
				     throw new Error(`handelize: path "${path}" has no valid filename content`);
			
 
				   }
			
@@ -626,14 +626,14 @@ export function handelize(path: string): string {
 
				         const nameWithoutExt = ext ? segment.slice(0, -ext.length) : segment;
			
 
				 
			
 
				         const cleanedName = nameWithoutExt
			
 
				-          .replace(/[\W_]+/g, '-')  // Replace non-word chars with dash
			
 
				+          .replace(/[^\p{L}\p{N}]+/gu, '-')  // Replace non-letter/digit chars with dash
			
 
				           .replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes
			
 
				 
			
 
				         return cleanedName + ext;
			
 
				       } else {
			
 
				         // For directories, just clean normally
			
 
				         return segment
			
 
				-          .replace(/[\W_]+/g, '-')
			
 
				+          .replace(/[^\p{L}\p{N}]+/gu, '-')
			
 
				           .replace(/^-+|-+$/g, '');
			
 
				       }
			
 
				     })
			
@@ -1729,16 +1729,17 @@ export async function searchVec(db: Database, query: string, model: string, limi
 
				     WHERE v.embedding MATCH ? AND k = ?
			
 
				   `;
			
 
				 
			
 
				+  const params: (Float32Array | number | string)[] = [new Float32Array(embedding), limit * 3];
			
 
				+
			
 
				   if (collectionId) {
			
 
				-    // Note: collectionId is a legacy parameter that should be phased out
			
 
				-    // Collections are now managed in YAML. For now, we interpret it as a collection name filter.
			
 
				+    // Filter by collection name
			
 
				     sql += ` AND d.collection = ?`;
			
 
				-    sql = sql.replace('?', String(collectionId)); // Hacky but maintains compatibility
			
 
				+    params.push(String(collectionId));
			
 
				   }
			
 
				 
			
 
				   sql += ` ORDER BY v.distance`;
			
 
				 
			
 
				-  const rows = db.prepare(sql).all(new Float32Array(embedding), limit * 3) as { hash_seq: string; distance: number; filepath: string; display_path: string; title: string; body: string; hash: string; pos: number }[];
			
 
				+  const rows = db.prepare(sql).all(...params) as { hash_seq: string; distance: number; filepath: string; display_path: string; title: string; body: string; hash: string; pos: number }[];
			
 
				 
			
 
				   const seen = new Map<string, { row: typeof rows[0]; bestDist: number }>();
			
 
				   for (const row of rows) {