structured-search.test.ts 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. /**
  2. * structured-search.test.ts - Tests for structured search functionality
  3. *
  4. * Tests cover:
  5. * - CLI query parser (parseStructuredQuery)
  6. * - StructuredSubSearch type validation
  7. * - Basic structuredSearch function behavior
  8. *
  9. * Run with: bun test structured-search.test.ts
  10. */
  11. import { describe, test, expect, beforeAll, afterAll } from "vitest";
  12. import { mkdtemp, rm } from "node:fs/promises";
  13. import { tmpdir } from "node:os";
  14. import { join } from "node:path";
  15. import {
  16. createStore,
  17. structuredSearch,
  18. validateSemanticQuery,
  19. type StructuredSubSearch,
  20. type Store,
  21. } from "../src/store.js";
  22. import { disposeDefaultLlamaCpp } from "../src/llm.js";
  23. // =============================================================================
  24. // parseStructuredQuery Tests (CLI Parser)
  25. // =============================================================================
  /**
   * Local copy of the CLI's structured-query parser (the original lives in
   * qmd.ts), duplicated here so it can be tested in isolation.
   *
   * The input is split on newlines; each line is trimmed and blank lines are
   * dropped. A line starting with `lex:`, `vec:` or `hyde:` (case-insensitive)
   * becomes a sub-search of that type, unless nothing follows the prefix, in
   * which case the line is discarded. Any other line counts as a "plain" line.
   *
   * @param query Raw query text, possibly spanning several lines.
   * @returns The sub-searches in input order, with a single plain line (if any)
   *          placed first as a `lex` search; `null` when there is no prefixed
   *          sub-search at all, so the caller can fall back to normal expansion.
   * @throws Error when more than one plain line is present.
   */
  function parseStructuredQuery(query: string): StructuredSubSearch[] | null {
    const prefixPattern = /^(lex|vec|hyde):\s*/i;
    const prefixed: StructuredSubSearch[] = [];
    const unprefixed: string[] = [];

    for (const rawLine of query.split('\n')) {
      const line = rawLine.trim();
      if (line.length === 0) continue;

      const hit = line.match(prefixPattern);
      if (!hit) {
        unprefixed.push(line);
        continue;
      }

      // Everything after the matched prefix (and its trailing spaces) is the query text.
      const body = line.slice(hit[0].length).trim();
      if (body.length === 0) continue;

      const kind = hit[1]!.toLowerCase() as 'lex' | 'vec' | 'hyde';
      prefixed.push({ type: kind, query: body });
    }

    // Two or more plain lines cannot be interpreted unambiguously.
    if (unprefixed.length > 1) {
      throw new Error("Ambiguous query: multiple lines without lex:/vec:/hyde: prefix.");
    }

    // No usable prefixed line: empty input, a lone plain line, or only empty prefixes.
    if (prefixed.length === 0) {
      return null;
    }

    // A single plain line alongside prefixed ones is treated as a leading lex search.
    if (unprefixed.length === 1) {
      return [{ type: 'lex', query: unprefixed[0]! }, ...prefixed];
    }
    return prefixed;
  }
  // Suite for the local parseStructuredQuery copy: plain queries, prefixed
  // queries, mixed input, error cases, whitespace handling and edge cases.
  describe("parseStructuredQuery", () => {
    describe("plain queries (returns null for normal expansion)", () => {
      test("single line without prefix", () => {
        expect(parseStructuredQuery("CAP theorem")).toBeNull();
        expect(parseStructuredQuery("distributed systems")).toBeNull();
      });

      test("empty queries", () => {
        expect(parseStructuredQuery("")).toBeNull();
        expect(parseStructuredQuery(" ")).toBeNull();
        expect(parseStructuredQuery("\n\n")).toBeNull();
      });
    });

    describe("single prefixed queries", () => {
      test("lex: prefix", () => {
        const result = parseStructuredQuery("lex: CAP theorem");
        expect(result).toEqual([{ type: "lex", query: "CAP theorem" }]);
      });

      test("vec: prefix", () => {
        const result = parseStructuredQuery("vec: what is the CAP theorem");
        expect(result).toEqual([{ type: "vec", query: "what is the CAP theorem" }]);
      });

      test("hyde: prefix", () => {
        const result = parseStructuredQuery("hyde: The CAP theorem states that...");
        expect(result).toEqual([{ type: "hyde", query: "The CAP theorem states that..." }]);
      });

      test("uppercase prefix", () => {
        expect(parseStructuredQuery("LEX: keywords")).toEqual([{ type: "lex", query: "keywords" }]);
        expect(parseStructuredQuery("VEC: question")).toEqual([{ type: "vec", query: "question" }]);
        expect(parseStructuredQuery("HYDE: passage")).toEqual([{ type: "hyde", query: "passage" }]);
      });

      test("mixed case prefix", () => {
        expect(parseStructuredQuery("Lex: test")).toEqual([{ type: "lex", query: "test" }]);
        expect(parseStructuredQuery("VeC: test")).toEqual([{ type: "vec", query: "test" }]);
      });
    });

    describe("multiple prefixed queries", () => {
      test("lex + vec", () => {
        const result = parseStructuredQuery("lex: keywords\nvec: natural language");
        expect(result).toEqual([
          { type: "lex", query: "keywords" },
          { type: "vec", query: "natural language" },
        ]);
      });

      test("all three types", () => {
        const result = parseStructuredQuery("lex: keywords\nvec: question\nhyde: hypothetical doc");
        expect(result).toEqual([
          { type: "lex", query: "keywords" },
          { type: "vec", query: "question" },
          { type: "hyde", query: "hypothetical doc" },
        ]);
      });

      test("duplicate types allowed", () => {
        const result = parseStructuredQuery("lex: term1\nlex: term2\nlex: term3");
        expect(result).toEqual([
          { type: "lex", query: "term1" },
          { type: "lex", query: "term2" },
          { type: "lex", query: "term3" },
        ]);
      });

      test("order preserved", () => {
        const result = parseStructuredQuery("hyde: passage\nvec: question\nlex: keywords");
        expect(result).toEqual([
          { type: "hyde", query: "passage" },
          { type: "vec", query: "question" },
          { type: "lex", query: "keywords" },
        ]);
      });
    });

    describe("mixed plain and prefixed", () => {
      test("single plain line with prefixed lines -> plain becomes lex first", () => {
        const result = parseStructuredQuery("plain keywords\nvec: semantic question");
        expect(result).toEqual([
          { type: "lex", query: "plain keywords" },
          { type: "vec", query: "semantic question" },
        ]);
      });

      test("plain line prepended before other prefixed", () => {
        const result = parseStructuredQuery("keywords\nhyde: passage\nvec: question");
        expect(result).toEqual([
          { type: "lex", query: "keywords" },
          { type: "hyde", query: "passage" },
          { type: "vec", query: "question" },
        ]);
      });
    });

    describe("error cases", () => {
      test("multiple plain lines throws", () => {
        expect(() => parseStructuredQuery("line one\nline two")).toThrow("Ambiguous query");
      });

      test("three plain lines throws", () => {
        expect(() => parseStructuredQuery("a\nb\nc")).toThrow("Ambiguous query");
      });
    });

    describe("whitespace handling", () => {
      test("empty lines ignored", () => {
        const result = parseStructuredQuery("lex: keywords\n\nvec: question\n");
        expect(result).toEqual([
          { type: "lex", query: "keywords" },
          { type: "vec", query: "question" },
        ]);
      });

      test("whitespace-only lines ignored", () => {
        const result = parseStructuredQuery("lex: keywords\n \nvec: question");
        expect(result).toEqual([
          { type: "lex", query: "keywords" },
          { type: "vec", query: "question" },
        ]);
      });

      test("leading/trailing whitespace trimmed from lines", () => {
        const result = parseStructuredQuery(" lex: keywords \n vec: question ");
        expect(result).toEqual([
          { type: "lex", query: "keywords" },
          { type: "vec", query: "question" },
        ]);
      });

      test("internal whitespace preserved in query", () => {
        // The input must contain runs of several spaces: with single spaces this
        // test would still pass if the parser collapsed internal whitespace.
        const result = parseStructuredQuery("lex:   multiple   spaces  ");
        expect(result).toEqual([{ type: "lex", query: "multiple   spaces" }]);
      });

      test("empty prefix value skipped", () => {
        const result = parseStructuredQuery("lex: \nvec: actual query");
        expect(result).toEqual([{ type: "vec", query: "actual query" }]);
      });

      test("only empty prefix values returns null", () => {
        const result = parseStructuredQuery("lex: \nvec: \nhyde: ");
        expect(result).toBeNull();
      });
    });

    describe("edge cases", () => {
      test("colon in query text preserved", () => {
        const result = parseStructuredQuery("lex: time: 12:30 PM");
        expect(result).toEqual([{ type: "lex", query: "time: 12:30 PM" }]);
      });

      test("prefix-like text in query preserved", () => {
        // Only a prefix at the very start of a line is recognised.
        const result = parseStructuredQuery("vec: what does lex: mean");
        expect(result).toEqual([{ type: "vec", query: "what does lex: mean" }]);
      });

      test("newline in hyde passage (as single line)", () => {
        // If user wants actual newlines in hyde, they need to escape or use multiline syntax
        const result = parseStructuredQuery("hyde: The answer is X. It means Y.");
        expect(result).toEqual([{ type: "hyde", query: "The answer is X. It means Y." }]);
      });
    });
  });
  206. // =============================================================================
  207. // StructuredSubSearch Type Tests
  208. // =============================================================================
  // Shape checks for StructuredSubSearch: each literal is annotated with the
  // type, so a type-checking build would reject an unsupported `type` value;
  // at runtime the tests only confirm the fields round-trip.
  describe("StructuredSubSearch type", () => {
    test("accepts lex type", () => {
      const { type, query }: StructuredSubSearch = { type: "lex", query: "test" };
      expect(type).toBe("lex");
      expect(query).toBe("test");
    });

    test("accepts vec type", () => {
      const { type, query }: StructuredSubSearch = { type: "vec", query: "test" };
      expect(type).toBe("vec");
      expect(query).toBe("test");
    });

    test("accepts hyde type", () => {
      const { type, query }: StructuredSubSearch = { type: "hyde", query: "test" };
      expect(type).toBe("hyde");
      expect(query).toBe("test");
    });
  });
  226. // =============================================================================
  227. // structuredSearch Function Tests
  228. // =============================================================================
  // Tests for structuredSearch against a store created in a fresh temporary
  // directory. No documents are indexed in this suite, so the tests only
  // exercise lex searches and option handling.
  describe("structuredSearch", () => {
    let testDir: string;
    let store: Store;
    // Value of QMD_CONFIG_DIR before this suite overrides it, restored in afterAll.
    let previousConfigDir: string | undefined;

    beforeAll(async () => {
      testDir = await mkdtemp(join(tmpdir(), "qmd-structured-test-"));
      const testDbPath = join(testDir, "test.sqlite");
      const testConfigDir = await mkdtemp(join(testDir, "config-"));

      // Point the config directory at a throwaway location for this suite.
      // NOTE(review): presumably read by createStore or its config loader - confirm.
      previousConfigDir = process.env.QMD_CONFIG_DIR;
      process.env.QMD_CONFIG_DIR = testConfigDir;

      store = createStore(testDbPath);
    });

    afterAll(async () => {
      try {
        // `store` is still unset if beforeAll failed before createStore returned.
        if (store) {
          store.close();
        }
        await disposeDefaultLlamaCpp();
      } finally {
        // Always undo the environment override and remove the temp directory,
        // even when closing the store or disposing the LLM throws.
        if (previousConfigDir === undefined) {
          delete process.env.QMD_CONFIG_DIR;
        } else {
          process.env.QMD_CONFIG_DIR = previousConfigDir;
        }
        if (testDir) {
          await rm(testDir, { recursive: true, force: true });
        }
      }
    });

    test("returns empty array for empty searches", async () => {
      const results = await structuredSearch(store, []);
      expect(results).toEqual([]);
    });

    test("returns empty array when no documents match", async () => {
      const results = await structuredSearch(store, [
        { type: "lex", query: "nonexistent-term-xyz123" }
      ]);
      expect(results).toEqual([]);
    });

    test("accepts all search types without error", async () => {
      // These may return empty results but should not throw
      await expect(structuredSearch(store, [{ type: "lex", query: "test" }])).resolves.toBeDefined();
      // vec and hyde require embeddings, so just test lex
    });

    test("respects limit option", async () => {
      const results = await structuredSearch(store, [
        { type: "lex", query: "test" }
      ], { limit: 5 });
      expect(results.length).toBeLessThanOrEqual(5);
    });

    test("respects minScore option", async () => {
      const results = await structuredSearch(store, [
        { type: "lex", query: "test" }
      ], { minScore: 0.5 });
      // Vacuously true when nothing matches; every returned hit must meet the threshold.
      for (const r of results) {
        expect(r.score).toBeGreaterThanOrEqual(0.5);
      }
    });
  });
  276. // =============================================================================
  277. // FTS Query Syntax Tests
  278. // =============================================================================
  describe("lex query syntax", () => {
    // Note: buildFTS5Query is not exported, so this suite exercises the
    // exported validateSemanticQuery helper instead. The expectations below
    // treat a null result as "accepted" and a message as "rejected".
    describe("validateSemanticQuery", () => {
      test("accepts plain natural language", () => {
        const accepted = [
          "how does error handling work",
          "what is the CAP theorem",
        ];
        for (const query of accepted) {
          expect(validateSemanticQuery(query)).toBeNull();
        }
      });

      test("rejects negation syntax", () => {
        const rejected = ["performance -sports", '-"exact phrase"'];
        for (const query of rejected) {
          expect(validateSemanticQuery(query)).toContain("Negation");
        }
      });

      test("accepts hyde-style hypothetical answers", () => {
        const passage =
          "The CAP theorem states that a distributed system cannot simultaneously provide consistency, availability, and partition tolerance.";
        expect(validateSemanticQuery(passage)).toBeNull();
      });
    });
  });
  297. // =============================================================================
  298. // buildFTS5Query Tests (lex parser)
  299. // =============================================================================
  describe("buildFTS5Query (lex parser)", () => {
    // Local mirrors of the lex-query helpers so they can be unit tested here.

    /**
     * Lowercases a term after removing every character that is not a Unicode
     * letter, a Unicode number or an apostrophe.
     */
    const sanitizeFTS5Term = (term: string): string =>
      term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();

    /**
     * Turns a lex query into an FTS5 match expression.
     *
     * - Bare terms become prefix matches: `"term"*`.
     * - Double-quoted phrases become exact phrases: `"some phrase"`.
     * - A leading `-` marks the term or phrase as excluded (`NOT ...`).
     * - Positive clauses are joined with `AND`; exclusions are appended after them.
     *
     * @returns The expression, or null when the query has no positive clause.
     */
    function buildFTS5Query(query: string): string | null {
      const input = query.trim();
      const required: string[] = [];
      const excluded: string[] = [];
      let pos = 0;

      while (pos < input.length) {
        // Skip whitespace between tokens.
        if (/\s/.test(input[pos]!)) {
          pos++;
          continue;
        }

        const isNegated = input[pos] === '-';
        if (isNegated) pos++;
        const bucket = isNegated ? excluded : required;

        if (input[pos] === '"') {
          // Quoted phrase: runs to the next quote, or to end of input if unterminated.
          const closing = input.indexOf('"', pos + 1);
          const end = closing === -1 ? input.length : closing;
          const words = input
            .slice(pos + 1, end)
            .trim()
            .split(/\s+/)
            .map((word: string) => sanitizeFTS5Term(word))
            .filter((word: string) => word);
          pos = end + 1;
          if (words.length > 0) bucket.push(`"${words.join(' ')}"`);
        } else {
          // Bare term: runs until whitespace or the start of a quoted phrase.
          let end = pos;
          while (end < input.length && !/[\s"]/.test(input[end]!)) end++;
          const cleaned = sanitizeFTS5Term(input.slice(pos, end));
          pos = end;
          if (cleaned) bucket.push(`"${cleaned}"*`);
        }
      }

      // Exclusions alone cannot drive a search.
      if (required.length === 0) return null;
      return [required.join(' AND '), ...excluded].join(' NOT ');
    }

    test("plain terms → prefix match with AND", () => {
      const built = buildFTS5Query("foo bar");
      expect(built).toBe('"foo"* AND "bar"*');
    });

    test("single term", () => {
      const built = buildFTS5Query("performance");
      expect(built).toBe('"performance"*');
    });

    test("quoted phrase → exact match (no prefix)", () => {
      const built = buildFTS5Query('"machine learning"');
      expect(built).toBe('"machine learning"');
    });

    test("quoted phrase with mixed case sanitized", () => {
      const built = buildFTS5Query('"C++ performance"');
      expect(built).toBe('"c performance"');
    });

    test("negation of term", () => {
      const built = buildFTS5Query("performance -sports");
      expect(built).toBe('"performance"* NOT "sports"*');
    });

    test("negation of phrase", () => {
      const built = buildFTS5Query('performance -"sports athlete"');
      expect(built).toBe('"performance"* NOT "sports athlete"');
    });

    test("multiple negations", () => {
      const built = buildFTS5Query("performance -sports -athlete");
      expect(built).toBe('"performance"* NOT "sports"* NOT "athlete"*');
    });

    test("quoted positive + negation", () => {
      const built = buildFTS5Query('"machine learning" -sports -athlete');
      expect(built).toBe('"machine learning" NOT "sports"* NOT "athlete"*');
    });

    test("intent-aware C++ performance example", () => {
      const built = buildFTS5Query('"C++ performance" optimization -sports -athlete');
      expect(built).toContain('NOT "sports"*');
      expect(built).toContain('NOT "athlete"*');
      expect(built).toContain('"optimization"*');
    });

    test("only negations with no positives → null (can't search)", () => {
      expect(buildFTS5Query("-sports -athlete")).toBeNull();
    });

    test("empty string → null", () => {
      for (const blank of ["", " "]) {
        expect(buildFTS5Query(blank)).toBeNull();
      }
    });

    test("special chars in terms stripped", () => {
      const built = buildFTS5Query("hello!world");
      expect(built).toBe('"helloworld"*');
    });
  });