structured-search.test.ts 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561
  /**
   * structured-search.test.ts - Tests for structured search functionality
   *
   * Tests cover:
   * - CLI query parser (parseStructuredQuery)
   * - ExpandedQuery type validation
   * - Basic structuredSearch function behavior
   * - Query validators (validateSemanticQuery, validateLexQuery)
   * - Lex query to FTS5 translation (test-local mirror of buildFTS5Query)
   *
   * Run with: bun test structured-search.test.ts
   */
  11. import { describe, test, expect, beforeAll, afterAll } from "vitest";
  12. import { mkdtemp, rm } from "node:fs/promises";
  13. import { tmpdir } from "node:os";
  14. import { join } from "node:path";
  15. import {
  16. createStore,
  17. structuredSearch,
  18. validateSemanticQuery,
  19. validateLexQuery,
  20. type ExpandedQuery,
  21. type Store,
  22. } from "../src/store.js";
  23. import { disposeDefaultLlamaCpp } from "../src/llm.js";
  24. // =============================================================================
  25. // parseStructuredQuery Tests (CLI Parser)
  26. // =============================================================================
  /**
   * Test-local copy of the CLI structured-query parser.
   *
   * A "query document" is one or more non-blank lines, each starting with a
   * `lex:`, `vec:` or `hyde:` prefix (case-insensitive).
   *
   * @param query Raw query text, possibly spanning several lines.
   * @returns The typed queries in input order, each tagged with its 1-based
   *   line number in the raw input; or `null` when the input is blank, is a
   *   single un-prefixed line, or is a single `expand:` line (all of which mean
   *   "use normal expansion").
   * @throws Error when `expand:` is combined with other lines or has no text,
   *   when a typed line has no text or contains a carriage return, or when a
   *   multi-line document contains a line without a recognised prefix.
   */
  function parseStructuredQuery(query: string): ExpandedQuery[] | null {
    const EXPAND_PREFIX = /^expand:\s*/i;
    const TYPED_PREFIX = /^(lex|vec|hyde):\s*/i;

    // Drop blank lines but remember each survivor's position in the raw input,
    // so error messages and results point at the line the user actually typed.
    const entries: { content: string; lineNo: number }[] = [];
    query.split('\n').forEach((rawLine, index) => {
      const content = rawLine.trim();
      if (content.length > 0) {
        entries.push({ content, lineNo: index + 1 });
      }
    });

    if (entries.length === 0) {
      return null;
    }

    const isSingleLine = entries.length === 1;
    const parsed: ExpandedQuery[] = [];

    for (const { content, lineNo } of entries) {
      if (EXPAND_PREFIX.test(content)) {
        // expand: is only legal as the sole line of the document.
        if (!isSingleLine) {
          throw new Error(`Line ${lineNo} starts with expand:, but query documents cannot mix expand with typed lines. Submit a single expand query instead.`);
        }
        if (content.replace(EXPAND_PREFIX, '').trim() === '') {
          throw new Error('expand: query must include text.');
        }
        return null;
      }

      const prefixMatch = content.match(TYPED_PREFIX);
      if (prefixMatch === null) {
        // A lone un-prefixed line is an ordinary query; inside a multi-line
        // document it is an error.
        if (isSingleLine) {
          return null;
        }
        throw new Error(`Line ${lineNo} is missing a lex:/vec:/hyde: prefix. Each line in a query document must start with one.`);
      }

      const kind = prefixMatch[1]!.toLowerCase() as 'lex' | 'vec' | 'hyde';
      const body = content.slice(prefixMatch[0].length).trim();
      if (body === '') {
        throw new Error(`Line ${lineNo} (${kind}:) must include text.`);
      }
      // Input was split on '\n', so in practice this catches a stray '\r'
      // left in the middle of a line.
      if (/\r|\n/.test(body)) {
        throw new Error(`Line ${lineNo} (${kind}:) contains a newline. Keep each query on a single line.`);
      }
      parsed.push({ type: kind, query: body, line: lineNo });
    }

    return parsed.length > 0 ? parsed : null;
  }
  /**
   * Spec for the test-local `parseStructuredQuery`: which inputs fall through
   * to normal expansion (null), which produce typed queries, and which are
   * rejected with an error.
   */
  describe("parseStructuredQuery", () => {
    type Kind = "lex" | "vec" | "hyde";

    // Shorthand for one expected parser output entry.
    const entry = (type: Kind, query: string, line: number) => ({ type, query, line });

    // Error-message matchers shared by several cases below.
    const MISSING_PREFIX = /missing a lex:\/vec:\/hyde:/;
    const NEEDS_TEXT = /must include text/;

    describe("plain queries (returns null for normal expansion)", () => {
      test("single line without prefix", () => {
        for (const input of ["CAP theorem", "distributed systems"]) {
          expect(parseStructuredQuery(input)).toBeNull();
        }
      });

      test("explicit expand line treated as plain query", () => {
        const parsed = parseStructuredQuery("expand: error handling best practices");
        expect(parsed).toBeNull();
      });

      test("empty queries", () => {
        for (const blank of ["", " ", "\n\n"]) {
          expect(parseStructuredQuery(blank)).toBeNull();
        }
      });
    });

    describe("single prefixed queries", () => {
      test("lex: prefix", () => {
        expect(parseStructuredQuery("lex: CAP theorem")).toEqual([entry("lex", "CAP theorem", 1)]);
      });

      test("vec: prefix", () => {
        expect(parseStructuredQuery("vec: what is the CAP theorem")).toEqual([
          entry("vec", "what is the CAP theorem", 1),
        ]);
      });

      test("hyde: prefix", () => {
        expect(parseStructuredQuery("hyde: The CAP theorem states that...")).toEqual([
          entry("hyde", "The CAP theorem states that...", 1),
        ]);
      });

      test("uppercase prefix", () => {
        expect(parseStructuredQuery("LEX: keywords")).toEqual([entry("lex", "keywords", 1)]);
        expect(parseStructuredQuery("VEC: question")).toEqual([entry("vec", "question", 1)]);
        expect(parseStructuredQuery("HYDE: passage")).toEqual([entry("hyde", "passage", 1)]);
      });

      test("mixed case prefix", () => {
        expect(parseStructuredQuery("Lex: test")).toEqual([entry("lex", "test", 1)]);
        expect(parseStructuredQuery("VeC: test")).toEqual([entry("vec", "test", 1)]);
      });
    });

    describe("multiple prefixed queries", () => {
      test("lex + vec", () => {
        expect(parseStructuredQuery("lex: keywords\nvec: natural language")).toEqual([
          entry("lex", "keywords", 1),
          entry("vec", "natural language", 2),
        ]);
      });

      test("all three types", () => {
        expect(parseStructuredQuery("lex: keywords\nvec: question\nhyde: hypothetical doc")).toEqual([
          entry("lex", "keywords", 1),
          entry("vec", "question", 2),
          entry("hyde", "hypothetical doc", 3),
        ]);
      });

      test("duplicate types allowed", () => {
        expect(parseStructuredQuery("lex: term1\nlex: term2\nlex: term3")).toEqual([
          entry("lex", "term1", 1),
          entry("lex", "term2", 2),
          entry("lex", "term3", 3),
        ]);
      });

      test("order preserved", () => {
        expect(parseStructuredQuery("hyde: passage\nvec: question\nlex: keywords")).toEqual([
          entry("hyde", "passage", 1),
          entry("vec", "question", 2),
          entry("lex", "keywords", 3),
        ]);
      });
    });

    describe("mixed plain and prefixed", () => {
      test("plain line with prefixed lines throws helpful error", () => {
        const attempt = () => parseStructuredQuery("plain keywords\nvec: semantic question");
        expect(attempt).toThrow(MISSING_PREFIX);
      });

      test("plain line prepended before other prefixed throws", () => {
        const attempt = () => parseStructuredQuery("keywords\nhyde: passage\nvec: question");
        expect(attempt).toThrow(MISSING_PREFIX);
      });
    });

    describe("error cases", () => {
      test("multiple plain lines throws", () => {
        expect(() => parseStructuredQuery("line one\nline two")).toThrow(MISSING_PREFIX);
      });

      test("three plain lines throws", () => {
        expect(() => parseStructuredQuery("a\nb\nc")).toThrow(MISSING_PREFIX);
      });

      test("mixing expand: with other lines throws", () => {
        const attempt = () => parseStructuredQuery("expand: question\nlex: keywords");
        expect(attempt).toThrow(/cannot mix expand with typed lines/);
      });

      test("expand: without text throws", () => {
        expect(() => parseStructuredQuery("expand: ")).toThrow(NEEDS_TEXT);
      });

      test("typed line without text throws", () => {
        expect(() => parseStructuredQuery("lex: \nvec: real")).toThrow(NEEDS_TEXT);
      });
    });

    describe("whitespace handling", () => {
      // Line numbers refer to the raw input, so skipped blank lines still count.
      test("empty lines ignored", () => {
        expect(parseStructuredQuery("lex: keywords\n\nvec: question\n")).toEqual([
          entry("lex", "keywords", 1),
          entry("vec", "question", 3),
        ]);
      });

      test("whitespace-only lines ignored", () => {
        expect(parseStructuredQuery("lex: keywords\n \nvec: question")).toEqual([
          entry("lex", "keywords", 1),
          entry("vec", "question", 3),
        ]);
      });

      test("leading/trailing whitespace trimmed from lines", () => {
        expect(parseStructuredQuery(" lex: keywords \n vec: question ")).toEqual([
          entry("lex", "keywords", 1),
          entry("vec", "question", 2),
        ]);
      });

      test("internal whitespace preserved in query", () => {
        expect(parseStructuredQuery("lex: multiple spaces ")).toEqual([entry("lex", "multiple spaces", 1)]);
      });

      test("empty prefix value throws", () => {
        expect(() => parseStructuredQuery("lex: \nvec: actual query")).toThrow(NEEDS_TEXT);
      });

      test("only empty prefix values throws", () => {
        expect(() => parseStructuredQuery("lex: \nvec: \nhyde: ")).toThrow(NEEDS_TEXT);
      });
    });

    describe("edge cases", () => {
      test("colon in query text preserved", () => {
        expect(parseStructuredQuery("lex: time: 12:30 PM")).toEqual([entry("lex", "time: 12:30 PM", 1)]);
      });

      test("prefix-like text in query preserved", () => {
        expect(parseStructuredQuery("vec: what does lex: mean")).toEqual([
          entry("vec", "what does lex: mean", 1),
        ]);
      });

      test("newline in hyde passage (as single line)", () => {
        // A hyde passage must fit on one line; users who want real newlines
        // have to escape them or use a multiline syntax.
        expect(parseStructuredQuery("hyde: The answer is X. It means Y.")).toEqual([
          entry("hyde", "The answer is X. It means Y.", 1),
        ]);
      });
    });
  });
  216. // =============================================================================
  217. // ExpandedQuery Type Tests
  218. // =============================================================================
  /**
   * Compile-time smoke test: each of the three query kinds must be assignable
   * to `ExpandedQuery` without the optional `line` field.
   */
  describe("ExpandedQuery type", () => {
    test("accepts lex type", () => {
      const lexQuery: ExpandedQuery = { type: "lex", query: "test" };
      const { type, query } = lexQuery;
      expect(type).toBe("lex");
      expect(query).toBe("test");
    });

    test("accepts vec type", () => {
      const vecQuery: ExpandedQuery = { type: "vec", query: "test" };
      const { type, query } = vecQuery;
      expect(type).toBe("vec");
      expect(query).toBe("test");
    });

    test("accepts hyde type", () => {
      const hydeQuery: ExpandedQuery = { type: "hyde", query: "test" };
      const { type, query } = hydeQuery;
      expect(type).toBe("hyde");
      expect(query).toBe("test");
    });
  });
  236. // =============================================================================
  237. // structuredSearch Function Tests
  238. // =============================================================================
  /**
   * Integration-style checks for `structuredSearch` against a fresh store
   * created in a temporary directory. The store is never populated here, and
   * only `lex` searches are exercised.
   */
  describe("structuredSearch", () => {
    let testDir: string;
    let store: Store;
    // QMD_CONFIG_DIR as it was before this suite overrode it, and whether the
    // override happened at all, so afterAll can put the environment back.
    let previousConfigDir: string | undefined;
    let configDirOverridden = false;

    beforeAll(async () => {
      testDir = await mkdtemp(join(tmpdir(), "qmd-structured-test-"));
      const testDbPath = join(testDir, "test.sqlite");
      const testConfigDir = await mkdtemp(join(testDir, "config-"));
      previousConfigDir = process.env.QMD_CONFIG_DIR;
      process.env.QMD_CONFIG_DIR = testConfigDir;
      configDirOverridden = true;
      store = createStore(testDbPath);
    });

    afterAll(async () => {
      try {
        // `store` is unset if beforeAll failed before createStore(); guard it
        // the same way `testDir` is guarded so the original failure is not
        // hidden behind a TypeError.
        if (store) {
          store.close();
        }
        await disposeDefaultLlamaCpp();
      } finally {
        // Do not leak the temporary config dir to whatever runs next in this
        // process. Assigning `undefined` to process.env would store the string
        // "undefined", so the variable is deleted when it was previously unset.
        if (configDirOverridden) {
          if (previousConfigDir === undefined) {
            delete process.env.QMD_CONFIG_DIR;
          } else {
            process.env.QMD_CONFIG_DIR = previousConfigDir;
          }
        }
        if (testDir) {
          await rm(testDir, { recursive: true, force: true });
        }
      }
    });

    test("returns empty array for empty searches", async () => {
      const results = await structuredSearch(store, []);
      expect(results).toEqual([]);
    });

    test("returns empty array when no documents match", async () => {
      const results = await structuredSearch(store, [
        { type: "lex", query: "nonexistent-term-xyz123" },
      ]);
      expect(results).toEqual([]);
    });

    test("accepts all search types without error", async () => {
      // These may return empty results but should not throw
      await expect(structuredSearch(store, [{ type: "lex", query: "test" }])).resolves.toBeDefined();
      // vec and hyde require embeddings, so just test lex
    });

    test("respects limit option", async () => {
      const results = await structuredSearch(store, [
        { type: "lex", query: "test" },
      ], { limit: 5 });
      expect(results.length).toBeLessThanOrEqual(5);
    });

    test("respects minScore option", async () => {
      const results = await structuredSearch(store, [
        { type: "lex", query: "test" },
      ], { minScore: 0.5 });
      for (const r of results) {
        expect(r.score).toBeGreaterThanOrEqual(0.5);
      }
    });

    test("throws when lex query contains newline characters", async () => {
      await expect(structuredSearch(store, [
        { type: "lex", query: "foo\nbar", line: 3 },
      ])).rejects.toThrow(/Line 3 \(lex\):/);
    });

    test("throws when lex query has unmatched quote", async () => {
      await expect(structuredSearch(store, [
        { type: "lex", query: "\"unfinished phrase", line: 2 },
      ])).rejects.toThrow(/unmatched double quote/);
    });
  });
  296. // =============================================================================
  297. // FTS Query Syntax Tests
  298. // =============================================================================
  /**
   * Checks for the two validators imported from the store module. Both are
   * called directly: a `null` result means the query is accepted, and a
   * rejected query yields a message that is matched here by substring.
   */
  describe("lex query syntax", () => {
    describe("validateSemanticQuery", () => {
      test("accepts plain natural language", () => {
        for (const question of ["how does error handling work", "what is the CAP theorem"]) {
          expect(validateSemanticQuery(question)).toBeNull();
        }
      });

      test("rejects negation syntax", () => {
        for (const negated of ["performance -sports", '-"exact phrase"']) {
          expect(validateSemanticQuery(negated)).toContain("Negation");
        }
      });

      test("rejects mid-query quoted negation", () => {
        const verdict = validateSemanticQuery('foo -"exact phrase"');
        expect(verdict).toContain("Negation");
      });

      test("accepts hyphenated words (no negation)", () => {
        // Regression guard for the hyphen-parsing UX bug: a hyphen inside a
        // word must NOT be treated as a negation operator.
        const hyphenated = [
          "when does a completed session get auto-archived",
          "pre-commit hook",
          "multi-session coordination",
          "cross-machine file ops",
          "long-running process",
          "well-known endpoint",
          "out-of-scope edits",
          "state-of-the-art model",
        ];
        for (const phrase of hyphenated) {
          expect(validateSemanticQuery(phrase)).toBeNull();
        }
      });

      test("accepts hyphenated word at start of query", () => {
        // The first token begins with a letter rather than `-`, so it must
        // not be read as a leading negation.
        const verdict = validateSemanticQuery("auto-archived session");
        expect(verdict).toBeNull();
      });

      test("accepts hyde-style hypothetical answers", () => {
        const passage =
          "The CAP theorem states that a distributed system cannot simultaneously provide consistency, availability, and partition tolerance.";
        expect(validateSemanticQuery(passage)).toBeNull();
      });
    });

    describe("validateLexQuery", () => {
      test("accepts basic lex query", () => {
        const verdict = validateLexQuery("auth token");
        expect(verdict).toBeNull();
      });

      test("rejects newline", () => {
        const verdict = validateLexQuery("foo\nbar");
        expect(verdict).toContain("single line");
      });

      test("rejects unmatched quote", () => {
        const verdict = validateLexQuery("\"unfinished");
        expect(verdict).toContain("unmatched");
      });
    });
  });
  348. // =============================================================================
  349. // buildFTS5Query Tests (lex parser)
  350. // =============================================================================
  351. describe("buildFTS5Query (lex parser)", () => {
  352. // Mirror the function for unit testing
  /** Strips everything except letters, digits and apostrophes, then lowercases. */
  function sanitizeFTS5Term(term: string): string {
    const kept = term.replace(/[^\p{L}\p{N}']/gu, '');
    return kept.toLowerCase();
  }

  /**
   * True when the token has at least one hyphen sitting between two
   * letter/digit characters and neither starts nor ends with a hyphen or
   * apostrophe.
   */
  function isHyphenatedToken(token: string): boolean {
    const HYPHENATED = /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u;
    return HYPHENATED.test(token);
  }

  /** Splits on hyphens, sanitizes each piece, and joins the non-empty pieces with spaces. */
  function sanitizeHyphenatedTerm(term: string): string {
    const pieces: string[] = [];
    for (const part of term.split('-')) {
      const clean = sanitizeFTS5Term(part);
      if (clean) pieces.push(clean);
    }
    return pieces.join(' ');
  }

  /**
   * Test-local mirror of the lex-query to FTS5 translation.
   *
   * Tokens are separated by whitespace or a double quote. A leading `-`
   * negates the token that follows. Quoted text and hyphenated words become
   * exact phrases (`"a b"`); any other word becomes a prefix match (`"a"*`).
   * Positive clauses are joined with ` AND `, and every negative clause is
   * appended as ` NOT <clause>`.
   *
   * @param query Raw lex query text.
   * @returns The FTS5 expression, or `null` when the query yields no positive
   *   clause (empty input, or negations only).
   */
  function buildFTS5Query(query: string): string | null {
    const include: string[] = [];
    const exclude: string[] = [];
    const text = query.trim();
    const size = text.length;
    let pos = 0;

    while (pos < size) {
      // Skip the whitespace separating tokens.
      while (pos < size && /\s/.test(text[pos]!)) pos++;
      if (pos >= size) break;

      const isNegated = text[pos] === '-';
      if (isNegated) pos++;
      const bucket = isNegated ? exclude : include;

      if (text[pos] === '"') {
        // Quoted phrase: runs to the closing quote, or to the end of the input
        // when the quote is never closed.
        const bodyStart = pos + 1;
        const closing = text.indexOf('"', bodyStart);
        const bodyEnd = closing === -1 ? size : closing;
        pos = bodyEnd + 1;

        const phrase = text.slice(bodyStart, bodyEnd).trim();
        if (phrase.length > 0) {
          const words = phrase
            .split(/\s+/)
            .map((word: string) => sanitizeFTS5Term(word))
            .filter((word: string) => word)
            .join(' ');
          if (words) bucket.push(`"${words}"`);
        }
        continue;
      }

      // Bare token: runs to the next whitespace or double quote.
      const tokenStart = pos;
      while (pos < size && !/[\s"]/.test(text[pos]!)) pos++;
      const token = text.slice(tokenStart, pos);

      if (isHyphenatedToken(token)) {
        const phrase = sanitizeHyphenatedTerm(token);
        if (phrase) bucket.push(`"${phrase}"`);
      } else {
        const word = sanitizeFTS5Term(token);
        if (word) bucket.push(`"${word}"*`);
      }
    }

    // Without at least one positive clause there is nothing to search for,
    // whether or not negations were collected.
    if (include.length === 0) return null;

    return [include.join(' AND '), ...exclude].join(' NOT ');
  }
  // Each row is [test name, lex query, expected FTS5 expression]. Rows are
  // registered as individual tests in the order listed.
  type ExactCase = readonly [name: string, input: string, expected: string];

  const registerExactCases = (rows: readonly ExactCase[]): void => {
    for (const [name, input, expected] of rows) {
      test(name, () => {
        expect(buildFTS5Query(input)).toBe(expected);
      });
    }
  };

  registerExactCases([
    ["plain terms → prefix match with AND", "foo bar", '"foo"* AND "bar"*'],
    ["single term", "performance", '"performance"*'],
    ["quoted phrase → exact match (no prefix)", '"machine learning"', '"machine learning"'],
    ["quoted phrase with mixed case sanitized", '"C++ performance"', '"c performance"'],
    ["negation of term", "performance -sports", '"performance"* NOT "sports"*'],
    ["negation of phrase", 'performance -"sports athlete"', '"performance"* NOT "sports athlete"'],
    ["multiple negations", "performance -sports -athlete", '"performance"* NOT "sports"* NOT "athlete"*'],
    ["quoted positive + negation", '"machine learning" -sports -athlete', '"machine learning" NOT "sports"* NOT "athlete"*'],
  ]);

  test("intent-aware C++ performance example", () => {
    const fts = buildFTS5Query('"C++ performance" optimization -sports -athlete');
    for (const fragment of ['NOT "sports"*', 'NOT "athlete"*', '"optimization"*']) {
      expect(fts).toContain(fragment);
    }
  });

  test("only negations with no positives → null (can't search)", () => {
    const fts = buildFTS5Query("-sports -athlete");
    expect(fts).toBeNull();
  });

  test("empty string → null", () => {
    for (const blank of ["", " "]) {
      expect(buildFTS5Query(blank)).toBeNull();
    }
  });

  registerExactCases([
    ["special chars in terms stripped", "hello!world", '"helloworld"*'],
    // Hyphenated token tests
    ["hyphenated term → phrase match", "multi-agent", '"multi agent"'],
    ["hyphenated identifier → phrase match", "DEC-0054", '"dec 0054"'],
    ["hyphenated model name → phrase match", "gpt-4", '"gpt 4"'],
    ["multi-hyphen term → phrase match", "foo-bar-baz", '"foo bar baz"'],
    ["hyphenated term mixed with plain terms", "multi-agent memory", '"multi agent" AND "memory"*'],
    ["negation still works alongside hyphenated terms", "multi-agent -sports", '"multi agent" NOT "sports"*'],
    ["negated hyphenated term", "performance -multi-agent", '"performance"* NOT "multi agent"'],
    ["plain negation still works (not confused with hyphen)", "performance -sports", '"performance"* NOT "sports"*'],
  ]);
  465. });