Prechádzať zdrojové kódy

Add conditional query expansion based on BM25 signal strength

Skip expensive LLM query expansion when initial BM25 search has
strong signals (top result score > 0.7). This saves LLM calls and
latency for queries that already match well.

- Run initial BM25 search first
- Check if top result has strong score
- Skip expansion if signal is strong
- Reuse initial search results for retrieval

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Tobi Lutke 5 mesiacov pred
rodič
commit
a0bceef36a
2 zmenil súbory, kde vykonal 35 pridanie a 19 odobranie
  1. 3 3
      .beads/issues.jsonl
  2. 32 16
      src/qmd.ts

+ 3 - 3
.beads/issues.jsonl

@@ -10,7 +10,7 @@
 {"id":"qmd-6s5","title":"Export current database to index.yml","description":"Write a script to export current collections and path_contexts from SQLite to ~/.config/qmd/index.yml format. Include all collection metadata and contexts.","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-13T09:54:52.707844-05:00","updated_at":"2025-12-13T09:57:36.650437-05:00","closed_at":"2025-12-13T09:57:36.650437-05:00","dependencies":[{"issue_id":"qmd-6s5","depends_on_id":"qmd-3z9","type":"blocks","created_at":"2025-12-13T09:55:07.606834-05:00","created_by":"daemon"}]}
 {"id":"qmd-7ss","title":"remove all the symlinks and stuff in the git repo, clean up the root directory","description":"","status":"closed","priority":4,"issue_type":"task","created_at":"2025-12-12T16:40:00.744982-05:00","updated_at":"2025-12-12T17:11:18.034215-05:00","closed_at":"2025-12-12T17:11:18.034215-05:00"}
 {"id":"qmd-8eu","title":"Update documents table schema for collection names","description":"Change documents.collection_id (integer FK) to documents.collection (text). Update all queries and indices. Keep backwards compatibility during transition.","design":"Schema change:\n- Add `collection TEXT` column\n- Migrate data: UPDATE documents SET collection = (SELECT name FROM collections WHERE id = collection_id)\n- Drop collection_id column\n- Update FTS5 trigger\n- Update all queries in store.ts","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-13T09:54:52.830305-05:00","updated_at":"2025-12-13T10:08:24.88716-05:00","closed_at":"2025-12-13T10:08:24.88716-05:00","dependencies":[{"issue_id":"qmd-8eu","depends_on_id":"qmd-6s5","type":"blocks","created_at":"2025-12-13T09:55:07.662048-05:00","created_by":"daemon"}]}
-{"id":"qmd-9ij","title":"Conditional query expansion based on BM25 signal strength","description":"Query expansion helps recall but injects false positives. Skip expansion if original BM25 top-5 has strong signals (exact term hits, high proximity). Only expand when recall is weak.","status":"open","priority":3,"issue_type":"feature","created_at":"2025-12-20T17:18:41.806447-05:00","updated_at":"2025-12-20T17:18:41.806447-05:00"}
+{"id":"qmd-9ij","title":"Conditional query expansion based on BM25 signal strength","description":"Query expansion helps recall but injects false positives. Skip expansion if original BM25 top-5 has strong signals (exact term hits, high proximity). Only expand when recall is weak.","status":"in_progress","priority":3,"issue_type":"feature","created_at":"2025-12-20T17:18:41.806447-05:00","updated_at":"2025-12-21T12:04:20.130497-05:00"}
 {"id":"qmd-9ua","title":"Update all qmd commands for YAML-based collections","description":"Update qmd.ts commands: collection add/list/remove/rename, status, update, ls. All should use collections.ts instead of store.ts collection functions.","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-13T09:54:53.14644-05:00","updated_at":"2025-12-13T10:17:39.67707-05:00","closed_at":"2025-12-13T10:17:39.67707-05:00","dependencies":[{"issue_id":"qmd-9ua","depends_on_id":"qmd-u84","type":"blocks","created_at":"2025-12-13T09:55:07.893268-05:00","created_by":"daemon"},{"issue_id":"qmd-9ua","depends_on_id":"qmd-oxy","type":"blocks","created_at":"2025-12-13T09:55:07.942221-05:00","created_by":"daemon"}]}
 {"id":"qmd-afe","title":"implement qmd collection rename, which changes the global path prefix for the collection","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:55:54.779325-05:00","updated_at":"2025-12-12T16:29:24.153196-05:00","closed_at":"2025-12-12T16:29:24.153196-05:00"}
 {"id":"qmd-ama","title":"Refactor database system","description":"All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection, path, hash,\n┃ created_at, updated_at. (collection,path)\n┃\n┃\n\n┃ All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection_id, path, hash,\n┃ created_at, updated_at. (collection,path) is unique. There is also collection which stores PWD\n┃ + glob pattern, name (\\w+). Every document is treated as path qmd://collection.name/","notes":"## Completed\n- ✅ Implemented content-addressable storage (content table with hash→doc mapping)\n- ✅ Refactored documents table as file system layer (collection_id, path, hash)\n- ✅ Added collection names (e.g., \"pages\", \"journals\", \"archive\")\n- ✅ Implemented virtual paths (qmd://collection-name/path/to/file.md)\n- ✅ Added hierarchical context support (collection-scoped)\n- ✅ Successfully migrated existing database\n- ✅ Updated search functions to work with new schema\n- ✅ Updated indexing logic to use content-addressable storage\n- ✅ Orphaned content hash cleanup\n\n## Still TODO\n- Fix migration SQL to properly extract basename (currently needs manual fix)\n- Implement `qmd collection add . --name \u003cname\u003e --mask '**/*.md'`\n- Implement `qmd ls [path]` for exploring virtual file tree","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:57:35.497489-05:00","updated_at":"2025-12-12T15:39:48.879143-05:00","closed_at":"2025-12-12T15:39:48.879143-05:00"}
@@ -19,9 +19,9 @@
 {"id":"qmd-bx1","title":"Fix migration SQL for proper basename extraction","description":"The migration currently generates collection names incorrectly (uses full path instead of basename). Need to fix the SQL in migrateToContentAddressable to properly extract the directory basename.","status":"closed","priority":1,"issue_type":"bug","created_at":"2025-12-12T15:29:53.757723-05:00","updated_at":"2025-12-12T15:50:29.349134-05:00","closed_at":"2025-12-12T15:50:29.349134-05:00","dependencies":[{"issue_id":"qmd-bx1","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.758524-05:00","created_by":"daemon"}]}
 {"id":"qmd-c0m","title":"Comprehensive CLI review and consistency pass","description":"Review entire CLI command structure:\n- Consistent naming (add vs create, remove vs delete)\n- Consistent flag usage (--name, --mask, etc)\n- Update help text for all commands\n- Ensure virtual paths work everywhere\n- Test all commands end-to-end","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-12T15:29:38.083564-05:00","updated_at":"2025-12-12T16:06:51.544695-05:00","closed_at":"2025-12-12T16:06:51.544695-05:00"}
 {"id":"qmd-clr","title":"fix embed","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T16:14:55.292114-05:00","updated_at":"2025-12-12T16:31:27.661829-05:00","closed_at":"2025-12-12T16:31:27.661829-05:00"}
-{"id":"qmd-d00","title":"Add offline evaluation harness for tuning","description":"Create a small benchmark with ~100 labeled queries from real searches. Would enable tuning: expansion on/off threshold, candidate count (30 vs 100), blending weights, reranker threshold.","status":"open","priority":3,"issue_type":"feature","created_at":"2025-12-20T17:18:42.007265-05:00","updated_at":"2025-12-20T17:18:42.007265-05:00"}
+{"id":"qmd-d00","title":"Add offline evaluation harness for tuning","description":"Create a small benchmark with ~100 labeled queries from real searches. Would enable tuning: expansion on/off threshold, candidate count (30 vs 100), blending weights, reranker threshold.","notes":"Test samples must be: 1) entirely synthetic, OR 2) public documents (e.g., public podcasts, public memos). No private/personal content in eval set.","status":"open","priority":3,"issue_type":"feature","created_at":"2025-12-20T17:18:42.007265-05:00","updated_at":"2025-12-21T12:04:11.951081-05:00"}
 {"id":"qmd-deh","title":"Refactor database introduce qmd collection *","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:56:04.516137-05:00","updated_at":"2025-12-12T16:12:12.349428-05:00","closed_at":"2025-12-12T16:12:12.349428-05:00"}
-{"id":"qmd-df5","title":"Rerank multiple chunks per document with score aggregation","description":"Currently we only rerank 1 chunk per doc (selected by keyword heuristic). Should rerank top 2-3 chunks per document, then aggregate scores (max, softmax, or top-2 average). This improves ranking for long documents where the keyword-matched chunk isn't always the most relevant.","status":"in_progress","priority":2,"issue_type":"feature","created_at":"2025-12-20T17:18:41.592575-05:00","updated_at":"2025-12-21T12:02:56.013748-05:00"}
+{"id":"qmd-df5","title":"Rerank multiple chunks per document with score aggregation","description":"Currently we only rerank 1 chunk per doc (selected by keyword heuristic). Should rerank top 2-3 chunks per document, then aggregate scores (max, softmax, or top-2 average). This improves ranking for long documents where the keyword-matched chunk isn't always the most relevant.","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-12-20T17:18:41.592575-05:00","updated_at":"2025-12-21T12:04:11.777309-05:00","closed_at":"2025-12-21T12:04:11.777309-05:00"}
 {"id":"qmd-dmi","title":"Implement 'qmd collection' commands","description":"Add explicit collection management:\n- qmd collection add . --name \u003cname\u003e --mask '**/*.md'\n- qmd collection list\n- qmd collection remove \u003cname\u003e\n\nThis gives users control over collection names and patterns.","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.810666-05:00","updated_at":"2025-12-12T16:02:08.079158-05:00","closed_at":"2025-12-12T16:02:08.079158-05:00","dependencies":[{"issue_id":"qmd-dmi","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.811294-05:00","created_by":"daemon"}]}
 {"id":"qmd-dt1","title":"Redesign context add command for better usability","description":"Current issues: \n1. Virtual path qmd://journals/ is rejected as invalid\n2. Syntax is confusing - sometimes path is first arg, sometimes second\n3. Need to support collection root context (qmd://name/)\n4. Should be intuitive: qmd context add \u003cwhere\u003e \u003cwhat\u003e\nDesign goals:\n- Support qmd://collection/ for collection root context\n- Support qmd://collection/path for path-specific context\n- Clear, consistent syntax\n- Good error messages","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-13T09:39:19.764114-05:00","updated_at":"2025-12-13T09:41:38.467861-05:00","closed_at":"2025-12-13T09:41:38.467861-05:00"}
 {"id":"qmd-e2c","title":"Implement 'qmd ls' command","description":"Add command to explore virtual file tree:\n- qmd ls → list all collections\n- qmd ls \u003ccollection\u003e → list files in collection\n- qmd ls \u003ccollection\u003e/\u003cpath\u003e → list files under path\nOutput: flat list of qmd:// paths","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.859804-05:00","updated_at":"2025-12-12T15:55:12.777701-05:00","closed_at":"2025-12-12T15:55:12.777701-05:00","dependencies":[{"issue_id":"qmd-e2c","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.860535-05:00","created_by":"daemon"}]}

+ 32 - 16
src/qmd.ts

@@ -2031,34 +2031,50 @@ async function querySearch(query: string, opts: OutputOptions, embedModel: strin
   // Check index health and warn about issues
   checkIndexHealth(db);
 
-  // Expand query using structured output
-  const expanded = await expandQueryStructured(query, true);
+  // Run initial BM25 search (will be reused for retrieval)
+  const initialFts = searchFTS(db, query, 20, collectionName as any);
+  const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
 
-  // Build query lists for each retrieval type
-  const ftsQueries: string[] = [query];
-  if (expanded.lexicalQuery && expanded.lexicalQuery !== query) {
-    ftsQueries.push(expanded.lexicalQuery);
-  }
+  // Check if initial results have strong signals (skip expansion if so)
+  // Strong signal = top result has high normalized score (> 0.7)
+  const hasStrongSignal = initialFts.length > 0 && initialFts[0].score > 0.7;
 
-  const vectorQueries: string[] = [query];
-  if (expanded.vectorQuery && expanded.vectorQuery !== query) {
-    vectorQueries.push(expanded.vectorQuery);
-  }
-  if (expanded.hyde && expanded.hyde.length > 20) {
-    vectorQueries.push(expanded.hyde);
+  let ftsQueries: string[] = [query];
+  let vectorQueries: string[] = [query];
+
+  if (hasStrongSignal) {
+    // Strong BM25 signal - skip expensive LLM expansion
+    process.stderr.write(`${c.dim}Strong BM25 signal (${initialFts[0].score.toFixed(2)}) - skipping expansion${c.reset}\n`);
+  } else {
+    // Weak signal - expand query for better recall
+    const expanded = await expandQueryStructured(query, true);
+
+    if (expanded.lexicalQuery && expanded.lexicalQuery !== query) {
+      ftsQueries.push(expanded.lexicalQuery);
+    }
+    if (expanded.vectorQuery && expanded.vectorQuery !== query) {
+      vectorQueries.push(expanded.vectorQuery);
+    }
+    if (expanded.hyde && expanded.hyde.length > 20) {
+      vectorQueries.push(expanded.hyde);
+    }
   }
 
   process.stderr.write(`${c.dim}Searching ${ftsQueries.length} lexical + ${vectorQueries.length} vector queries...${c.reset}\n`);
 
   // Collect ranked result lists for RRF fusion
   const rankedLists: RankedResult[][] = [];
-  const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
 
   // Map to store hash by filepath for final results
   const hashMap = new Map<string, string>();
 
-  // FTS searches with lexical queries
-  for (const q of ftsQueries) {
+  // FTS searches with lexical queries (reuse initial search for original query)
+  if (initialFts.length > 0) {
+    for (const r of initialFts) hashMap.set(r.filepath, r.hash);
+    rankedLists.push(initialFts.map(r => ({ file: r.filepath, displayPath: r.displayPath, title: r.title, body: r.body || "", score: r.score })));
+  }
+  // Run expanded queries (skip first which is original)
+  for (const q of ftsQueries.slice(1)) {
     const ftsResults = searchFTS(db, q, 20, collectionName as any);
     if (ftsResults.length > 0) {
       for (const r of ftsResults) hashMap.set(r.filepath, r.hash);