Browse Source

feat: add collections array filter + improve query writing docs

- structured_search now accepts collections[] for OR filtering
- Updated skill docs with detailed query writing guidance
- lex: 2-5 keywords, include synonyms, exact names
- vec: full natural language questions with context
- hyde: 50-100 word hypothetical answer passages
Tobi Lütke 3 months ago
parent
commit
d1ec31eab8
3 changed files with 93 additions and 54 deletions
  1. + 58 - 30
      skills/qmd/SKILL.md
  2. + 5 - 5
      src/mcp.ts
  3. + 30 - 19
      src/store.ts

+ 58 - 30
skills/qmd/SKILL.md

@@ -11,54 +11,87 @@ allowed-tools: Bash(qmd:*), mcp__qmd__*
 
 # QMD - Quick Markdown Search
 
-Local search engine for markdown content. Indexes notes, docs, and knowledge bases.
+Local search engine for markdown content.
 
 ## Status
 
 !`qmd status 2>/dev/null || echo "Not installed: npm install -g @tobilu/qmd"`
 
-## MCP Search — `structured_search`
-
-Pass 1-4 sub-queries with type `lex`, `vec`, or `hyde`:
+## MCP: `structured_search`
 
 ```json
 {
   "searches": [
     { "type": "lex", "query": "CAP theorem consistency" },
     { "type": "vec", "query": "tradeoff between consistency and availability" }
-  ]
+  ],
+  "collections": ["notes", "docs"],
+  "limit": 10
 }
 ```
 
-| Type | Method | What to Write |
-|------|--------|---------------|
-| `lex` | BM25 keywords | Short phrases — exact terms, names, code |
-| `vec` | Vector search | Natural language question |
-| `hyde` | Vector search | Hypothetical answer (50-100 words) |
+### Search Types
+
+| Type | Method | Input |
+|------|--------|-------|
+| `lex` | BM25 | Keywords — exact terms, names, code |
+| `vec` | Vector | Question — natural language |
+| `hyde` | Vector | Answer — hypothetical result (50-100 words) |
+
+### Writing Good Queries
+
+**lex (keyword)**
+- 2-5 terms, no filler words
+- Include synonyms: `"auth authentication login"`
+- Use exact names: `"PostgreSQL connection pool"`
+- Code identifiers work: `"handleError async"`
+
+**vec (semantic)**
+- Full natural language question
+- Be specific: `"how does the rate limiter handle burst traffic"` not `"rate limiting"`
+- Include context: `"in the payment service, how are refunds processed"`
+
+**hyde (hypothetical document)**
+- Write 50-100 words of what the *answer* looks like
+- Use the vocabulary you expect in the result
+- Example: `"The rate limiter uses a sliding window algorithm with a 60-second window. When a client exceeds 100 requests per minute, subsequent requests return 429 Too Many Requests until the window resets."`
+
+### Combining Types
+
+| Goal | Approach |
+|------|----------|
+| Know exact terms | `lex` only |
+| Don't know vocabulary | `vec` only |
+| Best recall | `lex` + `vec` |
+| Complex topic | `lex` + `vec` + `hyde` |
 
-**Tips:**
-- Quick lookup → single `lex` query
-- Don't know exact terms → use `vec`
-- Best results → combine `lex` + `vec` (+ `hyde` for complex topics)
-- First query gets 2x weight
+First query gets 2x weight in fusion — put your best guess first.
 
-## MCP Tools
+### Collection Filtering
+
+```jsonc
+{ "collection": "docs" }           // Single collection
+{ "collections": ["docs", "notes"] }  // Multiple (OR)
+```
+
+Omit both to search all collections. If both are given, `collections` takes precedence and `collection` is ignored.
+
+## Other MCP Tools
 
 | Tool | Use |
 |------|-----|
-| `structured_search` | Search with lex/vec/hyde queries |
 | `get` | Retrieve doc by path or `#docid` |
-| `multi_get` | Retrieve multiple docs by glob/list |
-| `status` | Index health and collections |
+| `multi_get` | Retrieve multiple by glob/list |
+| `status` | Collections and health |
 
 ## CLI
 
 ```bash
-qmd search "keywords"           # BM25 keyword search
-qmd vsearch "question"          # Vector similarity
-qmd query "question"            # Auto-expand + rerank
-qmd query $'lex: X\nvec: Y'     # Structured (same as MCP)
-qmd get "#abc123"               # Retrieve by docid
+qmd query "question"              # Auto-expand + rerank
+qmd query $'lex: X\nvec: Y'       # Structured
+qmd search "keywords"             # BM25 only
+qmd vsearch "question"            # Vector only
+qmd get "#abc123"                 # By docid
 ```
 
 ## Setup
@@ -66,10 +99,5 @@ qmd get "#abc123"               # Retrieve by docid
 ```bash
 npm install -g @tobilu/qmd
 qmd collection add ~/notes --name notes
-qmd embed                       # Generate embeddings
-```
-
-MCP config for Claude Code (`~/.claude/settings.json`):
-```json
-{ "mcpServers": { "qmd": { "command": "qmd", "args": ["mcp"] } } }
+qmd embed
 ```

+ 5 - 5
src/mcp.ts

@@ -261,11 +261,11 @@ function createMcpServer(store: Store): McpServer {
         ),
         limit: z.number().optional().default(10).describe("Maximum number of results (default: 10)"),
         minScore: z.number().optional().default(0).describe("Minimum relevance score 0-1 (default: 0)"),
-        collection: z.string().optional().describe("Filter to a specific collection by name"),
-        intent: z.string().optional().describe("(Future) Domain intent hint, e.g., 'distributed systems', 'startup finances'"),
+        collection: z.string().optional().describe("Filter to a single collection by name"),
+        collections: z.array(z.string()).optional().describe("Filter to multiple collections (OR match)"),
       },
     },
-    async ({ searches, limit, minScore, collection, intent }) => {
+    async ({ searches, limit, minScore, collection, collections }) => {
       // Map to internal format
       const subSearches: StructuredSubSearch[] = searches.map(s => ({
         type: s.type,
@@ -274,9 +274,9 @@ function createMcpServer(store: Store): McpServer {
 
       const results = await structuredSearch(store, subSearches, {
         collection,
+        collections,
         limit,
         minScore,
-        intent,
       });
 
       // Use first lex or vec query for snippet extraction
@@ -582,9 +582,9 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
 
         const results = await structuredSearch(store, subSearches, {
           collection: params.collection,
+          collections: params.collections,
           limit: params.limit ?? 10,
           minScore: params.minScore ?? 0,
-          intent: params.intent,
         });
 
         // Use first lex or vec query for snippet extraction

+ 30 - 19
src/store.ts

@@ -3072,7 +3072,8 @@ export interface StructuredSubSearch {
 }
 
 export interface StructuredSearchOptions {
-  collection?: string;
+  collection?: string;      // Single collection filter
+  collections?: string[];   // Multiple collections filter (OR)
   limit?: number;           // default 10
   minScore?: number;        // default 0
   candidateLimit?: number;  // default RERANK_CANDIDATE_LIMIT
@@ -3107,9 +3108,12 @@ export async function structuredSearch(
   const limit = options?.limit ?? 10;
   const minScore = options?.minScore ?? 0;
   const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
-  const collection = options?.collection;
   const hooks = options?.hooks;
 
+  // Normalize collection filter to array (undefined = all collections)
+  const collections: string[] | undefined = options?.collections
+    ?? (options?.collection ? [options.collection] : undefined);
+
   if (searches.length === 0) return [];
 
   const rankedLists: RankedResult[][] = [];
@@ -3118,16 +3122,21 @@ export async function structuredSearch(
     `SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`
   ).get();
 
+  // Collections to query one at a time (each search below loops over this list)
+  const collectionList = collections ?? [undefined]; // undefined = all collections
+
   // Step 1: Run FTS for all lex searches (sync, instant)
   for (const search of searches) {
     if (search.type === 'lex') {
-      const ftsResults = store.searchFTS(search.query, 20, collection);
-      if (ftsResults.length > 0) {
-        for (const r of ftsResults) docidMap.set(r.filepath, r.docid);
-        rankedLists.push(ftsResults.map(r => ({
-          file: r.filepath, displayPath: r.displayPath,
-          title: r.title, body: r.body || "", score: r.score,
-        })));
+      for (const coll of collectionList) {
+        const ftsResults = store.searchFTS(search.query, 20, coll);
+        if (ftsResults.length > 0) {
+          for (const r of ftsResults) docidMap.set(r.filepath, r.docid);
+          rankedLists.push(ftsResults.map(r => ({
+            file: r.filepath, displayPath: r.displayPath,
+            title: r.title, body: r.body || "", score: r.score,
+          })));
+        }
       }
     }
   }
@@ -3144,16 +3153,18 @@ export async function structuredSearch(
         const embedding = embeddings[i]?.embedding;
         if (!embedding) continue;
 
-        const vecResults = await store.searchVec(
-          vecSearches[i]!.query, DEFAULT_EMBED_MODEL, 20, collection,
-          undefined, embedding
-        );
-        if (vecResults.length > 0) {
-          for (const r of vecResults) docidMap.set(r.filepath, r.docid);
-          rankedLists.push(vecResults.map(r => ({
-            file: r.filepath, displayPath: r.displayPath,
-            title: r.title, body: r.body || "", score: r.score,
-          })));
+        for (const coll of collectionList) {
+          const vecResults = await store.searchVec(
+            vecSearches[i]!.query, DEFAULT_EMBED_MODEL, 20, coll,
+            undefined, embedding
+          );
+          if (vecResults.length > 0) {
+            for (const r of vecResults) docidMap.set(r.filepath, r.docid);
+            rankedLists.push(vecResults.map(r => ({
+              file: r.filepath, displayPath: r.displayPath,
+              title: r.title, body: r.body || "", score: r.score,
+            })));
+          }
         }
       }
     }