4 miesięcy temu · 8ebe5ab31b
--- a/src/qmd.ts
+++ b/src/qmd.ts
@@ -59,8 +59,10 @@ import {
 
				   handelize,
			
 
				   hybridQuery,
			
 
				   vectorSearchQuery,
			
 
				+  structuredSearch,
			
 
				   addLineNumbers,
			
 
				   type ExpandedQuery,
			
 
				+  type StructuredSubSearch,
			
 
				   DEFAULT_EMBED_MODEL,
			
 
				   DEFAULT_RERANK_MODEL,
			
 
				   DEFAULT_GLOB,
			
@@ -1939,6 +1941,64 @@ function filterByCollections<T extends { filepath?: string; file?: string }>(res
 
				   });
			
 
				 }
			
 
				 
			
 
				/**
				 * Parse the structured search syntax of a query string.
				 *
				 * The query is split on newlines; every line is trimmed and blank lines are
				 * ignored. Each remaining line is classified as either:
				 *
				 *   - prefixed: starts with `lex:`, `vec:` or `hyde:` (case-insensitive).
				 *     The text after the prefix becomes a sub-search of that type. A prefixed
				 *     line whose text is empty is silently dropped.
				 *   - plain: anything else.
				 *
				 * Result:
				 *
				 *   - `null` when no usable sub-search was found: empty input, a single
				 *     plain line, or only prefixed lines with empty text. `null` signals
				 *     "not structured", i.e. the caller should run its normal
				 *     (auto-expanded) query instead.
				 *   - `StructuredSubSearch[]` when at least one prefixed line carries text.
				 *     If exactly one plain line accompanies them it is NOT expanded; it is
				 *     treated as a `lex` sub-search and placed first in the result.
				 *
				 * Examples:
				 *   "CAP theorem"                    -> null (plain query, use expansion)
				 *   "lex: CAP theorem"               -> [{ type: 'lex', query: 'CAP theorem' }]
				 *   "lex: CAP\nvec: consistency"     -> [{ type: 'lex', ... }, { type: 'vec', ... }]
				 *   "CAP\nvec: consistency"          -> [{ type: 'lex', query: 'CAP' }, { type: 'vec', ... }]
				 *   "CAP\nconsistency"               -> throws (multiple plain lines)
				 *
				 * @param query - Raw query text, possibly spanning several lines.
				 * @returns The parsed sub-searches, or `null` for a non-structured query.
				 * @throws Error if more than one plain (unprefixed) line is present, since
				 *   it is ambiguous how those lines should be routed.
				 */
				function parseStructuredQuery(query: string): StructuredSubSearch[] | null {
				  // Prefix plus any whitespace after the colon; group 1 is the search type.
				  const prefixRe = /^(lex|vec|hyde):\s*/i;
				  const searches: StructuredSubSearch[] = [];
				  const plainLines: string[] = [];

				  for (const rawLine of query.split('\n')) {
				    const line = rawLine.trim();
				    if (line.length === 0) continue;

				    const match = prefixRe.exec(line);
				    const prefix = match?.[1]?.toLowerCase();
				    // Comparing against the literals narrows `prefix` to the sub-search
				    // type union, so no cast or non-null assertion is needed below.
				    if (match === null || (prefix !== 'lex' && prefix !== 'vec' && prefix !== 'hyde')) {
				      plainLines.push(line);
				      continue;
				    }

				    const text = line.slice(match[0].length).trim();
				    // A bare prefix such as "lex:" carries nothing to search for.
				    if (text.length > 0) {
				      searches.push({ type: prefix, query: text });
				    }
				  }

				  // Several unprefixed lines cannot be routed unambiguously.
				  if (plainLines.length > 1) {
				    throw new Error(
				      `Ambiguous query: multiple lines without lex:/vec:/hyde: prefix.\n` +
				      `Either use a single line (for query expansion) or prefix each line.\n` +
				      `Example:\n  lex: keyword terms\n  vec: natural language question\n  hyde: hypothetical answer passage`
				    );
				  }

				  // Nothing usable was prefixed -> not a structured query.
				  if (searches.length === 0) {
				    return null;
				  }

				  // A lone plain line next to prefixed lines is searched as keywords (lex)
				  // and goes first, ahead of the explicitly prefixed sub-searches.
				  const [plain] = plainLines;
				  if (plain !== undefined) {
				    return [{ type: 'lex', query: plain }, ...searches];
				  }

				  return searches;
				}
			
 
				+
			
 
				 function search(query: string, opts: OutputOptions): void {
			
 
				   const db = getDb();
			
 
				 
			
@@ -2055,28 +2115,63 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
 
				 
			
 
				   checkIndexHealth(store.db);
			
 
				 
			
 
				+  // Check for structured query syntax (lex:/vec:/hyde: prefixes)
			
 
				+  const structuredQueries = parseStructuredQuery(query);
			
 
				+
			
 
				   await withLLMSession(async () => {
			
 
				-    let results = await hybridQuery(store, query, {
			
 
				-      collection: singleCollection,
			
 
				-      limit: opts.all ? 500 : (opts.limit || 10),
			
 
				-      minScore: opts.minScore || 0,
			
 
				-      hooks: {
			
 
				-        onStrongSignal: (score) => {
			
 
				-          process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`);
			
 
				-        },
			
 
				-        onExpand: (original, expanded) => {
			
 
				-          logExpansionTree(original, expanded);
			
 
				-          process.stderr.write(`${c.dim}Searching ${expanded.length + 1} queries...${c.reset}\n`);
			
 
				-        },
			
 
				-        onRerankStart: (chunkCount) => {
			
 
				-          process.stderr.write(`${c.dim}Reranking ${chunkCount} chunks...${c.reset}\n`);
			
 
				-          progress.indeterminate();
			
 
				+    let results;
			
 
				+
			
 
				+    if (structuredQueries) {
			
 
				+      // Structured search — user provided their own query expansions
			
 
				+      const typeLabels = structuredQueries.map(s => s.type).join('+');
			
 
				+      process.stderr.write(`${c.dim}Structured search: ${structuredQueries.length} queries (${typeLabels})${c.reset}\n`);
			
 
				+      
			
 
				+      // Log each sub-query
			
 
				+      for (const s of structuredQueries) {
			
 
				+        let preview = s.query.replace(/\n/g, ' ');
			
 
				+        if (preview.length > 72) preview = preview.substring(0, 69) + '...';
			
 
				+        process.stderr.write(`${c.dim}├─ ${s.type}: ${preview}${c.reset}\n`);
			
 
				+      }
			
 
				+      process.stderr.write(`${c.dim}└─ Searching...${c.reset}\n`);
			
 
				+
			
 
				+      results = await structuredSearch(store, structuredQueries, {
			
 
				+        collection: singleCollection,
			
 
				+        limit: opts.all ? 500 : (opts.limit || 10),
			
 
				+        minScore: opts.minScore || 0,
			
 
				+        hooks: {
			
 
				+          onRerankStart: (chunkCount) => {
			
 
				+            process.stderr.write(`${c.dim}Reranking ${chunkCount} chunks...${c.reset}\n`);
			
 
				+            progress.indeterminate();
			
 
				+          },
			
 
				+          onRerankDone: () => {
			
 
				+            progress.clear();
			
 
				+          },
			
 
				         },
			
 
				-        onRerankDone: () => {
			
 
				-          progress.clear();
			
 
				+      });
			
 
				+    } else {
			
 
				+      // Standard hybrid query with automatic expansion
			
 
				+      results = await hybridQuery(store, query, {
			
 
				+        collection: singleCollection,
			
 
				+        limit: opts.all ? 500 : (opts.limit || 10),
			
 
				+        minScore: opts.minScore || 0,
			
 
				+        hooks: {
			
 
				+          onStrongSignal: (score) => {
			
 
				+            process.stderr.write(`${c.dim}Strong BM25 signal (${score.toFixed(2)}) — skipping expansion${c.reset}\n`);
			
 
				+          },
			
 
				+          onExpand: (original, expanded) => {
			
 
				+            logExpansionTree(original, expanded);
			
 
				+            process.stderr.write(`${c.dim}Searching ${expanded.length + 1} queries...${c.reset}\n`);
			
 
				+          },
			
 
				+          onRerankStart: (chunkCount) => {
			
 
				+            process.stderr.write(`${c.dim}Reranking ${chunkCount} chunks...${c.reset}\n`);
			
 
				+            progress.indeterminate();
			
 
				+          },
			
 
				+          onRerankDone: () => {
			
 
				+            progress.clear();
			
 
				+          },
			
 
				         },
			
 
				-      },
			
 
				-    });
			
 
				+      });
			
 
				+    }
			
 
				 
			
 
				     // Post-filter for multi-collection
			
 
				     if (collectionNames.length > 1) {
			
@@ -2097,6 +2192,11 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
 
				       return;
			
 
				     }
			
 
				 
			
 
				+    // Use first lex/vec query for output context, or original query
			
 
				+    const displayQuery = structuredQueries
			
 
				+      ? (structuredQueries.find(s => s.type === 'lex')?.query || structuredQueries.find(s => s.type === 'vec')?.query || query)
			
 
				+      : query;
			
 
				+
			
 
				     // Map to CLI output format — use bestChunk for snippet display
			
 
				     outputResults(results.map(r => ({
			
 
				       file: r.file,
			
@@ -2107,7 +2207,7 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
 
				       score: r.score,
			
 
				       context: r.context,
			
 
				       docid: r.docid,
			
 
				-    })), query, { ...opts, limit: results.length });
			
 
				+    })), displayQuery, { ...opts, limit: results.length });
			
 
				   }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
			
 
				 }
			
 
				 
			
@@ -2214,6 +2314,7 @@ function showHelp(): void {
 
				   console.log("  qmd embed [-f]                - Create vector embeddings (900 tokens/chunk, 15% overlap)");
			
 
				   console.log("  qmd cleanup                   - Remove cache and orphaned data, vacuum DB");
			
 
				   console.log("  qmd query <query>             - Search with query expansion + reranking (recommended)");
			
 
				+  console.log("  qmd query 'lex:..\\nvec:...'   - Structured search (you provide lex/vec/hyde queries)");
			
 
				   console.log("  qmd search <query>            - Full-text keyword search (BM25, no LLM)");
			
 
				   console.log("  qmd vsearch <query>           - Vector similarity search (no reranking)");
			
 
				   console.log("  qmd mcp                       - Start MCP server (stdio transport)");
			
@@ -2237,6 +2338,13 @@ function showHelp(): void {
 
				   console.log("  --xml                      - XML output");
			
 
				   console.log("  -c, --collection <name>    - Filter results to a specific collection");
			
 
				   console.log("");
			
 
				+  console.log("Structured queries (qmd query):");
			
 
				+  console.log("  Prefix lines with lex:, vec:, or hyde: to skip automatic expansion.");
			
 
				+  console.log("  lex:  BM25 keyword search (exact terms)");
			
 
				+  console.log("  vec:  Vector similarity (natural language question)");
			
 
				+  console.log("  hyde: Vector similarity (hypothetical answer passage)");
			
 
				+  console.log("  Example: qmd query $'lex: CAP theorem\\nvec: consistency vs availability tradeoff'");
			
 
				+  console.log("");
			
 
				   console.log("Multi-get options:");
			
 
				   console.log("  -l <num>                   - Maximum lines per file");
			
 
				   console.log("  --max-bytes <num>          - Skip files larger than N bytes (default: 10240)");