6 месяцев назад · fd24df81c9
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -21,7 +21,7 @@
 
															 {"id":"qmd-clr","title":"fix embed","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T16:14:55.292114-05:00","updated_at":"2025-12-12T16:31:27.661829-05:00","closed_at":"2025-12-12T16:31:27.661829-05:00"}
														
 
															 {"id":"qmd-d00","title":"Add offline evaluation harness for tuning","description":"Create a small benchmark with ~100 labeled queries from real searches. Would enable tuning: expansion on/off threshold, candidate count (30 vs 100), blending weights, reranker threshold.","status":"open","priority":3,"issue_type":"feature","created_at":"2025-12-20T17:18:42.007265-05:00","updated_at":"2025-12-20T17:18:42.007265-05:00"}
														
 
															 {"id":"qmd-deh","title":"Refactor database introduce qmd collection *","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:56:04.516137-05:00","updated_at":"2025-12-12T16:12:12.349428-05:00","closed_at":"2025-12-12T16:12:12.349428-05:00"}
														
 
															-{"id":"qmd-df5","title":"Rerank multiple chunks per document with score aggregation","description":"Currently we only rerank 1 chunk per doc (selected by keyword heuristic). Should rerank top 2-3 chunks per document, then aggregate scores (max, softmax, or top-2 average). This improves ranking for long documents where the keyword-matched chunk isn't always the most relevant.","status":"open","priority":2,"issue_type":"feature","created_at":"2025-12-20T17:18:41.592575-05:00","updated_at":"2025-12-20T17:18:41.592575-05:00"}
														
 
															+{"id":"qmd-df5","title":"Rerank multiple chunks per document with score aggregation","description":"Currently we only rerank 1 chunk per doc (selected by keyword heuristic). Should rerank top 2-3 chunks per document, then aggregate scores (max, softmax, or top-2 average). This improves ranking for long documents where the keyword-matched chunk isn't always the most relevant.","status":"in_progress","priority":2,"issue_type":"feature","created_at":"2025-12-20T17:18:41.592575-05:00","updated_at":"2025-12-21T12:02:56.013748-05:00"}
														
 
															 {"id":"qmd-dmi","title":"Implement 'qmd collection' commands","description":"Add explicit collection management:\n- qmd collection add . --name \u003cname\u003e --mask '**/*.md'\n- qmd collection list\n- qmd collection remove \u003cname\u003e\n\nThis gives users control over collection names and patterns.","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.810666-05:00","updated_at":"2025-12-12T16:02:08.079158-05:00","closed_at":"2025-12-12T16:02:08.079158-05:00","dependencies":[{"issue_id":"qmd-dmi","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.811294-05:00","created_by":"daemon"}]}
														
 
															 {"id":"qmd-dt1","title":"Redesign context add command for better usability","description":"Current issues: \n1. Virtual path qmd://journals/ is rejected as invalid\n2. Syntax is confusing - sometimes path is first arg, sometimes second\n3. Need to support collection root context (qmd://name/)\n4. Should be intuitive: qmd context add \u003cwhere\u003e \u003cwhat\u003e\nDesign goals:\n- Support qmd://collection/ for collection root context\n- Support qmd://collection/path for path-specific context\n- Clear, consistent syntax\n- Good error messages","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-13T09:39:19.764114-05:00","updated_at":"2025-12-13T09:41:38.467861-05:00","closed_at":"2025-12-13T09:41:38.467861-05:00"}
														
 
															 {"id":"qmd-e2c","title":"Implement 'qmd ls' command","description":"Add command to explore virtual file tree:\n- qmd ls → list all collections\n- qmd ls \u003ccollection\u003e → list files in collection\n- qmd ls \u003ccollection\u003e/\u003cpath\u003e → list files under path\nOutput: flat list of qmd:// paths","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.859804-05:00","updated_at":"2025-12-12T15:55:12.777701-05:00","closed_at":"2025-12-12T15:55:12.777701-05:00","dependencies":[{"issue_id":"qmd-e2c","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.860535-05:00","created_by":"daemon"}]}
														
--- a/src/qmd.ts
+++ b/src/qmd.ts
@@ -2089,50 +2089,80 @@ async function querySearch(query: string, opts: OutputOptions, embedModel: strin
 
															     return;
														
 
															   }
														
 
															-  // Rerank chunks, not full documents
														
 
															-  // For each candidate, extract the most relevant chunk to rerank
														
 
															+  // Rerank multiple chunks per document, then aggregate scores
														
 
															+  // This improves ranking for long documents where keyword-matched chunk isn't always best
														
 
															+  const MAX_CHUNKS_PER_DOC = 3;
														
 
															   const chunksToRerank: { file: string; text: string; chunkIdx: number }[] = [];
														
 
															-  const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; bestChunkIdx: number }>();
														
 
															+  const docChunkMap = new Map<string, { chunks: { text: string; pos: number }[]; selectedIndices: number[] }>();
														
 
															   for (const c of candidates) {
														
 
															     const chunks = chunkDocument(c.body);
														
 
															-    if (chunks.length === 1) {
														
 
															-      // Small document - use entire body
														
 
															-      chunksToRerank.push({ file: c.file, text: chunks[0].text, chunkIdx: 0 });
														
 
															-      docChunkMap.set(c.file, { chunks, bestChunkIdx: 0 });
														
 
															+    if (chunks.length <= MAX_CHUNKS_PER_DOC) {
														
 
															+      // Small document - rerank all chunks
														
 
															+      for (let i = 0; i < chunks.length; i++) {
														
 
															+        chunksToRerank.push({ file: c.file, text: chunks[i].text, chunkIdx: i });
														
 
															+      }
														
 
															+      docChunkMap.set(c.file, { chunks, selectedIndices: chunks.map((_, i) => i) });
														
 
															     } else {
														
 
															-      // Find the chunk that best matches the query terms (simple keyword heuristic)
														
 
															+      // Score all chunks by keyword match, select top MAX_CHUNKS_PER_DOC
														
 
															       const queryTerms = query.toLowerCase().split(/\s+/).filter(t => t.length > 2);
														
 
															-      let bestIdx = 0;
														
 
															-      let bestScore = 0;
														
 
															-      for (let i = 0; i < chunks.length; i++) {
														
 
															-        const chunkLower = chunks[i].text.toLowerCase();
														
 
															+      const scored = chunks.map((chunk, idx) => {
														
 
															+        const chunkLower = chunk.text.toLowerCase();
														
 
															         const score = queryTerms.reduce((acc, term) => acc + (chunkLower.includes(term) ? 1 : 0), 0);
														
 
															-        if (score > bestScore) {
														
 
															-          bestScore = score;
														
 
															-          bestIdx = i;
														
 
															-        }
														
 
															+        return { idx, score };
														
 
															+      });
														
 
															+      scored.sort((a, b) => b.score - a.score);
														
 
															+      const selectedIndices = scored.slice(0, MAX_CHUNKS_PER_DOC).map(s => s.idx);
														
 
															+
														
 
															+      for (const idx of selectedIndices) {
														
 
															+        chunksToRerank.push({ file: c.file, text: chunks[idx].text, chunkIdx: idx });
														
 
															       }
														
 
															-      chunksToRerank.push({ file: c.file, text: chunks[bestIdx].text, chunkIdx: bestIdx });
														
 
															-      docChunkMap.set(c.file, { chunks, bestChunkIdx: bestIdx });
														
 
															+      docChunkMap.set(c.file, { chunks, selectedIndices });
														
 
															     }
														
 
															   }
														
 
															-  // Rerank the focused chunks (with caching)
														
 
															+  // Rerank all selected chunks (with caching)
														
 
															+  // Use file:chunkIdx as unique identifier for reranker
														
 
															   const reranked = await rerank(
														
 
															     query,
														
 
															-    chunksToRerank.map(c => ({ file: c.file, text: c.text })),
														
 
															+    chunksToRerank.map(c => ({ file: `${c.file}:${c.chunkIdx}`, text: c.text })),
														
 
															     rerankModel,
														
 
															     db
														
 
															   );
														
 
															-  // Blend RRF position score with reranker score using position-aware weights
														
 
															+  // Aggregate chunk scores back to document level using top-2 average
														
 
															+  // (or max if only 1 chunk) - this balances best chunk with consistency
														
 
															+  const docScores = new Map<string, { scores: number[]; bestChunkIdx: number }>();
														
 
															+  for (const r of reranked) {
														
 
															+    const [file, chunkIdxStr] = r.file.split(/:(\d+)$/);
														
 
															+    const chunkIdx = parseInt(chunkIdxStr || "0");
														
 
															+    const existing = docScores.get(file);
														
 
															+    if (existing) {
														
 
															+      existing.scores.push(r.score);
														
 
															+      if (r.score > (existing.scores[0] || 0)) {
														
 
															+        existing.bestChunkIdx = chunkIdx;
														
 
															+      }
														
 
															+    } else {
														
 
															+      docScores.set(file, { scores: [r.score], bestChunkIdx: chunkIdx });
														
 
															+    }
														
 
															+  }
														
 
															+
														
 
															+  // Compute aggregated score: top-2 average (rewards consistency across chunks)
														
 
															+  const aggregatedScores = new Map<string, { score: number; bestChunkIdx: number }>();
														
 
															+  for (const [file, { scores, bestChunkIdx }] of docScores) {
														
 
															+    scores.sort((a, b) => b - a);
														
 
															+    const topScores = scores.slice(0, 2);
														
 
															+    const avgScore = topScores.reduce((a, b) => a + b, 0) / topScores.length;
														
 
															+    aggregatedScores.set(file, { score: avgScore, bestChunkIdx });
														
 
															+  }
														
 
															+
														
 
															+  // Blend RRF position score with aggregated reranker score using position-aware weights
														
 
															   // Top retrieval results get more protection from reranker disagreement
														
 
															   const candidateMap = new Map(candidates.map(c => [c.file, { displayPath: c.displayPath, title: c.title, body: c.body }]));
														
 
															   const rrfRankMap = new Map(candidates.map((c, i) => [c.file, i + 1])); // 1-indexed rank
														
 
															-  const finalResults = reranked.map(r => {
														
 
															-    const rrfRank = rrfRankMap.get(r.file) || 30;
														
 
															+  const finalResults = Array.from(aggregatedScores.entries()).map(([file, { score: rerankScore, bestChunkIdx }]) => {
														
 
															+    const rrfRank = rrfRankMap.get(file) || 30;
														
 
															     // Position-aware blending: top retrieval results preserved more
														
 
															     // Rank 1-3: 75% RRF, 25% reranker (trust retrieval for exact matches)
														
 
															     // Rank 4-10: 60% RRF, 40% reranker
														
@@ -2146,21 +2176,21 @@ async function querySearch(query: string, opts: OutputOptions, embedModel: strin
 
															       rrfWeight = 0.40;
														
 
															     }
														
 
															     const rrfScore = 1 / rrfRank;  // Position-based: 1, 0.5, 0.33...
														
 
															-    const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * r.score;
														
 
															-    const candidate = candidateMap.get(r.file);
														
 
															-    // Use the best chunk's text for the body (better for snippets)
														
 
															-    const chunkInfo = docChunkMap.get(r.file);
														
 
															-    const chunkBody = chunkInfo ? chunkInfo.chunks[chunkInfo.bestChunkIdx].text : candidate?.body || "";
														
 
															-    const chunkPos = chunkInfo ? chunkInfo.chunks[chunkInfo.bestChunkIdx].pos : 0;
														
 
															+    const blendedScore = rrfWeight * rrfScore + (1 - rrfWeight) * rerankScore;
														
 
															+    const candidate = candidateMap.get(file);
														
 
															+    // Use the best-scoring chunk's text for the body (better for snippets)
														
 
															+    const chunkInfo = docChunkMap.get(file);
														
 
															+    const chunkBody = chunkInfo ? chunkInfo.chunks[bestChunkIdx]?.text || chunkInfo.chunks[0].text : candidate?.body || "";
														
 
															+    const chunkPos = chunkInfo ? chunkInfo.chunks[bestChunkIdx]?.pos || 0 : 0;
														
 
															     return {
														
 
															-      file: r.file,
														
 
															+      file,
														
 
															       displayPath: candidate?.displayPath || "",
														
 
															       title: candidate?.title || "",
														
 
															       body: chunkBody,
														
 
															       chunkPos,
														
 
															       score: blendedScore,
														
 
															-      context: getContextForFile(db, r.file),
														
 
															-      hash: hashMap.get(r.file) || "",
														
 
															+      context: getContextForFile(db, file),
														
 
															+      hash: hashMap.get(file) || "",
														
 
															     };
														
 
															   }).sort((a, b) => b.score - a.score);