3 месяцев назад · 616776ebdd
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -759,9 +759,16 @@ export class LlamaCpp implements LLM {
 
															    * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
														
 
															    */
														
 
															   // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
														
 
															-  // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
														
 
															-  // Use 2048 for safety margin. Still 17× less than auto (40960).
														
 
															-  private static readonly RERANK_CONTEXT_SIZE = 2048;
														
 
															+  // Default 2048 was too small for longer documents (e.g. session transcripts,
														
 
															+  // CJK text, or large markdown files) — callers hit "input lengths exceed
														
 
															+  // context size" errors even after truncation because the overhead estimate
														
 
															+  // was insufficient.  4096 comfortably fits the largest real-world chunks
														
 
															+  // while staying well below the 40 960-token auto size.
														
 
															+  // Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
														
 
															+  private static readonly RERANK_CONTEXT_SIZE: number = (() => {
														
 
															+    const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
														
 
															+    return Number.isFinite(v) && v > 0 ? v : 4096;
														
 
															+  })();
														
 
															   private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
														
 
															     if (this.rerankContexts.length === 0) {
														
 
															       const model = await this.ensureRerankModel();
														
@@ -1101,8 +1108,10 @@ export class LlamaCpp implements LLM {
 
															     }
														
 
															   }
														
 
															-  // Qwen3 reranker chat template overhead (system prompt, tags, separators)
														
 
															-  private static readonly RERANK_TEMPLATE_OVERHEAD = 200;
														
 
															+  // Qwen3 reranker chat template overhead (system prompt, tags, separators).
														
 
															+  // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
														
 
															+  // the truncation budget never lets a document slip past the context limit.
														
 
															+  private static readonly RERANK_TEMPLATE_OVERHEAD = 512;
														
 
															   private static readonly RERANK_TARGET_DOCS_PER_CONTEXT = 10;
														
 
															   async rerank(