Просмотр исходного кода

Merge pull request #453 from builderjarvis/fix/rerank-context-size

fix: increase RERANK_CONTEXT_SIZE default 2048→4096, configurable via env var, fix template overhead underestimate
Tobias Lütke 1 месяц назад
Родитель
Коммит
616776ebdd
1 изменённый файл: 14 добавлено и 5 удалено
  1. 14 5
      src/llm.ts

+ 14 - 5
src/llm.ts

@@ -759,9 +759,16 @@ export class LlamaCpp implements LLM {
    * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
    */
   // Qwen3 reranker template adds ~350 tokens of overhead (system prompt, tags, etc.)
-  // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
-  // Use 2048 for safety margin. Still 17× less than auto (40960).
-  private static readonly RERANK_CONTEXT_SIZE = 2048;
+  // The previous default of 2048 was too small for longer documents (e.g.
+  // session transcripts, CJK text, or large markdown files) — callers hit
+  // "input lengths exceed context size" errors even after truncation because
+  // the template overhead estimate was too low. 4096 comfortably fits the
+  // largest real-world chunks while staying well below the 40960-token auto size.
+  // Override with the QMD_RERANK_CONTEXT_SIZE env var if you need more headroom.
+  private static readonly RERANK_CONTEXT_SIZE: number = (() => {
+    const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
+    return Number.isFinite(v) && v > 0 ? v : 4096;
+  })();
   private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
     if (this.rerankContexts.length === 0) {
       const model = await this.ensureRerankModel();
@@ -1101,8 +1108,10 @@ export class LlamaCpp implements LLM {
     }
   }
 
-  // Qwen3 reranker chat template overhead (system prompt, tags, separators)
-  private static readonly RERANK_TEMPLATE_OVERHEAD = 200;
+  // Qwen3 reranker chat template overhead (system prompt, tags, separators).
+  // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so
+  // the truncation budget never lets a document slip past the context limit.
+  private static readonly RERANK_TEMPLATE_OVERHEAD = 512;
   private static readonly RERANK_TARGET_DOCS_PER_CONTEXT = 10;
 
   async rerank(