4 mesiacov pred · 5233e676d9
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -1022,6 +1022,9 @@ export class LlamaCpp implements LLM {
 
				     }
			
 
				   }
			
 
				 
			
 
				+  // Qwen3 reranker chat template overhead (system prompt, tags, separators)
			
 
				+  private static readonly RERANK_TEMPLATE_OVERHEAD = 200;
			
 
				+
			
 
				   async rerank(
			
 
				     query: string,
			
 
				     documents: RerankDocument[],
			
@@ -1031,15 +1034,28 @@ export class LlamaCpp implements LLM {
 
				     this.touchActivity();
			
 
				 
			
 
				     const contexts = await this.ensureRerankContexts();
			
 
				+    const model = await this.ensureRerankModel();
			
 
				+
			
 
				+    // Truncate documents that would exceed the rerank context size.
			
 
				+    // Budget = contextSize - template overhead - query tokens
			
 
				+    const queryTokens = model.tokenize(query).length;
			
 
				+    const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
			
 
				+
			
 
				+    const truncatedDocs = documents.map((doc) => {
			
 
				+      const tokens = model.tokenize(doc.text);
			
 
				+      if (tokens.length <= maxDocTokens) return doc;
			
 
				+      const truncatedText = model.detokenize(tokens.slice(0, maxDocTokens));
			
 
				+      return { ...doc, text: truncatedText };
			
 
				+    });
			
 
				 
			
 
				     // Build a map from document text to original indices (for lookup after sorting)
			
 
				     const textToDoc = new Map<string, { file: string; index: number }>();
			
 
				-    documents.forEach((doc, index) => {
			
 
				+    truncatedDocs.forEach((doc, index) => {
			
 
				       textToDoc.set(doc.text, { file: doc.file, index });
			
 
				     });
			
 
				 
			
 
				     // Extract just the text for ranking
			
 
				-    const texts = documents.map((doc) => doc.text);
			
 
				+    const texts = truncatedDocs.map((doc) => doc.text);
			
 
				 
			
 
				     // Split documents across contexts for parallel evaluation.
			
 
				     // Each context has its own sequence with a lock, so parallelism comes
			
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -365,6 +365,45 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
 
				       // Log timing for monitoring batch performance
			
 
				       console.log(`Batch rerank of 10 docs took ${elapsed}ms`);
			
 
				     });
			
 
				+
			
 
				+    test("truncates and reranks document exceeding 2048 token context size", async () => {
			
 
				+      // The reranker context is created with contextSize=2048. Documents that
			
 
				+      // exceed the token budget (contextSize - template overhead - query tokens)
			
 
				+      // should be silently truncated rather than crashing.
			
 
				+      const paragraph = "The quick brown fox jumps over the lazy dog near the riverbank. " +
			
 
				+        "Authentication tokens must be validated on every request to ensure security. " +
			
 
				+        "Database queries should use prepared statements to prevent SQL injection attacks. " +
			
 
				+        "The deployment pipeline includes linting, testing, building, and publishing stages. ";
			
 
				+      // ~320 chars per paragraph, repeat 40 times = ~12800 chars ≈ 3200 tokens
			
 
				+      const longText = paragraph.repeat(40);
			
 
				+
			
 
				+      const query = "How do I configure authentication?";
			
 
				+      const documents: RerankDocument[] = [
			
 
				+        { file: "short-relevant.md", text: "Authentication can be configured by setting AUTH_SECRET." },
			
 
				+        { file: "long-doc.md", text: longText },
			
 
				+        { file: "short-irrelevant.md", text: "The weather is sunny today." },
			
 
				+      ];
			
 
				+
			
 
				+      console.log(`Long doc length: ${longText.length} chars (~${Math.round(longText.length / 4)} tokens)`);
			
 
				+
			
 
				+      const result = await llm.rerank(query, documents);
			
 
				+
			
 
				+      // Should return all 3 documents without crashing
			
 
				+      expect(result.results).toHaveLength(3);
			
 
				+
			
 
				+      // All scores should be valid numbers in [0, 1]
			
 
				+      for (const doc of result.results) {
			
 
				+        expect(doc.score).toBeGreaterThanOrEqual(0);
			
 
				+        expect(doc.score).toBeLessThanOrEqual(1);
			
 
				+        expect(Number.isNaN(doc.score)).toBe(false);
			
 
				+      }
			
 
				+
			
 
				+      // The short, directly relevant doc should still rank highest
			
 
				+      console.log("Rerank results for long doc test:");
			
 
				+      for (const doc of result.results) {
			
 
				+        console.log(`  ${doc.file}: ${doc.score.toFixed(4)}`);
			
 
				+      }
			
 
				+    });
			
 
				   });
			
 
				 
			
 
				   describe("expandQuery", () => {