Przeglądaj źródła

fix: handle dense content (code) that tokenizes to more than expected

The 4 chars/token estimate is accurate for prose, but code can be as dense as
1.7-2 chars/token. This caused chunks to exceed the embedding
model's 2048-token context limit.

- Use 3 chars/token as initial estimate (balanced for mixed content)
- Add safety net: re-chunk any chunks that still exceed token limit
- Use actual chars/token ratio when re-chunking for accuracy

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Tobi Lutke 3 miesięcy temu
rodzic
commit
31dd977c32
1 zmienionych plików z 32 dodań i 17 usunięć
  1. 32 17
      src/store.ts

+ 32 - 17
src/store.ts

@@ -1453,29 +1453,44 @@ export async function chunkDocumentByTokens(
 ): Promise<{ text: string; pos: number; tokens: number }[]> {
   const llm = getDefaultLlamaCpp();
 
-  // Convert token params to character params (~4 chars per token)
-  const avgCharsPerToken = 4;
+  // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
+  // If chunks exceed limit, they'll be re-split with actual ratio
+  const avgCharsPerToken = 3;
   const maxChars = maxTokens * avgCharsPerToken;
   const overlapChars = overlapTokens * avgCharsPerToken;
   const windowChars = windowTokens * avgCharsPerToken;
 
-  // Chunk entirely in character space
-  const charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
+  // Chunk in character space with conservative estimate
+  let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
 
-  // Batch tokenize: get token counts for all chunks
-  const tokenCounts = await Promise.all(
-    charChunks.map(async (chunk) => {
-      const tokens = await llm.tokenize(chunk.text);
-      return tokens.length;
-    })
-  );
+  // Tokenize and split any chunks that still exceed limit
+  const results: { text: string; pos: number; tokens: number }[] = [];
+
+  for (const chunk of charChunks) {
+    const tokens = await llm.tokenize(chunk.text);
+
+    if (tokens.length <= maxTokens) {
+      results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
+    } else {
+      // Chunk is still too large - split it further
+      // Use actual token count to estimate better char limit
+      const actualCharsPerToken = chunk.text.length / tokens.length;
+      const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
+
+      const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
+
+      for (const subChunk of subChunks) {
+        const subTokens = await llm.tokenize(subChunk.text);
+        results.push({
+          text: subChunk.text,
+          pos: chunk.pos + subChunk.pos,
+          tokens: subTokens.length,
+        });
+      }
+    }
+  }
 
-  // Combine chunks with their token counts
-  return charChunks.map((chunk, i) => ({
-    text: chunk.text,
-    pos: chunk.pos,
-    tokens: tokenCounts[i]!,
-  }));
+  return results;
 }
 
 // =============================================================================