Jelajahi Sumber

Improve embed: truncate large docs, better error messages

- Truncate documents > 64KB with warning showing filenames
- Show document title in error messages instead of hash
- Format total time as "15m 4s" instead of "904.2s"

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Tobi Lutke 5 bulan lalu
induk
melakukan
46010e6342
1 mengubah file dengan 33 tambahan dan 8 penghapusan
  1. 33 8
      qmd.ts

+ 33 - 8
qmd.ts

@@ -678,11 +678,27 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
   }
 
   // Calculate total bytes for accurate progress tracking, skip empty files
+  // Truncate documents larger than 64KB
+  const MAX_EMBED_BYTES = 64 * 1024;
+  const truncated: string[] = [];
+
   const itemsWithSize = hashesToEmbed
-    .map(item => ({
-      ...item,
-      bytes: new TextEncoder().encode(item.body).length
-    }))
+    .map(item => {
+      const originalBytes = new TextEncoder().encode(item.body).length;
+      let body = item.body;
+      if (originalBytes > MAX_EMBED_BYTES) {
+        // Truncate to MAX_EMBED_BYTES
+        const encoder = new TextEncoder();
+        const decoder = new TextDecoder();
+        body = decoder.decode(encoder.encode(item.body).slice(0, MAX_EMBED_BYTES));
+        truncated.push(item.title);
+      }
+      return {
+        ...item,
+        body,
+        bytes: new TextEncoder().encode(body).length
+      };
+    })
     .filter(item => item.bytes > 0);  // Skip empty documents
 
   if (itemsWithSize.length === 0) {
@@ -699,6 +715,15 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
   if (skipped > 0) {
     console.log(`${c.dim}Skipped ${skipped} empty documents${c.reset}`);
   }
+  if (truncated.length > 0) {
+    console.log(`${c.yellow}⚠ Truncated ${truncated.length} large documents to 64KB:${c.reset}`);
+    for (const title of truncated.slice(0, 5)) {
+      console.log(`${c.dim}  - ${title}${c.reset}`);
+    }
+    if (truncated.length > 5) {
+      console.log(`${c.dim}  ... and ${truncated.length - 5} more${c.reset}`);
+    }
+  }
   console.log(`${c.dim}Model: ${model}${c.reset}\n`);
 
   progress.indeterminate();
@@ -729,7 +754,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
       errors++;
       bytesProcessed += item.bytes;
       progress.error();
-      console.error(`\n${c.yellow}⚠ Error embedding ${item.hash.slice(0, 8)}...: ${err}${c.reset}`);
+      console.error(`\n${c.yellow}⚠ Error embedding "${item.title}": ${err}${c.reset}`);
     }
 
     const processed = embedded + errors;
@@ -751,11 +776,11 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
   }
 
   progress.clear();
-  const totalTime = ((Date.now() - startTime) / 1000).toFixed(1);
-  const avgThroughput = formatBytes(totalBytes / parseFloat(totalTime));
+  const totalTimeSec = (Date.now() - startTime) / 1000;
+  const avgThroughput = formatBytes(totalBytes / totalTimeSec);
 
   console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset}                                    `);
-  console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${embedded}${c.reset} documents in ${c.bold}${totalTime}s${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
+  console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${embedded}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
   if (errors > 0) {
     console.log(`${c.yellow}⚠ ${errors} documents failed${c.reset}`);
   }