Pārlūkot izejas kodu

Improve embed progress bar with byte-based ETA

- Visual progress bar with filled/empty blocks
- Calculate ETA based on bytes processed (larger files = longer time)
- Show throughput in bytes/sec
- Skip empty documents
- Fix UNIQUE constraint with INSERT OR REPLACE

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Tobi Lutke 5 mēneši atpakaļ
vecāks
revīzija
6c7e2911a2
1 mainīts fails ar 65 papildinājumiem un 21 dzēšanu
  1. 65 21
      qmd.ts

+ 65 - 21
qmd.ts

@@ -645,13 +645,20 @@ async function indexFiles(globPattern: string = DEFAULT_GLOB): Promise<void> {
   db.close();
 }
 
/**
 * Render a fixed-width text progress bar built from filled (█) and empty (░) cells.
 *
 * @param percent - Completion percentage. Values outside 0–100 are clamped to that
 *   range and NaN is treated as 0, so the bar can never over- or under-flow.
 * @param width - Total number of character cells in the bar (default 30). Negative
 *   widths are treated as 0 and fractional widths are floored.
 * @returns The bar as a string of filled cells followed by empty cells.
 */
function renderProgressBar(percent: number, width: number = 30): string {
  // String.prototype.repeat throws RangeError for a negative count, so both
  // inputs are normalised before the cell counts are derived from them.
  const cells = Math.max(0, Math.floor(width));
  const clamped = Number.isNaN(percent) ? 0 : Math.min(100, Math.max(0, percent));
  const filled = Math.round((clamped / 100) * cells);
  return "█".repeat(filled) + "░".repeat(cells - filled);
}
+
 async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = false): Promise<void> {
   const db = getDb();
   const now = new Date().toISOString();
 
   // If force, clear all vectors
   if (force) {
-    console.log("Force re-indexing: clearing all vectors...");
+    console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
     db.exec(`DELETE FROM content_vectors`);
     db.exec(`DROP TABLE IF EXISTS vectors_vec`);
   }
@@ -665,56 +672,93 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
   `).all() as { hash: string; title: string; body: string }[];
 
   if (hashesToEmbed.length === 0) {
-    console.log("All content hashes already have embeddings.");
+    console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
+    db.close();
+    return;
+  }
+
+  // Calculate total bytes for accurate progress tracking, skip empty files
+  const itemsWithSize = hashesToEmbed
+    .map(item => ({
+      ...item,
+      bytes: new TextEncoder().encode(item.body).length
+    }))
+    .filter(item => item.bytes > 0);  // Skip empty documents
+
+  if (itemsWithSize.length === 0) {
+    console.log(`${c.green}✓ No non-empty documents to embed.${c.reset}`);
     db.close();
     return;
   }
 
-  const total = hashesToEmbed.length;
-  console.log(`Embedding ${total} unique content hashes with ${model}...`);
+  const totalBytes = itemsWithSize.reduce((sum, item) => sum + item.bytes, 0);
+  const total = itemsWithSize.length;
+  const skipped = hashesToEmbed.length - total;
+
+  console.log(`${c.bold}Embedding ${total} documents${c.reset} ${c.dim}(${formatBytes(totalBytes)})${c.reset}`);
+  if (skipped > 0) {
+    console.log(`${c.dim}Skipped ${skipped} empty documents${c.reset}`);
+  }
+  console.log(`${c.dim}Model: ${model}${c.reset}\n`);
 
   progress.indeterminate();
-  const firstEmbedding = await getEmbedding(hashesToEmbed[0].body, model, false, hashesToEmbed[0].title);
-  console.log(`Embedding dimensions: ${firstEmbedding.length}`);
+  const firstEmbedding = await getEmbedding(itemsWithSize[0].body, model, false, itemsWithSize[0].title);
   ensureVecTable(db, firstEmbedding.length);
 
-  const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash, embedding) VALUES (?, ?)`);
+  const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash, embedding) VALUES (?, ?)`);
   const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, model, embedded_at) VALUES (?, ?, ?)`);
 
-  let embedded = 0, errors = 0;
+  let embedded = 0, errors = 0, bytesProcessed = 0;
   const startTime = Date.now();
 
   // Insert first
-  insertVecStmt.run(hashesToEmbed[0].hash, new Float32Array(firstEmbedding));
-  insertContentVectorStmt.run(hashesToEmbed[0].hash, model, now);
+  insertVecStmt.run(itemsWithSize[0].hash, new Float32Array(firstEmbedding));
+  insertContentVectorStmt.run(itemsWithSize[0].hash, model, now);
   embedded++;
-  progress.set((embedded / total) * 100);
-  process.stderr.write(`\rEmbedding: ${embedded}/${total}`);
+  bytesProcessed += itemsWithSize[0].bytes;
 
-  for (let i = 1; i < hashesToEmbed.length; i++) {
-    const item = hashesToEmbed[i];
+  for (let i = 1; i < itemsWithSize.length; i++) {
+    const item = itemsWithSize[i];
     try {
       const embedding = await getEmbedding(item.body, model, false, item.title);
       insertVecStmt.run(item.hash, new Float32Array(embedding));
       insertContentVectorStmt.run(item.hash, model, now);
       embedded++;
+      bytesProcessed += item.bytes;
     } catch (err) {
       errors++;
+      bytesProcessed += item.bytes;
       progress.error();
-      console.error(`\nError embedding hash ${item.hash.slice(0, 8)}...: ${err}`);
+      console.error(`\n${c.yellow}⚠ Error embedding ${item.hash.slice(0, 8)}...: ${err}${c.reset}`);
     }
+
     const processed = embedded + errors;
-    progress.set((processed / total) * 100);
+    const percent = (bytesProcessed / totalBytes) * 100;
+    progress.set(percent);
+
     const elapsed = (Date.now() - startTime) / 1000;
-    const rate = processed / elapsed;
-    const remaining = (total - processed) / rate;
-    const eta = processed > 2 ? ` ETA: ${formatETA(remaining)}` : "";
-    process.stderr.write(`\rEmbedding: ${embedded}/${total}${errors > 0 ? ` (${errors} errors)` : ""}${eta}        `);
+    const bytesPerSec = bytesProcessed / elapsed;
+    const remainingBytes = totalBytes - bytesProcessed;
+    const etaSec = remainingBytes / bytesPerSec;
+
+    const bar = renderProgressBar(percent);
+    const percentStr = percent.toFixed(0).padStart(3);
+    const throughput = `${formatBytes(bytesPerSec)}/s`;
+    const eta = elapsed > 2 ? formatETA(etaSec) : "...";
+    const errStr = errors > 0 ? ` ${c.yellow}${errors} err${c.reset}` : "";
+
+    process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${embedded}/${total}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset}   `);
   }
 
   progress.clear();
   const totalTime = ((Date.now() - startTime) / 1000).toFixed(1);
-  console.log(`\nDone! Embedded ${embedded} hashes${errors > 0 ? `, ${errors} errors` : ""} in ${totalTime}s.`);
+  const avgThroughput = formatBytes(totalBytes / parseFloat(totalTime));
+
+  console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset}                                    `);
+  console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${embedded}${c.reset} documents in ${c.bold}${totalTime}s${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`);
+  if (errors > 0) {
+    console.log(`${c.yellow}⚠ ${errors} documents failed${c.reset}`);
+  }
   db.close();
 }