|
|
@@ -0,0 +1,319 @@
|
|
|
+#!/usr/bin/env bun
|
|
|
+/**
|
|
|
+ * QMD Reranker Benchmark
|
|
|
+ *
|
|
|
+ * Measures reranking performance across different configurations.
|
|
|
+ * Reports device, parallelism, memory, VRAM, and throughput.
|
|
|
+ *
|
|
|
+ * Usage:
|
|
|
+ * bun src/bench-rerank.ts # full benchmark
|
|
|
+ * bun src/bench-rerank.ts --quick # quick smoke test (10 docs, 1 iteration)
|
|
|
+ * bun src/bench-rerank.ts --docs 100 # custom doc count
|
|
|
+ */
|
|
|
+
|
|
|
+import {
|
|
|
+ getLlama,
|
|
|
+ getLlamaGpuTypes,
|
|
|
+ resolveModelFile,
|
|
|
+ LlamaLogLevel,
|
|
|
+ type Llama,
|
|
|
+ type LlamaModel,
|
|
|
+} from "node-llama-cpp";
|
|
|
+import { homedir } from "os";
|
|
|
+import { join } from "path";
|
|
|
+import { cpus } from "os";
|
|
|
+
|
|
|
+// ============================================================================
|
|
|
+// Config
|
|
|
+// ============================================================================
|
|
|
+
|
|
|
/** Hugging Face URI of the reranker model that is benchmarked. */
const RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
/** Directory handed to `resolveModelFile` for the downloaded model. */
const MODEL_CACHE = join(homedir(), ".cache", "qmd", "models");
/** Context size requested for every ranking context. */
const CONTEXT_SIZE = 2048;

/**
 * Resolves how many documents to benchmark from the CLI arguments.
 *
 * @param argv - Command-line arguments without the runtime and script path.
 * @param fallback - Count to use when `--docs` is not present.
 * @returns The value following `--docs`, or `fallback` when the flag is absent.
 * @throws Error when `--docs` is present but not followed by a positive integer.
 */
function resolveDocCount(argv: readonly string[], fallback: number): number {
  const flagIdx = argv.indexOf("--docs");
  if (flagIdx < 0) return fallback;

  const raw = argv[flagIdx + 1];
  // Only plain decimal digits are accepted; a missing or malformed value would
  // otherwise become NaN and silently produce an empty document set.
  const count = raw !== undefined && /^[0-9]+$/.test(raw) ? Number.parseInt(raw, 10) : NaN;
  if (!Number.isSafeInteger(count) || count < 1) {
    const shown = raw === undefined ? "nothing" : JSON.stringify(raw);
    throw new Error(`--docs expects a positive integer, got ${shown}`);
  }
  return count;
}

const args = process.argv.slice(2);
/** `--quick` selects the smoke-test sizes below. */
const quick = args.includes("--quick");
const DOC_COUNT = resolveDocCount(args, quick ? 10 : 40);
const ITERATIONS = quick ? 1 : 3;
/** Numbers of parallel ranking contexts to benchmark, in run order. */
const PARALLEL_CONFIGS = quick ? [1, 4] : [1, 2, 4, 8];
|
|
|
+
|
|
|
+// ============================================================================
|
|
|
+// Test data — realistic-ish chunks of varying length
|
|
|
+// ============================================================================
|
|
|
+
|
|
|
/** Query that the benchmark ranks every generated document against. */
const QUERY = "How do AI agents work and what are their limitations?";

/**
 * Produces `n` benchmark documents by cycling through a fixed pool of ten
 * passages, so document `i` is passage `i % 10`.
 *
 * @param n - Number of documents to produce.
 * @returns An array of `n` passages in pool order, repeating as needed.
 */
function generateDocs(n: number): string[] {
  const pool: string[] = [
    "Artificial intelligence agents are software systems that perceive their environment and take actions to achieve goals. They use techniques like reinforcement learning, planning, and natural language processing to operate autonomously.",
    "The transformer architecture, introduced in 2017, revolutionized natural language processing. Self-attention mechanisms allow models to weigh the importance of different parts of input sequences when generating outputs.",
    "Machine learning models require careful evaluation to avoid overfitting. Cross-validation, holdout sets, and metrics like precision, recall, and F1 score help assess generalization performance.",
    "Retrieval-augmented generation combines information retrieval with language models. Documents are embedded into vector spaces, retrieved based on query similarity, and used as context for generation.",
    "Neural network training involves forward propagation, loss computation, and backpropagation. Optimizers like Adam and SGD adjust weights to minimize the loss function over training iterations.",
    "Large language models exhibit emergent capabilities at scale, including few-shot learning, chain-of-thought reasoning, and instruction following. These properties were not explicitly trained for.",
    "Embedding models convert text into dense vector representations that capture semantic meaning. Similar texts produce similar vectors, enabling efficient similarity search and clustering.",
    "Autonomous agents face challenges including hallucination, lack of grounding, limited planning horizons, and difficulty with multi-step reasoning. Safety and alignment remain open research problems.",
    "The attention mechanism computes query-key-value interactions to determine which parts of the input are most relevant. Multi-head attention allows the model to attend to different representation subspaces.",
    "Fine-tuning adapts a pre-trained model to specific tasks using domain-specific data. Techniques like LoRA reduce the number of trainable parameters while maintaining performance.",
  ];

  // One slot per requested document; each slot is filled from the pool by index.
  const slots = Array.from({ length: n });
  return slots.map((_slot, idx) => {
    const passage = pool[idx % pool.length];
    return passage!;
  });
}
|
|
|
+
|
|
|
+// ============================================================================
|
|
|
+// Helpers
|
|
|
+// ============================================================================
|
|
|
+
|
|
|
/**
 * Formats a byte count as a short human-readable string.
 *
 * The unit is chosen from the magnitude of the value, so negative inputs
 * (for example a VRAM delta that went down) use the same unit as their
 * positive counterpart instead of always being printed in KB.
 *
 * @param bytes - Byte count; may be negative.
 * @returns The value in KB or MB with one decimal, or in GB with two decimals.
 */
function formatBytes(bytes: number): string {
  const KIB = 1024;
  const MIB = KIB * 1024;
  const GIB = MIB * 1024;

  const magnitude = Math.abs(bytes);
  if (magnitude < MIB) return `${(bytes / KIB).toFixed(1)} KB`;
  if (magnitude < GIB) return `${(bytes / MIB).toFixed(1)} MB`;
  return `${(bytes / GIB).toFixed(2)} GB`;
}
|
|
|
+
|
|
|
/**
 * Takes a snapshot of the current process memory usage.
 *
 * @returns The `rss` and `heapUsed` fields of `process.memoryUsage()`, in bytes.
 */
function getMemUsage(): { rss: number; heapUsed: number } {
  const { rss, heapUsed } = process.memoryUsage();
  return { rss, heapUsed };
}
|
|
|
+
|
|
|
/**
 * Computes the median of a list of numbers without modifying the input.
 *
 * @param arr - Values to summarise.
 * @returns The middle value for an odd count, or the mean of the two middle
 *   values for an even count. An empty list yields `NaN`.
 */
function median(arr: number[]): number {
  const ordered = arr.slice();
  ordered.sort((x, y) => x - y);

  const upper = Math.floor(ordered.length / 2);
  if (ordered.length % 2 !== 0) {
    return ordered[upper]!;
  }
  const lowerValue = ordered[upper - 1]!;
  const upperValue = ordered[upper]!;
  return (lowerValue + upperValue) / 2;
}
|
|
|
+
|
|
|
+// ============================================================================
|
|
|
+// Benchmark runner
|
|
|
+// ============================================================================
|
|
|
+
|
|
|
/** Outcome of benchmarking one parallelism / flash-attention configuration. */
interface BenchResult {
  /** Number of ranking contexts the run actually used. */
  parallelism: number;
  /** Context size requested for each ranking context. */
  contextSize: number;
  /** Flash-attention setting recorded for the run. */
  flashAttention: boolean;
  /** Wall-clock duration of each iteration, in milliseconds. */
  times: number[];
  /** Median of `times`, in milliseconds. */
  medianMs: number;
  /** Documents ranked per second, derived from `medianMs`. */
  docsPerSec: number;
  /** VRAM growth divided by the number of contexts, in bytes. */
  vramPerContext: number;
  /** VRAM growth across context creation, in bytes. */
  totalVram: number;
  /** Highest resident set size sampled during the run, in bytes. */
  peakRss: number;
}
|
|
|
+
|
|
|
/**
 * Benchmarks reranking with a given number of parallel ranking contexts.
 *
 * Creates up to `parallelism` ranking contexts, splits `docs` into contiguous
 * chunks (one per context), ranks all chunks concurrently against `QUERY` for
 * `ITERATIONS` timed iterations, and summarises the timings. Every context
 * that was created is disposed before the function settles, whether it
 * succeeds or throws.
 *
 * @param model - Loaded model used to create the ranking contexts.
 * @param llama - Llama instance, queried for VRAM state when a GPU is in use.
 * @param docs - Documents to rank in every iteration.
 * @param parallelism - Requested number of ranking contexts.
 * @param flash - Whether to request flash attention for the contexts.
 * @returns Timings, throughput and memory figures for the run. `parallelism`
 *   and `flashAttention` describe what was actually used, which may differ
 *   from the request when context creation had to fall back.
 * @throws Error when no context could be created or a score is NaN or outside [0, 1].
 */
async function benchmarkConfig(
  model: LlamaModel,
  llama: Llama,
  docs: string[],
  parallelism: number,
  flash: boolean,
): Promise<BenchResult> {
  type RankingContext = Awaited<ReturnType<LlamaModel["createRankingContext"]>>;

  // VRAM baseline, so the delta below covers only the contexts created here.
  const vramBefore = llama.gpu ? await llama.getVramState() : null;

  const contexts: RankingContext[] = [];
  let flashUsed = flash;

  try {
    for (let i = 0; i < parallelism; i++) {
      try {
        contexts.push(await model.createRankingContext({
          contextSize: CONTEXT_SIZE,
          flashAttention: flash,
        }));
      } catch {
        if (contexts.length === 0) {
          // Not even one context could be created as requested: retry once
          // without the flash option and record that flash is not in effect.
          contexts.push(await model.createRankingContext({ contextSize: CONTEXT_SIZE }));
          flashUsed = false;
        }
        // Otherwise run with however many contexts were created.
        break;
      }
    }

    const actualParallelism = contexts.length;
    if (actualParallelism === 0) {
      throw new Error("No ranking context could be created");
    }

    // VRAM growth attributable to context creation.
    const vramAfter = llama.gpu ? await llama.getVramState() : null;
    const vramUsed = vramBefore && vramAfter ? vramAfter.used - vramBefore.used : 0;
    const vramPerCtx = vramUsed / actualParallelism;

    // Warm up on a couple of documents so the first timed run is not an outlier.
    await contexts[0]!.rankAll(QUERY, docs.slice(0, 2));

    const times: number[] = [];
    let peakRss = getMemUsage().rss;
    // Contiguous chunk per context; trailing contexts may receive no documents.
    const chunkSize = Math.ceil(docs.length / actualParallelism);

    for (let iter = 0; iter < ITERATIONS; iter++) {
      const t0 = performance.now();
      const allScores = await Promise.all(
        contexts.map((ctx, i) => {
          const chunk = docs.slice(i * chunkSize, (i + 1) * chunkSize);
          return chunk.length > 0 ? ctx.rankAll(QUERY, chunk) : Promise.resolve([]);
        })
      );
      times.push(performance.now() - t0);

      // Sanity check outside the timed region: scores must lie in [0, 1].
      const flat = allScores.flat();
      if (flat.some(s => s < 0 || s > 1 || isNaN(s))) {
        throw new Error("Invalid scores detected");
      }

      peakRss = Math.max(peakRss, getMemUsage().rss);
    }

    const med = median(times);
    return {
      parallelism: actualParallelism,
      contextSize: CONTEXT_SIZE,
      flashAttention: flashUsed,
      times,
      medianMs: med,
      docsPerSec: (docs.length / med) * 1000,
      vramPerContext: vramPerCtx,
      totalVram: vramUsed,
      peakRss,
    };
  } finally {
    // Always release the contexts so a failed run does not leak them into the
    // next configuration's VRAM measurement.
    for (const ctx of contexts) await ctx.dispose();
  }
}
|
|
|
+
|
|
|
+// ============================================================================
|
|
|
+// Main
|
|
|
+// ============================================================================
|
|
|
+
|
|
|
/**
 * Runs a single benchmark configuration and writes its one-line outcome to
 * stdout (timing on success, the failure message otherwise).
 *
 * @returns The benchmark result, or `null` when the configuration failed.
 */
async function runAndReport(
  model: LlamaModel,
  llama: Llama,
  docs: string[],
  parallelism: number,
  flash: boolean,
): Promise<BenchResult | null> {
  try {
    const r = await benchmarkConfig(model, llama, docs, parallelism, flash);
    process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
    return r;
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    process.stdout.write(` failed: ${message}\n`);
    return null;
  }
}

/**
 * Prints the results table and the best configuration (highest docs/s).
 * Speedups are relative to the first result in the list.
 */
function printSummary(results: BenchResult[]): void {
  console.log("\n═══════════════════════════════════════════════════════════════");
  console.log(" Results");
  console.log("═══════════════════════════════════════════════════════════════\n");

  const header = " Ctx Flash Median Docs/s VRAM/ctx Total VRAM Peak RSS";
  const sep = " ─── ───── ────── ────── ──────── ────────── ────────";
  console.log(header);
  console.log(sep);

  const baseline = results[0]?.medianMs ?? 1;
  for (const r of results) {
    const speedup = baseline / r.medianMs;
    // The baseline row itself carries no speedup label.
    const speedupStr = r === results[0] ? " " : `(${speedup.toFixed(1)}×)`;
    console.log(
      ` ${String(r.parallelism).padStart(3)} ` +
      `${r.flashAttention ? " yes " : " no "} ` +
      `${r.medianMs.toFixed(0).padStart(5)}ms ` +
      `${r.docsPerSec.toFixed(1).padStart(6)} ` +
      `${formatBytes(r.vramPerContext).padStart(8)} ` +
      `${formatBytes(r.totalVram).padStart(10)} ` +
      `${formatBytes(r.peakRss).padStart(8)} ` +
      speedupStr
    );
  }

  if (results.length > 0) {
    const best = results.reduce((a, b) => a.docsPerSec > b.docsPerSec ? a : b);
    console.log(`\n Best: ${best.parallelism} contexts, flash=${best.flashAttention}`);
    console.log(` ${best.medianMs.toFixed(0)}ms for ${DOC_COUNT} docs (${best.docsPerSec.toFixed(1)} docs/s)`);
    if (best.totalVram > 0) console.log(` ${formatBytes(best.totalVram)} VRAM`);
  }
}

/**
 * Benchmark entry point: initialises llama (preferring cuda, metal, then
 * vulkan, and falling back to CPU), prints system and model information, runs
 * every entry of `PARALLEL_CONFIGS`, and prints the summary.
 *
 * The model and the llama instance are disposed even when a step throws.
 */
async function main(): Promise<void> {
  console.log("═══════════════════════════════════════════════════════════════");
  console.log(" QMD Reranker Benchmark");
  console.log("═══════════════════════════════════════════════════════════════\n");

  // Detect GPU
  const gpuTypes = await getLlamaGpuTypes();
  const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));

  let llama: Llama;
  let gpuLabel: string;
  if (preferred) {
    try {
      llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
      gpuLabel = preferred;
    } catch {
      // GPU backend was listed but could not be initialised: run on CPU and say so.
      llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
      gpuLabel = "cpu (gpu init failed)";
    }
  } else {
    llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
    gpuLabel = "cpu";
  }

  try {
    // System info
    const cpuInfo = cpus();
    const cpuModel = cpuInfo[0]?.model || "unknown";
    const cpuCount = cpuInfo.length;

    console.log("System");
    console.log(` CPU: ${cpuModel}`);
    console.log(` Cores: ${cpuCount} (${llama.cpuMathCores} math)`);
    console.log(` Device: ${gpuLabel}`);

    if (llama.gpu) {
      // Collapse identical device names into "N× name".
      const gpuNames = await llama.getGpuDeviceNames();
      const counts = new Map<string, number>();
      for (const name of gpuNames) counts.set(name, (counts.get(name) ?? 0) + 1);
      const devStr = Array.from(counts.entries())
        .map(([name, n]) => n > 1 ? `${n}× ${name}` : name).join(", ");
      console.log(` GPU: ${devStr}`);
      const vram = await llama.getVramState();
      console.log(` VRAM: ${formatBytes(vram.total)} total, ${formatBytes(vram.free)} free`);
    }

    console.log(` RAM: ${formatBytes(getMemUsage().rss)} RSS at start`);

    // Load model, measuring the VRAM it takes when a GPU is in use
    console.log(`\nModel`);
    console.log(` URI: ${RERANK_MODEL}`);
    const modelPath = await resolveModelFile(RERANK_MODEL, MODEL_CACHE);
    const vramPreModel = llama.gpu ? await llama.getVramState() : null;
    const model = await llama.loadModel({ modelPath });

    try {
      const vramPostModel = llama.gpu ? await llama.getVramState() : null;
      const modelVram = vramPreModel && vramPostModel ? vramPostModel.used - vramPreModel.used : 0;
      console.log(` Params: ${model.trainContextSize} train ctx`);
      if (modelVram > 0) console.log(` VRAM: ${formatBytes(modelVram)} (model weights)`);

      // Generate test docs
      const docs = generateDocs(DOC_COUNT);
      console.log(`\nBenchmark`);
      console.log(` Documents: ${DOC_COUNT}`);
      console.log(` Ctx size: ${CONTEXT_SIZE}`);
      console.log(` Iterations:${ITERATIONS}`);
      console.log(` Query: "${QUERY.slice(0, 50)}..."`);

      // Run benchmarks
      const results: BenchResult[] = [];

      for (const p of PARALLEL_CONFIGS) {
        if (!llama.gpu && p > 1) {
          console.log(`\n [${p} ctx] skipped (CPU — no benefit from parallelism)`);
          continue;
        }

        // Try with flash attention first; retry without it only if that run fails.
        process.stdout.write(`\n [${p} ctx, flash] running...`);
        let result = await runAndReport(model, llama, docs, p, true);
        if (!result) {
          process.stdout.write(` [${p} ctx, no flash] running...`);
          result = await runAndReport(model, llama, docs, p, false);
        }
        if (result) results.push(result);
      }

      printSummary(results);
      console.log("");
    } finally {
      await model.dispose();
    }
  } finally {
    await llama.dispose();
  }
}
|
|
|
+
|
|
|
// Run the benchmark; on failure log the error and exit with a non-zero status
// so callers (CI, shell scripts) can detect that the benchmark did not complete.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|