benchmark.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. #!/usr/bin/env python3
  2. """Benchmark QMD query expansion: LFM2.5 vs Qwen3 finetuned models."""
  3. import json
  4. import time
  5. import torch
  6. from pathlib import Path
  7. from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
  8. from peft import PeftModel
# Fixed list of search queries used as benchmark inputs; main() runs every
# configured model over each of them.
QUERIES = [
    "kubernetes pod networking",
    "best practices for React server components",
    "how to optimize PostgreSQL queries for large tables",
    "what is retrieval augmented generation",
    "python async await concurrency patterns",
    "nginx reverse proxy load balancing",
    "git rebase vs merge workflow",
    "rust ownership and borrowing explained",
    "docker compose multi-stage builds",
    "elasticsearch full text search performance",
    "shopify liquid template customization",
    "machine learning feature engineering techniques",
    "aws lambda cold start optimization",
    "typescript generics and utility types",
    "redis caching strategies for web apps",
]
  26. def load_model(base_name, adapter_dir, device, trust_remote=False):
  27. tokenizer = AutoTokenizer.from_pretrained(base_name, trust_remote_code=trust_remote)
  28. base = AutoModelForCausalLM.from_pretrained(
  29. base_name, dtype=torch.bfloat16, device_map=device, trust_remote_code=trust_remote
  30. )
  31. model = PeftModel.from_pretrained(base, adapter_dir, local_files_only=True)
  32. model = model.merge_and_unload()
  33. model.eval()
  34. gen_config_path = Path(adapter_dir) / "generation_config.json"
  35. if gen_config_path.exists():
  36. gen_config = GenerationConfig.from_pretrained(adapter_dir)
  37. else:
  38. gen_config = GenerationConfig(
  39. temperature=0.1, top_k=50, top_p=0.1,
  40. repetition_penalty=1.05, do_sample=True, max_new_tokens=300,
  41. )
  42. return model, tokenizer, gen_config
  43. def run_inference(model, tokenizer, gen_config, query, device):
  44. messages = [{"role": "user", "content": f"Expand this search query: {query}"}]
  45. text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
  46. inputs = tokenizer(text, return_tensors="pt").to(device)
  47. start = time.perf_counter()
  48. with torch.no_grad():
  49. out = model.generate(**inputs, generation_config=gen_config, max_new_tokens=300)
  50. elapsed = time.perf_counter() - start
  51. new_tokens = out.shape[-1] - inputs["input_ids"].shape[-1]
  52. result = tokenizer.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
  53. return result, elapsed, new_tokens
  54. def score_output(output):
  55. """Simple quality scoring: check for lex/vec/hyde presence and specificity."""
  56. score = 0
  57. lines = output.strip().split("\n")
  58. has_lex = has_vec = has_hyde = False
  59. hyde_text = ""
  60. for line in lines:
  61. l = line.strip()
  62. if l.startswith("lex:"):
  63. has_lex = True
  64. score += 1
  65. elif l.startswith("vec:"):
  66. has_vec = True
  67. score += 1
  68. elif l.startswith("hyde:"):
  69. has_hyde = True
  70. hyde_text = l[5:].strip()
  71. score += 2 # hyde is worth more
  72. # Bonus for hyde length in sweet spot (80-200 chars)
  73. if hyde_text:
  74. hlen = len(hyde_text)
  75. if 80 <= hlen <= 200:
  76. score += 2
  77. elif 50 <= hlen <= 250:
  78. score += 1
  79. # Penalty for generic/template hyde
  80. generic_phrases = ["comprehensive guide", "everything you need to know", "beginners and advanced users"]
  81. for phrase in generic_phrases:
  82. if phrase in hyde_text.lower():
  83. score -= 1
  84. return score, {"has_lex": has_lex, "has_vec": has_vec, "has_hyde": has_hyde, "hyde_len": len(hyde_text)}
  85. def main():
  86. device = "cuda:0"
  87. models = {
  88. "LFM2.5-1.2B (finetuned)": {
  89. "base": "LiquidAI/LFM2.5-1.2B-Instruct",
  90. "adapter": "outputs/sft-lfm2",
  91. "trust_remote": True,
  92. },
  93. "Qwen3-1.7B (finetuned)": {
  94. "base": "Qwen/Qwen3-1.7B",
  95. "adapter": "outputs/sft",
  96. "trust_remote": False,
  97. },
  98. }
  99. results = {}
  100. for name, cfg in models.items():
  101. print(f"\n{'='*60}")
  102. print(f"Loading {name}...")
  103. model, tokenizer, gen_config = load_model(
  104. cfg["base"], cfg["adapter"], device, cfg["trust_remote"]
  105. )
  106. model_results = []
  107. total_time = 0
  108. total_tokens = 0
  109. total_score = 0
  110. for query in QUERIES:
  111. output, elapsed, n_tokens = run_inference(model, tokenizer, gen_config, query, device)
  112. score, details = score_output(output)
  113. model_results.append({
  114. "query": query,
  115. "output": output,
  116. "time_s": round(elapsed, 3),
  117. "tokens": n_tokens,
  118. "score": score,
  119. "details": details,
  120. })
  121. total_time += elapsed
  122. total_tokens += n_tokens
  123. total_score += score
  124. tok_s = n_tokens / elapsed if elapsed > 0 else 0
  125. print(f" [{score:2d}] {query[:40]:<40} {elapsed:.2f}s {n_tokens:3d}tok {tok_s:.0f}tok/s")
  126. avg_time = total_time / len(QUERIES)
  127. avg_score = total_score / len(QUERIES)
  128. avg_toks = total_tokens / total_time if total_time > 0 else 0
  129. results[name] = {
  130. "queries": model_results,
  131. "avg_time_s": round(avg_time, 3),
  132. "avg_score": round(avg_score, 2),
  133. "avg_tok_s": round(avg_toks, 1),
  134. "total_score": total_score,
  135. }
  136. print(f"\n Summary: avg_score={avg_score:.2f} avg_time={avg_time:.2f}s avg_tok/s={avg_toks:.0f}")
  137. # Free GPU memory
  138. del model
  139. torch.cuda.empty_cache()
  140. # Print comparison
  141. print(f"\n{'='*60}")
  142. print("COMPARISON")
  143. print(f"{'='*60}")
  144. for name, r in results.items():
  145. print(f"\n{name}:")
  146. print(f" Total Score: {r['total_score']} / {len(QUERIES) * 8}") # max ~8 per query
  147. print(f" Avg Score: {r['avg_score']}")
  148. print(f" Avg Time: {r['avg_time_s']}s")
  149. print(f" Throughput: {r['avg_tok_s']} tok/s")
  150. # Save full results
  151. with open("outputs/benchmark_results.json", "w") as f:
  152. json.dump(results, f, indent=2)
  153. print("\nFull results saved to outputs/benchmark_results.json")
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()