#!/usr/bin/env python3
"""Benchmark QMD query expansion: LFM2.5 vs Qwen3 finetuned models."""

import gc
import json
import time
from pathlib import Path

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

QUERIES = [
    "kubernetes pod networking",
    "best practices for React server components",
    "how to optimize PostgreSQL queries for large tables",
    "what is retrieval augmented generation",
    "python async await concurrency patterns",
    "nginx reverse proxy load balancing",
    "git rebase vs merge workflow",
    "rust ownership and borrowing explained",
    "docker compose multi-stage builds",
    "elasticsearch full text search performance",
    "shopify liquid template customization",
    "machine learning feature engineering techniques",
    "aws lambda cold start optimization",
    "typescript generics and utility types",
    "redis caching strategies for web apps",
]

# Generation budget shared by the fallback GenerationConfig and generate().
MAX_NEW_TOKENS = 300

# Where main() writes the full per-query results.
RESULTS_PATH = Path("outputs/benchmark_results.json")

# Boilerplate phrases that cost a hyde line one point each in score_output().
GENERIC_HYDE_PHRASES = (
    "comprehensive guide",
    "everything you need to know",
    "beginners and advanced users",
)


def load_model(base_name, adapter_dir, device, trust_remote=False):
    """Load a base model, merge its LoRA adapter, and pick a generation config.

    Args:
        base_name: Model id or path handed to ``from_pretrained`` for both the
            tokenizer and the base model.
        adapter_dir: Local directory holding the PEFT adapter (loaded with
            ``local_files_only=True``).
        device: Value passed as ``device_map`` when loading the base model.
        trust_remote: Forwarded as ``trust_remote_code``.

    Returns:
        ``(model, tokenizer, gen_config)``. The model has the adapter merged in
        and is in eval mode. ``gen_config`` comes from
        ``adapter_dir/generation_config.json`` when that file exists, otherwise
        from the low-temperature sampling defaults below.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_name, trust_remote_code=trust_remote)
    base = AutoModelForCausalLM.from_pretrained(
        base_name,
        dtype=torch.bfloat16,
        device_map=device,
        trust_remote_code=trust_remote,
    )
    model = PeftModel.from_pretrained(base, adapter_dir, local_files_only=True)
    model = model.merge_and_unload()
    model.eval()

    gen_config_path = Path(adapter_dir) / "generation_config.json"
    if gen_config_path.exists():
        gen_config = GenerationConfig.from_pretrained(adapter_dir)
    else:
        gen_config = GenerationConfig(
            temperature=0.1,
            top_k=50,
            top_p=0.1,
            repetition_penalty=1.05,
            do_sample=True,
            max_new_tokens=MAX_NEW_TOKENS,
        )
    return model, tokenizer, gen_config


def _synchronize(device):
    """Wait for queued CUDA work on *device*; do nothing for non-CUDA devices."""
    target = torch.device(device)
    if target.type == "cuda" and torch.cuda.is_available():
        torch.cuda.synchronize(target)


def run_inference(model, tokenizer, gen_config, query, device,
                  max_new_tokens=MAX_NEW_TOKENS):
    """Generate an expansion for one query and time the generate() call.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer with a chat template.
        gen_config: Generation config passed to ``generate``.
        query: Raw search query inserted into the user prompt.
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Cap on generated tokens; passed explicitly to
            ``generate`` alongside ``gen_config``.

    Returns:
        ``(text, elapsed_seconds, new_token_count)`` where ``text`` is the
        decoded continuation only (prompt tokens sliced off) and
        ``new_token_count`` is output length minus prompt length.
    """
    messages = [{"role": "user", "content": f"Expand this search query: {query}"}]
    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )
    # The rendered template already carries its special tokens; letting the
    # tokenizer add them again can duplicate BOS on tokenizers that auto-add it.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    # Synchronize on both sides so the measured window covers exactly the
    # generation work and not the pending input transfer or trailing kernels.
    _synchronize(device)
    start = time.perf_counter()
    with torch.no_grad():
        out = model.generate(
            **inputs, generation_config=gen_config, max_new_tokens=max_new_tokens
        )
    _synchronize(device)
    elapsed = time.perf_counter() - start

    new_tokens = out.shape[-1] - prompt_len
    result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    return result, elapsed, new_tokens


def score_output(output):
    """Simple quality scoring: check for lex/vec/hyde presence and specificity.

    Each line starting with ``lex:`` or ``vec:`` adds 1; each ``hyde:`` line
    adds 2. The text of the last ``hyde:`` line earns +2 when it is 80-200
    characters long, or +1 when 50-250, and loses 1 for every generic phrase
    it contains (case-insensitive).

    Returns:
        ``(score, details)`` where ``details`` has the keys ``has_lex``,
        ``has_vec``, ``has_hyde`` and ``hyde_len``.
    """
    score = 0
    has_lex = has_vec = has_hyde = False
    hyde_text = ""

    for line in output.strip().split("\n"):
        stripped = line.strip()
        if stripped.startswith("lex:"):
            has_lex = True
            score += 1
        elif stripped.startswith("vec:"):
            has_vec = True
            score += 1
        elif stripped.startswith("hyde:"):
            has_hyde = True
            hyde_text = stripped[5:].strip()
            score += 2  # hyde is worth more

    if hyde_text:
        # Bonus for hyde length in sweet spot (80-200 chars)
        hlen = len(hyde_text)
        if 80 <= hlen <= 200:
            score += 2
        elif 50 <= hlen <= 250:
            score += 1

        # Penalty for generic/template hyde
        lowered = hyde_text.lower()
        score -= sum(1 for phrase in GENERIC_HYDE_PHRASES if phrase in lowered)

    details = {
        "has_lex": has_lex,
        "has_vec": has_vec,
        "has_hyde": has_hyde,
        "hyde_len": len(hyde_text),
    }
    return score, details


def _benchmark_model(name, cfg, device):
    """Load one model, run every query in QUERIES, and return its result dict."""
    print(f"\n{'='*60}")
    print(f"Loading {name}...")
    model, tokenizer, gen_config = load_model(
        cfg["base"], cfg["adapter"], device, cfg["trust_remote"]
    )

    try:
        model_results = []
        total_time = 0
        total_tokens = 0
        total_score = 0

        for query in QUERIES:
            output, elapsed, n_tokens = run_inference(
                model, tokenizer, gen_config, query, device
            )
            score, details = score_output(output)
            model_results.append({
                "query": query,
                "output": output,
                "time_s": round(elapsed, 3),
                "tokens": n_tokens,
                "score": score,
                "details": details,
            })
            total_time += elapsed
            total_tokens += n_tokens
            total_score += score

            tok_s = n_tokens / elapsed if elapsed > 0 else 0
            print(f" [{score:2d}] {query[:40]:<40} {elapsed:.2f}s {n_tokens:3d}tok {tok_s:.0f}tok/s")

        avg_time = total_time / len(QUERIES)
        avg_score = total_score / len(QUERIES)
        # Throughput is total tokens over total time, not a mean of per-query rates.
        avg_toks = total_tokens / total_time if total_time > 0 else 0

        print(f"\n Summary: avg_score={avg_score:.2f} avg_time={avg_time:.2f}s avg_tok/s={avg_toks:.0f}")

        return {
            "queries": model_results,
            "avg_time_s": round(avg_time, 3),
            "avg_score": round(avg_score, 2),
            "avg_tok_s": round(avg_toks, 1),
            "total_score": total_score,
        }
    finally:
        # Free GPU memory even if a query failed, so the next model can load.
        del model
        gc.collect()
        torch.cuda.empty_cache()


def main():
    """Benchmark both finetuned models, print a comparison, and save the results."""
    device = "cuda:0"

    models = {
        "LFM2.5-1.2B (finetuned)": {
            "base": "LiquidAI/LFM2.5-1.2B-Instruct",
            "adapter": "outputs/sft-lfm2",
            "trust_remote": True,
        },
        "Qwen3-1.7B (finetuned)": {
            "base": "Qwen/Qwen3-1.7B",
            "adapter": "outputs/sft",
            "trust_remote": False,
        },
    }

    results = {name: _benchmark_model(name, cfg, device) for name, cfg in models.items()}

    # Print comparison
    print(f"\n{'='*60}")
    print("COMPARISON")
    print(f"{'='*60}")
    for name, r in results.items():
        print(f"\n{name}:")
        print(f" Total Score: {r['total_score']} / {len(QUERIES) * 8}")  # max ~8 per query
        print(f" Avg Score: {r['avg_score']}")
        print(f" Avg Time: {r['avg_time_s']}s")
        print(f" Throughput: {r['avg_tok_s']} tok/s")

    # Save full results
    RESULTS_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(RESULTS_PATH, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nFull results saved to {RESULTS_PATH.as_posix()}")


if __name__ == "__main__":
    main()