| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182 |
- #!/usr/bin/env python3
- """Benchmark QMD query expansion: LFM2.5 vs Qwen3 finetuned models."""
- import json
- import time
- import torch
- from pathlib import Path
- from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
- from peft import PeftModel
# Fixed evaluation set. Every model under test is asked to expand each of
# these search queries, in this order, so per-query results line up across
# models.
QUERIES = [
    "kubernetes pod networking",
    "best practices for React server components",
    "how to optimize PostgreSQL queries for large tables",
    "what is retrieval augmented generation",
    "python async await concurrency patterns",
    "nginx reverse proxy load balancing",
    "git rebase vs merge workflow",
    "rust ownership and borrowing explained",
    "docker compose multi-stage builds",
    "elasticsearch full text search performance",
    "shopify liquid template customization",
    "machine learning feature engineering techniques",
    "aws lambda cold start optimization",
    "typescript generics and utility types",
    "redis caching strategies for web apps",
]
def load_model(base_name, adapter_dir, device, trust_remote=False):
    """Load a base checkpoint, apply a local adapter, and pick generation settings.

    Args:
        base_name: Model identifier passed to both ``AutoTokenizer`` and
            ``AutoModelForCausalLM``.
        adapter_dir: Directory holding the adapter; it is read with
            ``local_files_only=True`` and is also where an optional
            ``generation_config.json`` is looked for.
        device: Forwarded as ``device_map`` when loading the base model.
        trust_remote: Forwarded as ``trust_remote_code`` to both loaders.

    Returns:
        A ``(model, tokenizer, gen_config)`` tuple. ``model`` is the object
        returned by ``merge_and_unload()``, switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        base_name, trust_remote_code=trust_remote
    )
    backbone = AutoModelForCausalLM.from_pretrained(
        base_name,
        dtype=torch.bfloat16,
        device_map=device,
        trust_remote_code=trust_remote,
    )

    # Wrap the base model with the adapter, then keep only the merged result
    # so inference does not go through the PEFT wrapper.
    wrapped = PeftModel.from_pretrained(backbone, adapter_dir, local_files_only=True)
    model = wrapped.merge_and_unload()
    model.eval()

    # Prefer generation settings shipped next to the adapter.
    if (Path(adapter_dir) / "generation_config.json").exists():
        return model, tokenizer, GenerationConfig.from_pretrained(adapter_dir)

    # Otherwise fall back to fixed low-temperature sampling defaults.
    fallback_config = GenerationConfig(
        temperature=0.1,
        top_k=50,
        top_p=0.1,
        repetition_penalty=1.05,
        do_sample=True,
        max_new_tokens=300,
    )
    return model, tokenizer, fallback_config
def run_inference(model, tokenizer, gen_config, query, device):
    """Generate an expansion for ``query`` and time the ``generate`` call.

    Args:
        model: Object exposing ``generate(**inputs, generation_config=...,
            max_new_tokens=...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``decode``.
        gen_config: Passed to ``generate`` as ``generation_config``. An
            explicit ``max_new_tokens=300`` is passed alongside it.
        query: Search query inserted into the user prompt.
        device: Device the tokenized inputs are moved to.

    Returns:
        ``(result, elapsed, new_tokens)``: the decoded continuation (prompt
        tokens sliced off, special tokens skipped), the wall-clock seconds
        spent in ``generate``, and the output length minus the prompt length.
    """
    messages = [{"role": "user", "content": f"Expand this search query: {query}"}]
    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    inputs = tokenizer(text, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    # CUDA work is queued asynchronously. Without a synchronize on both sides
    # of the timed region, pending work from before (e.g. the host-to-device
    # copy above) can be billed to generate(), and work still in flight when
    # generate() returns can be missed. Skipped for non-CUDA devices.
    on_cuda = str(device).startswith("cuda") and torch.cuda.is_available()

    if on_cuda:
        torch.cuda.synchronize(device)
    start = time.perf_counter()
    with torch.no_grad():
        out = model.generate(**inputs, generation_config=gen_config, max_new_tokens=300)
    if on_cuda:
        torch.cuda.synchronize(device)
    elapsed = time.perf_counter() - start

    # Everything past the prompt length is newly generated.
    new_tokens = out.shape[-1] - prompt_len
    result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    return result, elapsed, new_tokens
def score_output(output):
    """Heuristically score one expansion by its ``lex:``/``vec:``/``hyde:`` lines.

    Each line starting (after stripping) with ``lex:`` or ``vec:`` adds 1
    point and each ``hyde:`` line adds 2. The text of the last ``hyde:`` line
    earns a length bonus (+2 for 80-200 characters, otherwise +1 for 50-250)
    and loses 1 point per boilerplate phrase it contains, matched
    case-insensitively.

    Args:
        output: Raw model output text.

    Returns:
        ``(score, details)`` where ``details`` has the keys ``has_lex``,
        ``has_vec``, ``has_hyde`` and ``hyde_len``.
    """
    # Prefix -> points awarded for every line carrying that prefix.
    weights = (("lex:", 1), ("vec:", 1), ("hyde:", 2))
    seen = {"lex": False, "vec": False, "hyde": False}
    total = 0
    last_hyde = ""

    for raw in output.strip().split("\n"):
        entry = raw.strip()
        for prefix, points in weights:
            if not entry.startswith(prefix):
                continue
            seen[prefix[:-1]] = True
            total += points
            if prefix == "hyde:":
                # A later hyde line replaces the text of an earlier one.
                last_hyde = entry[len(prefix):].strip()
            break

    # Length bonus; an empty hyde text has size 0 and matches neither band.
    size = len(last_hyde)
    if 80 <= size <= 200:
        total += 2
    elif 50 <= size <= 250:
        total += 1

    # One point off for each generic/template phrase found in the hyde text.
    lowered = last_hyde.lower()
    boilerplate = ("comprehensive guide", "everything you need to know", "beginners and advanced users")
    total -= sum(1 for phrase in boilerplate if phrase in lowered)

    details = {
        "has_lex": seen["lex"],
        "has_vec": seen["vec"],
        "has_hyde": seen["hyde"],
        "hyde_len": size,
    }
    return total, details
def main():
    """Benchmark each configured model on QUERIES, print a comparison, save JSON.

    For every model: load it, run every query through ``run_inference``,
    score the output with ``score_output``, and print one line per query plus
    a summary. After all models have run, print a side-by-side comparison and
    write the full results to ``outputs/benchmark_results.json``.
    """
    device = "cuda:0"

    # Display name -> base checkpoint, local adapter directory, and whether
    # the loaders are allowed to run remote code.
    models = {
        "LFM2.5-1.2B (finetuned)": {
            "base": "LiquidAI/LFM2.5-1.2B-Instruct",
            "adapter": "outputs/sft-lfm2",
            "trust_remote": True,
        },
        "Qwen3-1.7B (finetuned)": {
            "base": "Qwen/Qwen3-1.7B",
            "adapter": "outputs/sft",
            "trust_remote": False,
        },
    }

    results = {}
    n_queries = len(QUERIES)

    for name, cfg in models.items():
        print(f"\n{'='*60}")
        print(f"Loading {name}...")
        model, tokenizer, gen_config = load_model(
            cfg["base"], cfg["adapter"], device, cfg["trust_remote"]
        )

        per_query = []
        total_time = total_tokens = total_score = 0

        for query in QUERIES:
            output, elapsed, n_tokens = run_inference(model, tokenizer, gen_config, query, device)
            score, details = score_output(output)

            total_time += elapsed
            total_tokens += n_tokens
            total_score += score
            per_query.append({
                "query": query,
                "output": output,
                "time_s": round(elapsed, 3),
                "tokens": n_tokens,
                "score": score,
                "details": details,
            })

            # Per-query throughput, guarded against a zero-length timing.
            tok_s = n_tokens / elapsed if elapsed > 0 else 0
            print(f" [{score:2d}] {query[:40]:<40} {elapsed:.2f}s {n_tokens:3d}tok {tok_s:.0f}tok/s")

        avg_time = total_time / n_queries
        avg_score = total_score / n_queries
        # Aggregate throughput is total tokens over total time, not a mean of
        # the per-query rates.
        avg_toks = total_tokens / total_time if total_time > 0 else 0

        results[name] = {
            "queries": per_query,
            "avg_time_s": round(avg_time, 3),
            "avg_score": round(avg_score, 2),
            "avg_tok_s": round(avg_toks, 1),
            "total_score": total_score,
        }

        print(f"\n Summary: avg_score={avg_score:.2f} avg_time={avg_time:.2f}s avg_tok/s={avg_toks:.0f}")

        # Drop this model and release cached GPU memory before the next load.
        del model
        torch.cuda.empty_cache()

    # Side-by-side comparison of all benchmarked models.
    print(f"\n{'='*60}")
    print("COMPARISON")
    print(f"{'='*60}")
    for name, r in results.items():
        print(f"\n{name}:")
        # The denominator uses a nominal ceiling of roughly 8 points per query.
        print(f" Total Score: {r['total_score']} / {len(QUERIES) * 8}")
        print(f" Avg Score: {r['avg_score']}")
        print(f" Avg Time: {r['avg_time_s']}s")
        print(f" Throughput: {r['avg_tok_s']} tok/s")

    # Persist the complete per-query results.
    with open("outputs/benchmark_results.json", "w") as f:
        json.dump(results, f, indent=2)
    print("\nFull results saved to outputs/benchmark_results.json")
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|