# /// script # requires-python = ">=3.10" # dependencies = [ # "transformers>=4.45.0", # "peft>=0.7.0", # "torch", # "accelerate", # ] # /// """ Minimal QMD query expansion evaluator. Usage: uv run eval.py ./outputs/sft uv run eval.py ./outputs/sft --queries evals/queries.txt By default, query file defaults to evals/queries.txt and runs all queries unless --max-queries is set. """ import argparse import json import re import sys from pathlib import Path # Import reward scoring sys.path.insert(0, str(Path(__file__).parent)) from reward import score_expansion_detailed DEFAULT_QUERY_FILE = Path(__file__).parent / "evals" / "queries.txt" def load_model(model_path: str): """Load model (adapter or merged).""" import torch from peft import PeftModel from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer model_path = Path(model_path) adapter_config = model_path / "adapter_config.json" # Get base model from adapter config or default base_model = "Qwen/Qwen3-1.7B" if adapter_config.exists(): with open(adapter_config) as f: cfg = json.load(f) base_model = cfg.get("base_model_name_or_path", base_model) print(f"Loading base: {base_model}", file=sys.stderr) tokenizer = AutoTokenizer.from_pretrained(base_model) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "left" config = AutoConfig.from_pretrained(base_model) config.tie_word_embeddings = False model = AutoModelForCausalLM.from_pretrained( base_model, dtype=torch.bfloat16, device_map={"": 0}, config=config ) if model.generation_config is not None: model.generation_config.do_sample = False model.generation_config.temperature = None model.generation_config.top_p = None model.generation_config.top_k = None # Load adapter if present if adapter_config.exists(): print(f"Loading adapter: {model_path}", file=sys.stderr) model = PeftModel.from_pretrained(model, str(model_path)) model.eval() return model, tokenizer def generate_batch( model, tokenizer, queries: list[str], max_new_tokens: int, max_time: float | None ) -> list[str]: """Generate expansions for a batch of queries.""" import torch prompts = [ tokenizer.apply_chat_template( [{"role": "user", "content": f"/no_think Expand this search query: {q}"}], tokenize=False, add_generation_prompt=True, ) for q in queries ] inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) input_len = inputs["input_ids"].shape[1] generate_kwargs = { "max_new_tokens": max_new_tokens, "do_sample": False, "num_beams": 1, "pad_token_id": tokenizer.pad_token_id, "eos_token_id": tokenizer.eos_token_id, "use_cache": True, } if max_time and max_time > 0: generate_kwargs["max_time"] = max_time with torch.inference_mode(): out = model.generate(**inputs, **generate_kwargs) outputs = [] for i in range(len(queries)): gen_tokens = out[i][input_len:] text = tokenizer.decode(gen_tokens, skip_special_tokens=True) text = re.sub(r".*?", "", text, flags=re.DOTALL) outputs.append(text.strip()) return outputs def main(): parser = argparse.ArgumentParser(description="Evaluate QMD model") parser.add_argument("model", help="Model path (local or HF)") parser.add_argument( "--queries", default=str(DEFAULT_QUERY_FILE), help="Queries file (one per line) [default: evals/queries.txt]", ) parser.add_argument( "--max-new-tokens", type=int, default=400, help="Maximum new tokens to generate (default: 400)", ) parser.add_argument( "--max-time", type=float, default=0, help="Max seconds per batch generation (0 disables)", ) parser.add_argument( "--batch-size", type=int, default=2, help="Batch size for generation (default: 2)", ) parser.add_argument( "--max-queries", type=int, default=0, help="Limit number of queries (0 disables)", ) args = parser.parse_args() # Load queries (default to full evals/queries.txt) query_file = Path(args.queries) if not query_file.exists(): raise FileNotFoundError(f"Queries file not found: {query_file}") with query_file.open(encoding="utf-8") as f: queries = [ l.strip() for l in f if l.strip() and not l.strip().startswith("#") ] if args.max_queries and args.max_queries > 0: queries = queries[: args.max_queries] # Load model model, tokenizer = load_model(args.model) # Run eval scores = [] batch_size = max(1, args.batch_size) total = len(queries) for start in range(0, total, batch_size): batch = queries[start : start + batch_size] batch_outputs = generate_batch( model, tokenizer, batch, args.max_new_tokens, args.max_time ) for i, (query, expansion) in enumerate(zip(batch, batch_outputs), start + 1): print(f"\n[{i}/{total}] {query}") print("-" * 50) result = score_expansion_detailed(query, expansion) print(expansion[:300] + ("..." if len(expansion) > 300 else "")) print(f"Score: {result['percentage']:.0f}% ({result['rating']})") scores.append(result["percentage"]) # Summary avg = sum(scores) / len(scores) print(f"\n{'=' * 50}") print(f"Average: {avg:.1f}% | Model: {args.model}") print(f"{'=' * 50}") return 0 if avg >= 50 else 1 if __name__ == "__main__": sys.exit(main())