---
# GRPO Training Config for QMD Query Expansion v5
# Uses SFT model as starting point with RL fine-tuning
#
# Key learnings from v4 failure (0% score, catastrophic drift):
# 1. beta=0.0 (default) allows unlimited drift from SFT checkpoint
# 2. Format-based rewards need KL regularization unlike math reasoning
# 3. Reward function must give 0 for wrong format (now implemented)

# Model identifiers: the checkpoint RL starts from, the base model used for
# the tokenizer, and the name the result is stored under.
model:
  sft: "tobil/qmd-query-expansion-0.6B-v4"  # Starting point
  base: "Qwen/Qwen3-0.6B"  # For tokenizer
  # NOTE(review): name says "v4-grpo" while this config and run_name say v5;
  # confirm this is the intended output target.
  output: "tobil/qmd-query-expansion-0.6B-v4-grpo"

# Training data source and which field holds the prompt.
dataset:
  name: "tobil/qmd-query-expansion-train-v2"
  prompt_field: "messages"
  max_samples: 1000  # Reduced to prevent overfitting

# Optimizer / schedule settings, kept conservative after the v4 drift.
training:
  epochs: 1
  batch_size: 2
  gradient_accumulation_steps: 8
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare "5e-7" as a string, not a float.
  learning_rate: 5.0e-7  # Even lower for stability
  max_grad_norm: 0.5
  max_steps: 200  # Limit total steps to prevent drift

# GRPO-specific sampling and regularization settings.
grpo:
  num_generations: 4
  max_completion_length: 200
  beta: 0.04  # KL regularization - prevents drift from SFT checkpoint
  temperature: 0.7  # Slightly lower for more focused generations

# LoRA adapter settings.
lora:
  rank: 4  # Smaller for RL stability
  alpha: 8
  dropout: 0.05
  target_modules:
    - "q_proj"
    - "v_proj"

# Experiment tracking labels.
tracking:
  project: "qmd-query-expansion"
  run_name: "grpo-v5-kl-regularized"