---
# GRPO training config for QMD query expansion, v5.
# Starts from the SFT model and applies RL fine-tuning on top of it.
#
# Key learnings from the v4 failure (0% score, catastrophic drift):
#   1. beta=0.0 (the default) allows unlimited drift from the SFT checkpoint.
#   2. Format-based rewards need KL regularization, unlike math reasoning.
#   3. The reward function must give 0 for wrong format (now implemented).

# Model identifiers: where training starts, which tokenizer to load, and
# where the result is written.
model:
  sft: "tobil/qmd-query-expansion-0.6B-v4"  # Starting point
  base: "Qwen/Qwen3-0.6B"  # For tokenizer
  # NOTE(review): the output name is tagged v4 while this config and the
  # tracking run_name are v5 - confirm this will not overwrite the artifact
  # from the failed v4 GRPO run.
  output: "tobil/qmd-query-expansion-0.6B-v4-grpo"

# Training data source and the field that holds the prompts.
dataset:
  name: "tobil/qmd-query-expansion-train-v2"
  prompt_field: "messages"
  max_samples: 1000  # Reduced to prevent overfitting

# Optimizer / schedule settings.
training:
  # NOTE(review): both epochs and max_steps are set. If the trainer follows
  # Hugging Face TrainingArguments semantics, max_steps takes precedence -
  # confirm which limit is intended to be the binding one.
  epochs: 1
  batch_size: 2
  gradient_accumulation_steps: 8
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `5e-7` as the string "5e-7" rather than a float.
  learning_rate: 5.0e-7  # Even lower for stability
  max_grad_norm: 0.5
  max_steps: 200  # Limit total steps to prevent drift

# GRPO-specific settings.
grpo:
  # NOTE(review): if this feeds TRL's GRPOTrainer, the effective generation
  # batch must be divisible by num_generations - confirm that batch_size: 2
  # with num_generations: 4 is accepted by the TRL version in use.
  num_generations: 4
  max_completion_length: 200
  beta: 0.04  # KL regularization - prevents drift from SFT checkpoint
  temperature: 0.7  # Slightly lower for more focused generations

# LoRA adapter settings.
lora:
  rank: 4  # Smaller for RL stability
  alpha: 8
  dropout: 0.05
  target_modules:
    - "q_proj"
    - "v_proj"

# Experiment tracking labels.
tracking:
  project: "qmd-query-expansion"
  run_name: "grpo-v5-kl-regularized"