# GRPO Training Config for QMD Query Expansion v5
# Uses the SFT model as the starting point, with RL fine-tuning on top.
#
# Key learnings from the v4 failure (0% score, catastrophic drift):
# 1. beta=0.0 (default) allows unlimited drift from the SFT checkpoint
# 2. Format-based rewards need KL regularization, unlike math reasoning
# 3. Reward function must give 0 for wrong format (now implemented)

# Model identifiers: where training starts from and where the result goes.
model:
  sft: "tobil/qmd-query-expansion-0.6B-v4"  # Starting point
  base: "Qwen/Qwen3-0.6B"  # For tokenizer
  # NOTE(review): the output repo is tagged "v4-grpo" while this config and
  # the run name say v5 - confirm the naming is intended.
  output: "tobil/qmd-query-expansion-0.6B-v4-grpo"

# Training data source.
dataset:
  name: "tobil/qmd-query-expansion-train-v2"
  prompt_field: "messages"
  max_samples: 1000  # Reduced to prevent overfitting

# Optimizer / schedule settings, kept conservative to limit drift.
training:
  epochs: 1
  batch_size: 2
  gradient_accumulation_steps: 8
  # Even lower for stability.
  # Written with a decimal point on purpose: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `5e-7` as the string "5e-7" rather than a float.
  learning_rate: 5.0e-7
  max_grad_norm: 0.5
  max_steps: 200  # Limit total steps to prevent drift

# GRPO-specific sampling and regularization settings.
grpo:
  num_generations: 4
  max_completion_length: 200
  beta: 0.04  # KL regularization - prevents drift from SFT checkpoint
  temperature: 0.7  # Slightly lower for more focused generations

# LoRA adapter settings.
lora:
  rank: 4  # Smaller for RL stability
  alpha: 8
  dropout: 0.05
  target_modules:
    - "q_proj"
    - "v_proj"

# Experiment tracking labels.
tracking:
  project: "qmd-query-expansion"
  run_name: "grpo-v5-kl-regularized"