---
# File: grpo_v4.yaml
# GRPO training config for QMD query expansion v4.
# Uses the SFT model as the starting point and applies RL fine-tuning on top.
#
# NOTE(review): section nesting was reconstructed from a flattened copy of this
# file; all sections are assumed to be top-level siblings -- confirm against the
# loader that consumes this config.

model:
  sft: "tobil/qmd-query-expansion-0.6B-v4"  # Starting point
  base: "Qwen/Qwen3-0.6B"  # For tokenizer
  output: "tobil/qmd-query-expansion-0.6B-v4-grpo"

dataset:
  name: "tobil/qmd-query-expansion-train-v2"
  prompt_field: "messages"
  max_samples: 2000

training:
  epochs: 1
  batch_size: 2
  gradient_accumulation_steps: 8
  # Very low for RL stability. Written with an explicit decimal point because
  # YAML 1.1 loaders (e.g. PyYAML) resolve a bare `1e-6` to a string, not a
  # float.
  learning_rate: 1.0e-6
  max_grad_norm: 0.5

grpo:
  num_generations: 4
  max_completion_length: 200

lora:
  rank: 4  # Smaller for RL stability
  alpha: 8
  dropout: 0.05
  target_modules:
    - "q_proj"
    - "v_proj"

tracking:
  project: "qmd-query-expansion"
  run_name: "grpo-v4-key-term-preservation"

# Note: the GRPO v4 run failed because the reward function did not enforce the
# output format strictly enough. The model drifted to verbose explanations
# instead of the lex:/vec:/hyde: format.
# Recommendation: add a hard penalty when the output does not start with a
# valid prefix.