# GRPO Training Config for QMD Query Expansion v5
# Uses SFT model as starting point with RL fine-tuning
#
# Key learnings from v4 failure (0% score, catastrophic drift):
# 1. beta=0.0 (default) allows unlimited drift from SFT checkpoint
# 2. Format-based rewards need KL regularization unlike math reasoning
# 3. Reward function must give 0 for wrong format (now implemented)

model:
  # Base model; used for the tokenizer.
  base: "Qwen/Qwen3-0.6B"
  # SFT checkpoint that RL fine-tuning starts from.
  sft: "tobil/qmd-query-expansion-0.6B-v4"
  # Name the trained model is saved under.
  # NOTE(review): tagged "v4-grpo" although this config is labelled v5 —
  # confirm the name is intended.
  output: "tobil/qmd-query-expansion-0.6B-v4-grpo"

dataset:
  name: "tobil/qmd-query-expansion-train-v2"
  # Dataset field that holds the prompt.
  prompt_field: "messages"
  # Sample cap, reduced to prevent overfitting.
  max_samples: 1000

training:
  # NOTE(review): if the consumer is a Hugging Face-style trainer, a positive
  # max_steps takes precedence over epochs — confirm which one governs here.
  epochs: 1
  batch_size: 2
  gradient_accumulation_steps: 8
  # Kept very low for stability. Written with an explicit decimal point:
  # YAML 1.1 loaders (e.g. PyYAML) resolve a bare `5e-7` as the string
  # "5e-7" rather than a float, while `5.0e-7` is a float under 1.1 and 1.2.
  learning_rate: 5.0e-7
  max_grad_norm: 0.5
  # Hard cap on total steps to prevent drift.
  max_steps: 200

grpo:
  # NOTE(review): some GRPO implementations require the effective batch size
  # to be divisible by num_generations — verify against training.batch_size.
  num_generations: 4
  max_completion_length: 200
  # KL regularization; prevents drift from the SFT checkpoint.
  beta: 0.04
  # Slightly lowered for more focused generations.
  temperature: 0.7

lora:
  # Small rank for RL stability; alpha is 2x the rank.
  rank: 4
  alpha: 8
  dropout: 0.05
  # Modules that receive LoRA adapters.
  target_modules: ["q_proj", "v_proj"]

tracking:
  # Run and project identifiers for experiment tracking
  # (tracking backend is not specified in this file).
  run_name: "grpo-v5-kl-regularized"
  project: "qmd-query-expansion"