# GRPO Training Config for QMD Query Expansion v4
# Uses SFT model as starting point with RL fine-tuning

# Model identifiers used by this run.
model:
  # Starting point (the SFT model that RL fine-tuning begins from).
  sft: 'tobil/qmd-query-expansion-0.6B-v4'
  # For tokenizer.
  base: 'Qwen/Qwen3-0.6B'
  # NOTE(review): presumably the destination for the GRPO-tuned model — confirm.
  output: 'tobil/qmd-query-expansion-0.6B-v4-grpo'

# Training data source.
dataset:
  name: 'tobil/qmd-query-expansion-train-v2'
  # NOTE(review): presumably the dataset column holding the prompts — confirm.
  prompt_field: 'messages'
  # NOTE(review): presumably an upper bound on examples used — confirm.
  max_samples: 2000

# Training hyperparameters for the RL fine-tuning run.
training:
  epochs: 1
  batch_size: 2
  gradient_accumulation_steps: 8
  # Very low for RL stability.
  # Written with an explicit mantissa dot (1.0e-6, not 1e-6): YAML 1.1 loaders
  # such as PyYAML only resolve exponent notation to a float when the mantissa
  # contains a ".", and would otherwise load this value as the string "1e-6".
  learning_rate: 1.0e-6
  max_grad_norm: 0.5

# GRPO-specific generation settings.
# NOTE(review): some GRPO trainers require the effective batch size to be
# divisible by num_generations — confirm against training.batch_size.
grpo:
  # NOTE(review): presumably completions sampled per prompt — confirm.
  num_generations: 4
  # NOTE(review): presumably a cap on generated length, likely in tokens — confirm.
  max_completion_length: 200

# LoRA adapter settings.
lora:
  # Smaller for RL stability.
  rank: 4
  alpha: 8
  dropout: 0.05
  # Modules the adapter is attached to.
  target_modules:
    - 'q_proj'
    - 'v_proj'

# Experiment-tracking labels for this run.
tracking:
  project: 'qmd-query-expansion'
  run_name: 'grpo-v4-key-term-preservation'

# Note: the GRPO v4 run failed because the reward function did not enforce the
# output format strictly enough; the model drifted to verbose explanations
# instead of the lex:/vec:/hyde: format.
# Recommendation: add a hard penalty when the output does not start with a
# valid prefix.