# sft_local.yaml

  1. # SFT Training Config - Local Data, Multi-GPU
  2. # Usage: accelerate launch --config_file configs/accelerate_multi_gpu.yaml train.py sft --config configs/sft_local.yaml
  3. model:
  4. base: "Qwen/Qwen3-1.7B"
  5. output: "outputs/sft" # Local output
  6. push_to_hub: false
  7. dataset:
  8. name: "data/train" # Local path
  9. text_field: "text"
  10. split: "train"
  11. eval_split: 0.1
  12. training:
  13. epochs: 5
  14. batch_size: 2 # Per GPU, effective batch = 2 * 4 GPUs * 4 accum = 32
  15. gradient_accumulation_steps: 4
  16. learning_rate: 0.0002 # 2e-4 as float
  17. max_length: 512
  18. warmup_ratio: 0.03
  19. lr_scheduler: "cosine"
  20. ddp_find_unused_parameters: false
  21. # Save checkpoints every 30 minutes
  22. save_interval_minutes: 30
  23. # Fallback time-step save cadence if needed (not used for wall-clock mode)
  24. save_steps: 200
  25. lora:
  26. rank: 16
  27. alpha: 32
  28. dropout: 0.05
  29. target_modules:
  30. - "q_proj"
  31. - "k_proj"
  32. - "v_proj"
  33. - "o_proj"
  34. - "gate_proj"
  35. - "up_proj"
  36. - "down_proj"
  37. tracking:
  38. project: "qmd-query-expansion"
  39. run_name: "{day} {time}"