Here are all the configs used. Note that this was a while ago, so some parameter names may have changed since.
deepspeed.yaml
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  deepspeed_config_file: ds_config.json
  zero3_init_flag: true
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
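For context: this Accelerate config only describes the launch topology (8 local processes) and defers the actual DeepSpeed settings to the ds_config.json below. A config like this is normally handed to the launcher along the lines of `accelerate launch --config_file deepspeed.yaml <train_script>.py --config experiment_config.yaml` (the script name is a placeholder here, not the exact command used).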
ds_config.json
{
  "bf16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 1e9,
    "reduce_bucket_size": 1e6,
    "stage3_prefetch_bucket_size": 0.94e6,
    "stage3_param_persistence_threshold": 1e4,
    "stage3_max_live_parameters": 1e9,
    "stage3_max_reuse_distance": 1e9,
    "stage3_gather_fp16_weights_on_model_save": true
  },
  "train_batch_size": "auto",
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
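The "auto" entries are resolved at runtime from the Hugging Face training arguments, so the global batch size comes from the experiment config below rather than from this file. A quick sanity check of the resolved value (a sketch, assuming the 8 processes from deepspeed.yaml):

```python
# Sketch: how DeepSpeed's "auto" train_batch_size resolves from the HF training
# arguments (numbers taken from the configs in this post).
per_device_train_batch_size = 4   # experiment_config.yaml
gradient_accumulation_steps = 1   # experiment_config.yaml
num_processes = 8                 # deepspeed.yaml

train_batch_size = (
    per_device_train_batch_size * gradient_accumulation_steps * num_processes
)
print(train_batch_size)  # 32 completions per optimizer step
# With num_generations=4 in GRPO, those 32 completions correspond to
# 8 unique prompts per step (4 sampled completions per prompt).
```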
experiment_config.yaml
# Model arguments
model_name_or_path: Qwen/Qwen2.5-Math-72B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_config: default
dataset_prompt_column: problem
system_prompt: "You are a helpful AI Assistant, designed to provide well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
bf16: true
use_vllm: true
vllm_mode: colocate
vllm_tensor_parallel_size: 8
vllm_gpu_memory_utilization: 0.5
vllm_enable_prefix_caching: false
vllm_max_model_len: 4096
do_eval: false
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
learning_rate: 3.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: cosine
max_prompt_length: 512
max_completion_length: 3584
max_steps: -1
num_generations: 4
num_train_epochs: 1
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: false
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 1.0
eval_strategy: "no"
save_strategy: "steps"
save_steps: 30
save_total_limit: 3
report_to:
- wandb
seed: 42
temperature: 0.7
warmup_ratio: 0.1
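For completeness, here is a minimal sketch of how a config like this gets consumed by a TRL-style GRPO script. It is not the exact script used: the format_reward below is a hypothetical stand-in for the accuracy/format reward functions named above, the output_dir is a placeholder, and the prompt-column renaming mimics dataset_prompt_column by hand.

```python
# Minimal sketch, not the exact training script: load the dataset, map the
# prompt column, and hand everything to TRL's GRPOTrainer with a placeholder reward.
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer


def format_reward(completions, **kwargs):
    # Hypothetical stand-in for the "format" reward: check for the <think>
    # tags demanded by the system prompt.
    return [1.0 if "<think>" in c and "</think>" in c else 0.0 for c in completions]


dataset = load_dataset("DigitalLearningGmbH/MATH-lighteval", "default", split="train")
dataset = dataset.rename_column("problem", "prompt")  # dataset_prompt_column by hand

training_args = GRPOConfig(
    output_dir="qwen2.5-math-72b-grpo",  # placeholder output dir
    bf16=True,
    use_vllm=True,
    num_generations=4,
    per_device_train_batch_size=4,
    gradient_checkpointing=True,
    learning_rate=3.0e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    max_prompt_length=512,
    max_completion_length=3584,
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-Math-72B",
    reward_funcs=[format_reward],  # accuracy reward omitted for brevity
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```

The actual run wires these values in from experiment_config.yaml via the script's argument parser rather than hard-coding them.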