|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 20, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 189.46875, |
|
"epoch": 0.1, |
|
"grad_norm": 72.04657745361328, |
|
"kl": 0.0, |
|
"learning_rate": 4.965903258506806e-07, |
|
"loss": 0.0, |
|
"reward": 3.999774469062686, |
|
"reward_std": 2.164257466851268, |
|
"rewards/concensus_correctness_reward_func": 1.5554375015199184, |
|
"rewards/consensus_reward_func": 0.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.7209931435063481, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.6295937476679683, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 132.5625, |
|
"epoch": 0.2, |
|
"grad_norm": 10451.45703125, |
|
"kl": 253.42494644969702, |
|
"learning_rate": 4.698684378016222e-07, |
|
"loss": 0.2534, |
|
"reward": 5.015479948371649, |
|
"reward_std": 0.6629242373019224, |
|
"rewards/concensus_correctness_reward_func": 1.5227500051259995, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.8574799399357289, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.8539999984204769, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 143.75, |
|
"epoch": 0.3, |
|
"grad_norm": 97.95394897460938, |
|
"kl": 96.7897967826575, |
|
"learning_rate": 4.193203929064353e-07, |
|
"loss": 0.0968, |
|
"reward": 5.306805005297065, |
|
"reward_std": 0.6057099380996078, |
|
"rewards/concensus_correctness_reward_func": 1.5708125047385693, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.8909612875431776, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.9856562428176403, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 163.25, |
|
"epoch": 0.4, |
|
"grad_norm": 118.51630401611328, |
|
"kl": 10.452200602274388, |
|
"learning_rate": 3.5042385616324236e-07, |
|
"loss": 0.0105, |
|
"reward": 5.741433784365654, |
|
"reward_std": 0.5551880679558963, |
|
"rewards/concensus_correctness_reward_func": 1.730749998241663, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.8832775540649891, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.1875, |
|
"rewards/xmlcount_reward_func": 1.0024062488228083, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 133.96875, |
|
"epoch": 0.5, |
|
"grad_norm": 311.2564392089844, |
|
"kl": 6.50762258892064, |
|
"learning_rate": 2.706448363680831e-07, |
|
"loss": 0.0065, |
|
"reward": 5.870141640305519, |
|
"reward_std": 1.113852635025978, |
|
"rewards/concensus_correctness_reward_func": 1.815562479197979, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.8371417056769133, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.1875, |
|
"rewards/xmlcount_reward_func": 1.0299374982714653, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 145.78125, |
|
"epoch": 0.6, |
|
"grad_norm": 77.84561157226562, |
|
"kl": 310228.7304797147, |
|
"learning_rate": 1.886286282148002e-07, |
|
"loss": 310.2288, |
|
"reward": 4.60096238553524, |
|
"reward_std": 0.7561608459218405, |
|
"rewards/concensus_correctness_reward_func": 1.2766249924898148, |
|
"rewards/consensus_reward_func": 1.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.8684623129665852, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.9246250009164214, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 136.3125, |
|
"epoch": 0.7, |
|
"grad_norm": 33276.56640625, |
|
"kl": 892.4592962227762, |
|
"learning_rate": 1.1326296046939333e-07, |
|
"loss": 0.8925, |
|
"reward": 5.898394346237183, |
|
"reward_std": 0.5051253397541586, |
|
"rewards/concensus_correctness_reward_func": 1.8743750005960464, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.8720193216577172, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.9801249913871288, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 150.3125, |
|
"epoch": 0.8, |
|
"grad_norm": 237.27255249023438, |
|
"kl": 170.10233331704512, |
|
"learning_rate": 5.271487265090163e-08, |
|
"loss": 0.1701, |
|
"reward": 5.7240906953811646, |
|
"reward_std": 0.7569041545502841, |
|
"rewards/concensus_correctness_reward_func": 1.7523125000298023, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.9259657934308052, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.9833124950528145, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 146.5625, |
|
"epoch": 0.9, |
|
"grad_norm": 62.39609909057617, |
|
"kl": 21.542335913982242, |
|
"learning_rate": 1.3545689574841341e-08, |
|
"loss": 0.0215, |
|
"reward": 5.123600989580154, |
|
"reward_std": 1.120502723613754, |
|
"rewards/concensus_correctness_reward_func": 1.5977499894797802, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.8194135017693043, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.8158124927431345, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 149.75, |
|
"epoch": 1.0, |
|
"grad_norm": 64.91241455078125, |
|
"kl": 4.645937927765772, |
|
"learning_rate": 0.0, |
|
"loss": 0.0046, |
|
"reward": 5.778949812054634, |
|
"reward_std": 0.8444689979951363, |
|
"rewards/concensus_correctness_reward_func": 1.751749999821186, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.8938873894512653, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.9770625047385693, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 20, |
|
"total_flos": 0.0, |
|
"train_loss": 31.168473226716742, |
|
"train_runtime": 124.1651, |
|
"train_samples_per_second": 2.577, |
|
"train_steps_per_second": 0.161 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 20, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|