|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.0014534883720930232, |
|
"eval_steps": 500, |
|
"global_step": 60, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 10086.0, |
|
"completions/max_terminated_length": 10086.0, |
|
"completions/mean_length": 4296.546875, |
|
"completions/mean_terminated_length": 4296.546875, |
|
"completions/min_length": 1720.0, |
|
"completions/min_terminated_length": 1720.0, |
|
"epoch": 2.4224806201550387e-05, |
|
"grad_norm": 0.016954593260394005, |
|
"kl": 0.0009393692016601562, |
|
"learning_rate": 0.0, |
|
"loss": -0.0003, |
|
"num_tokens": 601834.0, |
|
"reward": 0.4602593183517456, |
|
"reward_std": 0.24803586304187775, |
|
"rewards/avg_thinking_length_func": 185.02471923828125, |
|
"rewards/correct_answer_reward_func": 0.453125, |
|
"rewards/efficient_thinking_reward_func": 0.8889554441999474, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.7176268100738525, |
|
"rewards/tool_execution_reward_func": 1.9936248064041138, |
|
"rewards/visit_tool_reward_func": 0.9308543801307678, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 4.8449612403100775e-05, |
|
"grad_norm": 0.016953615886545852, |
|
"kl": 0.0009393692016601562, |
|
"learning_rate": 6.25e-08, |
|
"loss": -0.0003, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 7.267441860465116e-05, |
|
"grad_norm": 0.016864690676516626, |
|
"kl": 0.0009565353393554688, |
|
"learning_rate": 1.25e-07, |
|
"loss": -0.0003, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 9.689922480620155e-05, |
|
"grad_norm": 0.016822420848305722, |
|
"kl": 0.0009622573852539062, |
|
"learning_rate": 1.875e-07, |
|
"loss": -0.0003, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9385.0, |
|
"completions/max_terminated_length": 9385.0, |
|
"completions/mean_length": 4270.703125, |
|
"completions/mean_terminated_length": 4270.703125, |
|
"completions/min_length": 1390.0, |
|
"completions/min_terminated_length": 1390.0, |
|
"epoch": 0.00012112403100775194, |
|
"grad_norm": 0.025862550499858347, |
|
"kl": 0.000957489013671875, |
|
"learning_rate": 2.5e-07, |
|
"loss": 0.0031, |
|
"num_tokens": 1199795.0, |
|
"reward": 0.566771388053894, |
|
"reward_std": 0.48137491941452026, |
|
"rewards/avg_thinking_length_func": 182.33303833007812, |
|
"rewards/correct_answer_reward_func": 0.578125, |
|
"rewards/efficient_thinking_reward_func": 0.8707049785861538, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.7195165157318115, |
|
"rewards/tool_execution_reward_func": 1.9965277910232544, |
|
"rewards/visit_tool_reward_func": 0.9274243116378784, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00014534883720930232, |
|
"grad_norm": 0.025877236026611388, |
|
"kl": 0.0009489059448242188, |
|
"learning_rate": 3.1249999999999997e-07, |
|
"loss": 0.0031, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0001695736434108527, |
|
"grad_norm": 0.025817236127475232, |
|
"kl": 0.0009660720825195312, |
|
"learning_rate": 3.75e-07, |
|
"loss": 0.0031, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0001937984496124031, |
|
"grad_norm": 0.02584169829863559, |
|
"kl": 0.0009441375732421875, |
|
"learning_rate": 4.375e-07, |
|
"loss": 0.0031, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7008.0, |
|
"completions/max_terminated_length": 7008.0, |
|
"completions/mean_length": 4088.546875, |
|
"completions/mean_terminated_length": 4088.546875, |
|
"completions/min_length": 1705.0, |
|
"completions/min_terminated_length": 1705.0, |
|
"epoch": 0.00021802325581395349, |
|
"grad_norm": 0.01625597308376849, |
|
"kl": 0.0009918212890625, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0013, |
|
"num_tokens": 1783761.0, |
|
"reward": 0.3732198178768158, |
|
"reward_std": 0.2907864451408386, |
|
"rewards/avg_thinking_length_func": 177.95510864257812, |
|
"rewards/correct_answer_reward_func": 0.390625, |
|
"rewards/efficient_thinking_reward_func": 0.8993925619789238, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.6866124868392944, |
|
"rewards/tool_execution_reward_func": 1.950781226158142, |
|
"rewards/visit_tool_reward_func": 0.8574961423873901, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00024224806201550387, |
|
"grad_norm": 0.016618535814852814, |
|
"kl": 0.0009899139404296875, |
|
"learning_rate": 5.625e-07, |
|
"loss": 0.0013, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00026647286821705426, |
|
"grad_norm": 0.016248156263205492, |
|
"kl": 0.0009660720825195312, |
|
"learning_rate": 6.249999999999999e-07, |
|
"loss": 0.0013, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00029069767441860465, |
|
"grad_norm": 0.016111032400620007, |
|
"kl": 0.0009870529174804688, |
|
"learning_rate": 6.875e-07, |
|
"loss": 0.0013, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 6572.0, |
|
"completions/max_terminated_length": 6572.0, |
|
"completions/mean_length": 4119.703125, |
|
"completions/mean_terminated_length": 4119.703125, |
|
"completions/min_length": 1356.0, |
|
"completions/min_terminated_length": 1356.0, |
|
"epoch": 0.00031492248062015503, |
|
"grad_norm": 0.019643777904198217, |
|
"kl": 0.0009822845458984375, |
|
"learning_rate": 7.5e-07, |
|
"loss": -0.0008, |
|
"num_tokens": 2367034.0, |
|
"reward": 0.6774564981460571, |
|
"reward_std": 0.3563808798789978, |
|
"rewards/avg_thinking_length_func": 176.69476318359375, |
|
"rewards/correct_answer_reward_func": 0.625, |
|
"rewards/efficient_thinking_reward_func": 0.8704519537344548, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.6578426361083984, |
|
"rewards/tool_execution_reward_func": 2.0, |
|
"rewards/visit_tool_reward_func": 0.9361900091171265, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0003391472868217054, |
|
"grad_norm": 0.0194815826710202, |
|
"kl": 0.0010242462158203125, |
|
"learning_rate": 8.125e-07, |
|
"loss": -0.0008, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0003633720930232558, |
|
"grad_norm": 0.019402854833833996, |
|
"kl": 0.0010585784912109375, |
|
"learning_rate": 8.75e-07, |
|
"loss": -0.0008, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0003875968992248062, |
|
"grad_norm": 0.019438299719581362, |
|
"kl": 0.0011272430419921875, |
|
"learning_rate": 9.374999999999999e-07, |
|
"loss": -0.0008, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7597.0, |
|
"completions/max_terminated_length": 7597.0, |
|
"completions/mean_length": 4205.671875, |
|
"completions/mean_terminated_length": 4205.671875, |
|
"completions/min_length": 1507.0, |
|
"completions/min_terminated_length": 1507.0, |
|
"epoch": 0.0004118217054263566, |
|
"grad_norm": 0.014823687168402296, |
|
"kl": 0.0011005401611328125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0009, |
|
"num_tokens": 2985545.0, |
|
"reward": 0.3260263204574585, |
|
"reward_std": 0.2300996333360672, |
|
"rewards/avg_thinking_length_func": 177.14329528808594, |
|
"rewards/correct_answer_reward_func": 0.375, |
|
"rewards/efficient_thinking_reward_func": 0.8988714947132084, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.8095711469650269, |
|
"rewards/tool_execution_reward_func": 1.99609375, |
|
"rewards/visit_tool_reward_func": 0.852025032043457, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00043604651162790697, |
|
"grad_norm": 0.014727006858324664, |
|
"kl": 0.0011577606201171875, |
|
"learning_rate": 1.0625e-06, |
|
"loss": 0.0009, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00046027131782945736, |
|
"grad_norm": 0.014837711956269274, |
|
"kl": 0.0012722015380859375, |
|
"learning_rate": 1.125e-06, |
|
"loss": 0.0009, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.00048449612403100775, |
|
"grad_norm": 0.014894430575329584, |
|
"kl": 0.00146484375, |
|
"learning_rate": 1.1874999999999999e-06, |
|
"loss": 0.0009, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7476.0, |
|
"completions/max_terminated_length": 7476.0, |
|
"completions/mean_length": 4097.921875, |
|
"completions/mean_terminated_length": 4097.921875, |
|
"completions/min_length": 1514.0, |
|
"completions/min_terminated_length": 1514.0, |
|
"epoch": 0.0005087209302325581, |
|
"grad_norm": 0.0189498267274778, |
|
"kl": 0.0019931793212890625, |
|
"learning_rate": 1.2499999999999999e-06, |
|
"loss": 0.0003, |
|
"num_tokens": 3561495.0, |
|
"reward": 0.5717383623123169, |
|
"reward_std": 0.33007949590682983, |
|
"rewards/avg_thinking_length_func": 177.5142822265625, |
|
"rewards/correct_answer_reward_func": 0.53125, |
|
"rewards/efficient_thinking_reward_func": 0.8662384906971484, |
|
"rewards/format_reward_func": 0.9937499761581421, |
|
"rewards/num_xml_reward_func": 1.779766321182251, |
|
"rewards/tool_execution_reward_func": 1.979819416999817, |
|
"rewards/visit_tool_reward_func": 0.9004297256469727, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005329457364341085, |
|
"grad_norm": 0.019010527717988047, |
|
"kl": 0.00229644775390625, |
|
"learning_rate": 1.3125e-06, |
|
"loss": 0.0003, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005571705426356589, |
|
"grad_norm": 0.01910688815244073, |
|
"kl": 0.00276947021484375, |
|
"learning_rate": 1.375e-06, |
|
"loss": 0.0003, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0005813953488372093, |
|
"grad_norm": 0.019047374161024387, |
|
"kl": 0.00327301025390625, |
|
"learning_rate": 1.4375e-06, |
|
"loss": 0.0003, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7779.0, |
|
"completions/max_terminated_length": 7779.0, |
|
"completions/mean_length": 4011.9375, |
|
"completions/mean_terminated_length": 4011.9375, |
|
"completions/min_length": 1884.0, |
|
"completions/min_terminated_length": 1884.0, |
|
"epoch": 0.0006056201550387597, |
|
"grad_norm": 0.01969391991938911, |
|
"kl": 0.0029449462890625, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.0003, |
|
"num_tokens": 4148002.0, |
|
"reward": 0.4466557502746582, |
|
"reward_std": 0.2478387951850891, |
|
"rewards/avg_thinking_length_func": 174.6974639892578, |
|
"rewards/correct_answer_reward_func": 0.40625, |
|
"rewards/efficient_thinking_reward_func": 0.9054659197504085, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.806973934173584, |
|
"rewards/tool_execution_reward_func": 1.9922122955322266, |
|
"rewards/visit_tool_reward_func": 0.871803879737854, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006298449612403101, |
|
"grad_norm": 0.01979038843755439, |
|
"kl": 0.003414154052734375, |
|
"learning_rate": 1.5624999999999999e-06, |
|
"loss": 0.0003, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006540697674418605, |
|
"grad_norm": 0.019676702255338734, |
|
"kl": 0.004245758056640625, |
|
"learning_rate": 1.625e-06, |
|
"loss": 0.0003, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0006782945736434108, |
|
"grad_norm": 0.0198896583655868, |
|
"kl": 0.00508880615234375, |
|
"learning_rate": 1.6875e-06, |
|
"loss": 0.0003, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7881.0, |
|
"completions/max_terminated_length": 7881.0, |
|
"completions/mean_length": 4278.0, |
|
"completions/mean_terminated_length": 4278.0, |
|
"completions/min_length": 1269.0, |
|
"completions/min_terminated_length": 1269.0, |
|
"epoch": 0.0007025193798449612, |
|
"grad_norm": 0.02473412222614823, |
|
"kl": 0.00722503662109375, |
|
"learning_rate": 1.75e-06, |
|
"loss": 0.0005, |
|
"num_tokens": 4732732.0, |
|
"reward": 0.639769971370697, |
|
"reward_std": 0.3489268720149994, |
|
"rewards/avg_thinking_length_func": 183.79090881347656, |
|
"rewards/correct_answer_reward_func": 0.640625, |
|
"rewards/efficient_thinking_reward_func": 0.8433743364598003, |
|
"rewards/format_reward_func": 0.9991071224212646, |
|
"rewards/num_xml_reward_func": 1.686936616897583, |
|
"rewards/tool_execution_reward_func": 1.9818710088729858, |
|
"rewards/visit_tool_reward_func": 0.923589289188385, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007267441860465116, |
|
"grad_norm": 0.024757116664213524, |
|
"kl": 0.0076904296875, |
|
"learning_rate": 1.8125e-06, |
|
"loss": 0.0005, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.000750968992248062, |
|
"grad_norm": 0.02444644630643307, |
|
"kl": 0.0073394775390625, |
|
"learning_rate": 1.8749999999999998e-06, |
|
"loss": 0.0005, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0007751937984496124, |
|
"grad_norm": 0.024210451469423133, |
|
"kl": 0.007171630859375, |
|
"learning_rate": 1.9375e-06, |
|
"loss": 0.0005, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7912.0, |
|
"completions/max_terminated_length": 7912.0, |
|
"completions/mean_length": 4317.890625, |
|
"completions/mean_terminated_length": 4317.890625, |
|
"completions/min_length": 1736.0, |
|
"completions/min_terminated_length": 1736.0, |
|
"epoch": 0.0007994186046511628, |
|
"grad_norm": 0.020658762871057952, |
|
"kl": 0.007049560546875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0, |
|
"num_tokens": 5347783.0, |
|
"reward": 0.33683592081069946, |
|
"reward_std": 0.32624948024749756, |
|
"rewards/avg_thinking_length_func": 177.01129150390625, |
|
"rewards/correct_answer_reward_func": 0.375, |
|
"rewards/efficient_thinking_reward_func": 0.8817601664392056, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.5408036708831787, |
|
"rewards/tool_execution_reward_func": 1.9917367696762085, |
|
"rewards/visit_tool_reward_func": 0.9276807308197021, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008236434108527132, |
|
"grad_norm": 0.02072632567074888, |
|
"kl": 0.0077972412109375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008478682170542636, |
|
"grad_norm": 0.020770020029080613, |
|
"kl": 0.0087432861328125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0008720930232558139, |
|
"grad_norm": 0.020487067102301602, |
|
"kl": 0.0097198486328125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7378.0, |
|
"completions/max_terminated_length": 7378.0, |
|
"completions/mean_length": 4152.5, |
|
"completions/mean_terminated_length": 4152.5, |
|
"completions/min_length": 1423.0, |
|
"completions/min_terminated_length": 1423.0, |
|
"epoch": 0.0008963178294573643, |
|
"grad_norm": 0.022364107178309313, |
|
"kl": 0.0112152099609375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"num_tokens": 5921090.0, |
|
"reward": 0.6556656360626221, |
|
"reward_std": 0.5008378028869629, |
|
"rewards/avg_thinking_length_func": 170.4791259765625, |
|
"rewards/correct_answer_reward_func": 0.625, |
|
"rewards/efficient_thinking_reward_func": 0.8892575272805912, |
|
"rewards/format_reward_func": 0.987500011920929, |
|
"rewards/num_xml_reward_func": 1.5408031940460205, |
|
"rewards/tool_execution_reward_func": 1.96875, |
|
"rewards/visit_tool_reward_func": 0.9249746799468994, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009205426356589147, |
|
"grad_norm": 0.022597206540891295, |
|
"kl": 0.0123443603515625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009447674418604651, |
|
"grad_norm": 0.02246679376217943, |
|
"kl": 0.013580322265625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0009689922480620155, |
|
"grad_norm": 0.022296105800735398, |
|
"kl": 0.015106201171875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0001, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7494.0, |
|
"completions/max_terminated_length": 7494.0, |
|
"completions/mean_length": 4562.296875, |
|
"completions/mean_terminated_length": 4562.296875, |
|
"completions/min_length": 2143.0, |
|
"completions/min_terminated_length": 2143.0, |
|
"epoch": 0.0009932170542635659, |
|
"grad_norm": 0.021503135345542313, |
|
"kl": 0.015594482421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"num_tokens": 6556719.0, |
|
"reward": 0.47225743532180786, |
|
"reward_std": 0.3904932141304016, |
|
"rewards/avg_thinking_length_func": 169.57839965820312, |
|
"rewards/correct_answer_reward_func": 0.4375, |
|
"rewards/efficient_thinking_reward_func": 0.917264621947748, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.817958116531372, |
|
"rewards/tool_execution_reward_func": 1.9884111881256104, |
|
"rewards/visit_tool_reward_func": 0.9651369452476501, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010174418604651163, |
|
"grad_norm": 0.02149252867250571, |
|
"kl": 0.01715087890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0010416666666666667, |
|
"grad_norm": 0.02173596902997293, |
|
"kl": 0.018798828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001065891472868217, |
|
"grad_norm": 0.02188237517399594, |
|
"kl": 0.020751953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0007, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 9017.0, |
|
"completions/max_terminated_length": 9017.0, |
|
"completions/mean_length": 4664.796875, |
|
"completions/mean_terminated_length": 4664.796875, |
|
"completions/min_length": 1910.0, |
|
"completions/min_terminated_length": 1910.0, |
|
"epoch": 0.0010901162790697674, |
|
"grad_norm": 0.02354857583102173, |
|
"kl": 0.020477294921875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0014, |
|
"num_tokens": 7181732.0, |
|
"reward": 0.7991669178009033, |
|
"reward_std": 0.36247026920318604, |
|
"rewards/avg_thinking_length_func": 171.8461151123047, |
|
"rewards/correct_answer_reward_func": 0.703125, |
|
"rewards/efficient_thinking_reward_func": 0.8915984372821139, |
|
"rewards/format_reward_func": 0.9998437166213989, |
|
"rewards/num_xml_reward_func": 1.8501074314117432, |
|
"rewards/tool_execution_reward_func": 1.9971354007720947, |
|
"rewards/visit_tool_reward_func": 1.071668028831482, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011143410852713178, |
|
"grad_norm": 0.023994471938115103, |
|
"kl": 0.0224609375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0014, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011385658914728682, |
|
"grad_norm": 0.026516939220345738, |
|
"kl": 0.02508544921875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0014, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0011627906976744186, |
|
"grad_norm": 0.024485287814160223, |
|
"kl": 0.0262451171875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0014, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 8522.0, |
|
"completions/max_terminated_length": 8522.0, |
|
"completions/mean_length": 4866.125, |
|
"completions/mean_terminated_length": 4866.125, |
|
"completions/min_length": 1959.0, |
|
"completions/min_terminated_length": 1959.0, |
|
"epoch": 0.001187015503875969, |
|
"grad_norm": 0.02407332594201, |
|
"kl": 0.032012939453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0014, |
|
"num_tokens": 7868034.0, |
|
"reward": 0.39128515124320984, |
|
"reward_std": 0.3533371090888977, |
|
"rewards/avg_thinking_length_func": 164.74734497070312, |
|
"rewards/correct_answer_reward_func": 0.359375, |
|
"rewards/efficient_thinking_reward_func": 0.9209367558816545, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.6406757831573486, |
|
"rewards/tool_execution_reward_func": 1.98927903175354, |
|
"rewards/visit_tool_reward_func": 1.0120830535888672, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0012112403100775194, |
|
"grad_norm": 0.02479690454991753, |
|
"kl": 0.035888671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0014, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0012354651162790698, |
|
"grad_norm": 0.027012142633289393, |
|
"kl": 0.04046630859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0014, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0012596899224806201, |
|
"grad_norm": 0.026499465739179152, |
|
"kl": 0.04803466796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0014, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7622.0, |
|
"completions/max_terminated_length": 7622.0, |
|
"completions/mean_length": 4509.75, |
|
"completions/mean_terminated_length": 4509.75, |
|
"completions/min_length": 1816.0, |
|
"completions/min_terminated_length": 1816.0, |
|
"epoch": 0.0012839147286821705, |
|
"grad_norm": 0.019741394516818018, |
|
"kl": 0.04510498046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0, |
|
"num_tokens": 8481102.0, |
|
"reward": 0.7655854225158691, |
|
"reward_std": 0.27847254276275635, |
|
"rewards/avg_thinking_length_func": 158.9434051513672, |
|
"rewards/correct_answer_reward_func": 0.671875, |
|
"rewards/efficient_thinking_reward_func": 0.884494477975468, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.8834664821624756, |
|
"rewards/tool_execution_reward_func": 2.0, |
|
"rewards/visit_tool_reward_func": 1.1049017906188965, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.001308139534883721, |
|
"grad_norm": 0.028517188784132036, |
|
"kl": 0.06060791015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0013323643410852713, |
|
"grad_norm": 0.02643367822401968, |
|
"kl": 0.06280517578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0013565891472868217, |
|
"grad_norm": 0.020594752118506976, |
|
"kl": 0.056884765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0001, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.0, |
|
"completions/max_length": 7375.0, |
|
"completions/max_terminated_length": 7375.0, |
|
"completions/mean_length": 4285.046875, |
|
"completions/mean_terminated_length": 4285.046875, |
|
"completions/min_length": 2418.0, |
|
"completions/min_terminated_length": 2418.0, |
|
"epoch": 0.001380813953488372, |
|
"grad_norm": 0.019100627823517295, |
|
"kl": 0.06195068359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"num_tokens": 9112297.0, |
|
"reward": 0.5274717807769775, |
|
"reward_std": 0.2380232810974121, |
|
"rewards/avg_thinking_length_func": 145.75924682617188, |
|
"rewards/correct_answer_reward_func": 0.453125, |
|
"rewards/efficient_thinking_reward_func": 0.9274070198828231, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/num_xml_reward_func": 1.7929463386535645, |
|
"rewards/tool_execution_reward_func": 1.9959805011749268, |
|
"rewards/visit_tool_reward_func": 1.0335674285888672, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0014050387596899225, |
|
"grad_norm": 0.019834849658967178, |
|
"kl": 0.06695556640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0014292635658914728, |
|
"grad_norm": 0.020359737753586633, |
|
"kl": 0.0740966796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"epoch": 0.0014534883720930232, |
|
"grad_norm": 0.020904893352951728, |
|
"kl": 0.085693359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0005, |
|
"step": 60 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 640, |
|
"num_input_tokens_seen": 9112297, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|