|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 161, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 77.421875, |
|
"epoch": 0.006211180124223602, |
|
"grad_norm": 4.007043838500977, |
|
"kl": 0.0, |
|
"learning_rate": 9.937888198757763e-07, |
|
"loss": -0.0, |
|
"reward": 1.46875, |
|
"reward_mean": 1.46875, |
|
"reward_std": 0.23356688022613525, |
|
"rewards/accuracy_reward": 0.46875, |
|
"rewards/format_reward": 1.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"advantages": 1.30385160446167e-08, |
|
"completion_length": 78.921875, |
|
"epoch": 0.012422360248447204, |
|
"grad_norm": 7.15122652053833, |
|
"kl": 0.00041961669921875, |
|
"learning_rate": 9.875776397515528e-07, |
|
"loss": 0.0, |
|
"reward": 1.53125, |
|
"reward_mean": 1.53125, |
|
"reward_std": 0.2845909595489502, |
|
"rewards/accuracy_reward": 0.53125, |
|
"rewards/format_reward": 1.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 78.671875, |
|
"epoch": 0.018633540372670808, |
|
"grad_norm": 2.9171459674835205, |
|
"kl": 0.000392913818359375, |
|
"learning_rate": 9.813664596273291e-07, |
|
"loss": 0.0, |
|
"reward": 1.6875, |
|
"reward_mean": 1.6875, |
|
"reward_std": 0.1552036553621292, |
|
"rewards/accuracy_reward": 0.6875, |
|
"rewards/format_reward": 1.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 77.78125, |
|
"epoch": 0.024844720496894408, |
|
"grad_norm": 5.474589824676514, |
|
"kl": 0.000759124755859375, |
|
"learning_rate": 9.751552795031055e-07, |
|
"loss": 0.0001, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.213067427277565, |
|
"rewards/accuracy_reward": 0.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"advantages": -2.7939677238464355e-09, |
|
"completion_length": 90.8125, |
|
"epoch": 0.031055900621118012, |
|
"grad_norm": 5.2480363845825195, |
|
"kl": 0.001495361328125, |
|
"learning_rate": 9.68944099378882e-07, |
|
"loss": 0.0001, |
|
"reward": 1.53125, |
|
"reward_mean": 1.53125, |
|
"reward_std": 0.17570313811302185, |
|
"rewards/accuracy_reward": 0.53125, |
|
"rewards/format_reward": 1.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"advantages": 6.51925802230835e-09, |
|
"completion_length": 81.203125, |
|
"epoch": 0.037267080745341616, |
|
"grad_norm": 8.329508781433105, |
|
"kl": 0.006256103515625, |
|
"learning_rate": 9.627329192546583e-07, |
|
"loss": 0.0006, |
|
"reward": 1.375, |
|
"reward_mean": 1.375, |
|
"reward_std": 0.26409146189689636, |
|
"rewards/accuracy_reward": 0.375, |
|
"rewards/format_reward": 1.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 84.265625, |
|
"epoch": 0.043478260869565216, |
|
"grad_norm": 9.400680541992188, |
|
"kl": 0.0106201171875, |
|
"learning_rate": 9.565217391304349e-07, |
|
"loss": 0.0011, |
|
"reward": 1.65625, |
|
"reward_mean": 1.65625, |
|
"reward_std": 0.2404065877199173, |
|
"rewards/accuracy_reward": 0.65625, |
|
"rewards/format_reward": 1.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"advantages": -2.7939677238464355e-09, |
|
"completion_length": 84.109375, |
|
"epoch": 0.049689440993788817, |
|
"grad_norm": 6.011223316192627, |
|
"kl": 0.0057373046875, |
|
"learning_rate": 9.503105590062112e-07, |
|
"loss": 0.0006, |
|
"reward": 1.4375, |
|
"reward_mean": 1.4375, |
|
"reward_std": 0.22461533546447754, |
|
"rewards/accuracy_reward": 0.453125, |
|
"rewards/format_reward": 0.984375, |
|
"step": 8 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 87.046875, |
|
"epoch": 0.055900621118012424, |
|
"grad_norm": 4.103212356567383, |
|
"kl": 0.00244140625, |
|
"learning_rate": 9.440993788819875e-07, |
|
"loss": 0.0002, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.0578637570142746, |
|
"rewards/accuracy_reward": 0.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"advantages": -7.450580596923828e-09, |
|
"completion_length": 76.15625, |
|
"epoch": 0.062111801242236024, |
|
"grad_norm": 7.4132466316223145, |
|
"kl": 0.01287841796875, |
|
"learning_rate": 9.37888198757764e-07, |
|
"loss": 0.0013, |
|
"reward": 1.4375, |
|
"reward_mean": 1.4375, |
|
"reward_std": 0.3335031569004059, |
|
"rewards/accuracy_reward": 0.4375, |
|
"rewards/format_reward": 1.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 76.75, |
|
"epoch": 0.06832298136645963, |
|
"grad_norm": 4.6751017570495605, |
|
"kl": 0.0130615234375, |
|
"learning_rate": 9.316770186335403e-07, |
|
"loss": 0.0013, |
|
"reward": 1.28125, |
|
"reward_mean": 1.28125, |
|
"reward_std": 0.0578637570142746, |
|
"rewards/accuracy_reward": 0.28125, |
|
"rewards/format_reward": 1.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 76.390625, |
|
"epoch": 0.07453416149068323, |
|
"grad_norm": 7.682182788848877, |
|
"kl": 0.018310546875, |
|
"learning_rate": 9.254658385093167e-07, |
|
"loss": 0.0018, |
|
"reward": 1.734375, |
|
"reward_mean": 1.734375, |
|
"reward_std": 0.15992169082164764, |
|
"rewards/accuracy_reward": 0.734375, |
|
"rewards/format_reward": 1.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"advantages": -2.7939677238464355e-09, |
|
"completion_length": 85.234375, |
|
"epoch": 0.08074534161490683, |
|
"grad_norm": 4.814305305480957, |
|
"kl": 0.00396728515625, |
|
"learning_rate": 9.19254658385093e-07, |
|
"loss": 0.0004, |
|
"reward": 1.8125, |
|
"reward_mean": 1.8125, |
|
"reward_std": 0.22461533546447754, |
|
"rewards/accuracy_reward": 0.8125, |
|
"rewards/format_reward": 1.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 86.3125, |
|
"epoch": 0.08695652173913043, |
|
"grad_norm": 189.3062744140625, |
|
"kl": 0.033203125, |
|
"learning_rate": 9.130434782608695e-07, |
|
"loss": 0.0033, |
|
"reward": 1.609375, |
|
"reward_mean": 1.609375, |
|
"reward_std": 0.19044628739356995, |
|
"rewards/accuracy_reward": 0.609375, |
|
"rewards/format_reward": 1.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"advantages": 2.7939677238464355e-09, |
|
"completion_length": 69.8125, |
|
"epoch": 0.09316770186335403, |
|
"grad_norm": 3.5520412921905518, |
|
"kl": 0.01129150390625, |
|
"learning_rate": 9.06832298136646e-07, |
|
"loss": 0.0011, |
|
"reward": 1.78125, |
|
"reward_mean": 1.78125, |
|
"reward_std": 0.10888782143592834, |
|
"rewards/accuracy_reward": 0.78125, |
|
"rewards/format_reward": 1.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"advantages": -9.313225746154785e-10, |
|
"completion_length": 76.296875, |
|
"epoch": 0.09937888198757763, |
|
"grad_norm": 3.526542901992798, |
|
"kl": 0.00909423828125, |
|
"learning_rate": 9.006211180124223e-07, |
|
"loss": 0.0009, |
|
"reward": 1.609375, |
|
"reward_mean": 1.609375, |
|
"reward_std": 0.12255740165710449, |
|
"rewards/accuracy_reward": 0.609375, |
|
"rewards/format_reward": 1.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 86.3125, |
|
"epoch": 0.10559006211180125, |
|
"grad_norm": 5.98048210144043, |
|
"kl": 0.0057373046875, |
|
"learning_rate": 8.944099378881988e-07, |
|
"loss": 0.0006, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.2041158676147461, |
|
"rewards/accuracy_reward": 0.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 82.921875, |
|
"epoch": 0.11180124223602485, |
|
"grad_norm": 2.0705599784851074, |
|
"kl": 0.00592041015625, |
|
"learning_rate": 8.881987577639751e-07, |
|
"loss": 0.0006, |
|
"reward": 1.5625, |
|
"reward_mean": 1.5625, |
|
"reward_std": 0.06681530922651291, |
|
"rewards/accuracy_reward": 0.578125, |
|
"rewards/format_reward": 0.984375, |
|
"step": 18 |
|
}, |
|
{ |
|
"advantages": -2.7939677238464355e-09, |
|
"completion_length": 81.40625, |
|
"epoch": 0.11801242236024845, |
|
"grad_norm": 9.266715049743652, |
|
"kl": 0.0079345703125, |
|
"learning_rate": 8.819875776397515e-07, |
|
"loss": 0.0008, |
|
"reward": 1.546875, |
|
"reward_mean": 1.546875, |
|
"reward_std": 0.2109457552433014, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.984375, |
|
"step": 19 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 80.203125, |
|
"epoch": 0.12422360248447205, |
|
"grad_norm": 10.367863655090332, |
|
"kl": 0.0072021484375, |
|
"learning_rate": 8.757763975155279e-07, |
|
"loss": 0.0007, |
|
"reward": 1.40625, |
|
"reward_mean": 1.40625, |
|
"reward_std": 0.2404065728187561, |
|
"rewards/accuracy_reward": 0.40625, |
|
"rewards/format_reward": 1.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 75.734375, |
|
"epoch": 0.13043478260869565, |
|
"grad_norm": 2.6553478240966797, |
|
"kl": 0.00592041015625, |
|
"learning_rate": 8.695652173913043e-07, |
|
"loss": 0.0006, |
|
"reward": 1.578125, |
|
"reward_mean": 1.578125, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/accuracy_reward": 0.578125, |
|
"rewards/format_reward": 1.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 85.8125, |
|
"epoch": 0.13664596273291926, |
|
"grad_norm": 3.458266496658325, |
|
"kl": 0.00604248046875, |
|
"learning_rate": 8.633540372670807e-07, |
|
"loss": 0.0006, |
|
"reward": 1.515625, |
|
"reward_mean": 1.515625, |
|
"reward_std": 0.15981829166412354, |
|
"rewards/accuracy_reward": 0.53125, |
|
"rewards/format_reward": 0.984375, |
|
"step": 22 |
|
}, |
|
{ |
|
"advantages": -2.7939677238464355e-09, |
|
"completion_length": 79.328125, |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 3.2002384662628174, |
|
"kl": 0.00543212890625, |
|
"learning_rate": 8.57142857142857e-07, |
|
"loss": 0.0005, |
|
"reward": 1.671875, |
|
"reward_mean": 1.671875, |
|
"reward_std": 0.2109457552433014, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 1.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 75.375, |
|
"epoch": 0.14906832298136646, |
|
"grad_norm": 5.946903705596924, |
|
"kl": 0.0087890625, |
|
"learning_rate": 8.509316770186336e-07, |
|
"loss": 0.0009, |
|
"reward": 1.484375, |
|
"reward_mean": 1.484375, |
|
"reward_std": 0.19044628739356995, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.984375, |
|
"step": 24 |
|
}, |
|
{ |
|
"advantages": -9.313225746154785e-10, |
|
"completion_length": 68.8125, |
|
"epoch": 0.15527950310559005, |
|
"grad_norm": 4.977855682373047, |
|
"kl": 0.008544921875, |
|
"learning_rate": 8.447204968944099e-07, |
|
"loss": 0.0009, |
|
"reward": 1.734375, |
|
"reward_mean": 1.734375, |
|
"reward_std": 0.12255740165710449, |
|
"rewards/accuracy_reward": 0.734375, |
|
"rewards/format_reward": 1.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"advantages": -8.381903171539307e-09, |
|
"completion_length": 83.59375, |
|
"epoch": 0.16149068322981366, |
|
"grad_norm": 4.409206390380859, |
|
"kl": 0.01239013671875, |
|
"learning_rate": 8.385093167701863e-07, |
|
"loss": 0.0012, |
|
"reward": 1.609375, |
|
"reward_mean": 1.609375, |
|
"reward_std": 0.2198973000049591, |
|
"rewards/accuracy_reward": 0.609375, |
|
"rewards/format_reward": 1.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 78.109375, |
|
"epoch": 0.16770186335403728, |
|
"grad_norm": 3.1185989379882812, |
|
"kl": 0.006072998046875, |
|
"learning_rate": 8.322981366459628e-07, |
|
"loss": 0.0006, |
|
"reward": 1.65625, |
|
"reward_mean": 1.65625, |
|
"reward_std": 0.10888782143592834, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 0.984375, |
|
"step": 27 |
|
}, |
|
{ |
|
"advantages": 4.6566128730773926e-09, |
|
"completion_length": 72.0625, |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 4.9565935134887695, |
|
"kl": 0.010009765625, |
|
"learning_rate": 8.260869565217391e-07, |
|
"loss": 0.001, |
|
"reward": 1.34375, |
|
"reward_mean": 1.34375, |
|
"reward_std": 0.16675157845020294, |
|
"rewards/accuracy_reward": 0.34375, |
|
"rewards/format_reward": 1.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"advantages": -4.6566128730773926e-09, |
|
"completion_length": 84.953125, |
|
"epoch": 0.18012422360248448, |
|
"grad_norm": 4.35609769821167, |
|
"kl": 0.011962890625, |
|
"learning_rate": 8.198757763975155e-07, |
|
"loss": 0.0012, |
|
"reward": 1.46875, |
|
"reward_mean": 1.46875, |
|
"reward_std": 0.25513991713523865, |
|
"rewards/accuracy_reward": 0.484375, |
|
"rewards/format_reward": 0.984375, |
|
"step": 29 |
|
}, |
|
{ |
|
"advantages": -9.313225746154785e-10, |
|
"completion_length": 85.34375, |
|
"epoch": 0.18633540372670807, |
|
"grad_norm": 5.767938137054443, |
|
"kl": 0.009033203125, |
|
"learning_rate": 8.136645962732918e-07, |
|
"loss": 0.0009, |
|
"reward": 1.609375, |
|
"reward_mean": 1.609375, |
|
"reward_std": 0.1530819982290268, |
|
"rewards/accuracy_reward": 0.609375, |
|
"rewards/format_reward": 1.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"advantages": -5.587935447692871e-09, |
|
"completion_length": 83.5625, |
|
"epoch": 0.19254658385093168, |
|
"grad_norm": 49.12059783935547, |
|
"kl": 0.0091552734375, |
|
"learning_rate": 8.074534161490683e-07, |
|
"loss": 0.0009, |
|
"reward": 1.578125, |
|
"reward_mean": 1.578125, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/accuracy_reward": 0.578125, |
|
"rewards/format_reward": 1.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"advantages": 4.6566128730773926e-09, |
|
"completion_length": 77.734375, |
|
"epoch": 0.19875776397515527, |
|
"grad_norm": 1.4828208684921265, |
|
"kl": 0.00970458984375, |
|
"learning_rate": 8.012422360248446e-07, |
|
"loss": 0.001, |
|
"reward": 1.421875, |
|
"reward_mean": 1.421875, |
|
"reward_std": 0.0646936446428299, |
|
"rewards/accuracy_reward": 0.421875, |
|
"rewards/format_reward": 1.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 78.453125, |
|
"epoch": 0.20496894409937888, |
|
"grad_norm": 7.876468658447266, |
|
"kl": 0.020263671875, |
|
"learning_rate": 7.95031055900621e-07, |
|
"loss": 0.002, |
|
"reward": 1.734375, |
|
"reward_mean": 1.734375, |
|
"reward_std": 0.2109457552433014, |
|
"rewards/accuracy_reward": 0.734375, |
|
"rewards/format_reward": 1.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 80.5, |
|
"epoch": 0.2111801242236025, |
|
"grad_norm": 3.8213541507720947, |
|
"kl": 0.01361083984375, |
|
"learning_rate": 7.888198757763976e-07, |
|
"loss": 0.0014, |
|
"reward": 1.46875, |
|
"reward_mean": 1.46875, |
|
"reward_std": 0.0578637570142746, |
|
"rewards/accuracy_reward": 0.46875, |
|
"rewards/format_reward": 1.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"advantages": -6.51925802230835e-09, |
|
"completion_length": 89.5625, |
|
"epoch": 0.21739130434782608, |
|
"grad_norm": 3.453101634979248, |
|
"kl": 0.01470947265625, |
|
"learning_rate": 7.826086956521739e-07, |
|
"loss": 0.0015, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.17570312321186066, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 90.28125, |
|
"epoch": 0.2236024844720497, |
|
"grad_norm": 2.642101526260376, |
|
"kl": 0.0107421875, |
|
"learning_rate": 7.763975155279503e-07, |
|
"loss": 0.0011, |
|
"reward": 1.5625, |
|
"reward_mean": 1.5625, |
|
"reward_std": 0.06681530922651291, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 1.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 80.3125, |
|
"epoch": 0.22981366459627328, |
|
"grad_norm": 3.5424673557281494, |
|
"kl": 0.01239013671875, |
|
"learning_rate": 7.701863354037266e-07, |
|
"loss": 0.0012, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.0578637570142746, |
|
"rewards/accuracy_reward": 0.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 84.140625, |
|
"epoch": 0.2360248447204969, |
|
"grad_norm": 0.38800248503685, |
|
"kl": 0.01275634765625, |
|
"learning_rate": 7.639751552795031e-07, |
|
"loss": 0.0013, |
|
"reward": 1.375, |
|
"reward_mean": 1.375, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.390625, |
|
"rewards/format_reward": 0.984375, |
|
"step": 38 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 89.109375, |
|
"epoch": 0.2422360248447205, |
|
"grad_norm": 2.645474433898926, |
|
"kl": 0.01397705078125, |
|
"learning_rate": 7.577639751552795e-07, |
|
"loss": 0.0014, |
|
"reward": 1.515625, |
|
"reward_mean": 1.515625, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/accuracy_reward": 0.515625, |
|
"rewards/format_reward": 1.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 72.296875, |
|
"epoch": 0.2484472049689441, |
|
"grad_norm": 8.574762344360352, |
|
"kl": 0.0159912109375, |
|
"learning_rate": 7.515527950310558e-07, |
|
"loss": 0.0016, |
|
"reward": 1.671875, |
|
"reward_mean": 1.671875, |
|
"reward_std": 0.23144522309303284, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 1.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"advantages": 7.450580596923828e-09, |
|
"completion_length": 86.046875, |
|
"epoch": 0.2546583850931677, |
|
"grad_norm": 36.26329040527344, |
|
"kl": 0.0147705078125, |
|
"learning_rate": 7.453416149068323e-07, |
|
"loss": 0.0015, |
|
"reward": 1.65625, |
|
"reward_mean": 1.65625, |
|
"reward_std": 0.23356688022613525, |
|
"rewards/accuracy_reward": 0.65625, |
|
"rewards/format_reward": 1.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 77.0625, |
|
"epoch": 0.2608695652173913, |
|
"grad_norm": 10.992830276489258, |
|
"kl": 0.0113525390625, |
|
"learning_rate": 7.391304347826086e-07, |
|
"loss": 0.0011, |
|
"reward": 1.703125, |
|
"reward_mean": 1.703125, |
|
"reward_std": 0.24464011192321777, |
|
"rewards/accuracy_reward": 0.703125, |
|
"rewards/format_reward": 1.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 86.53125, |
|
"epoch": 0.2670807453416149, |
|
"grad_norm": 6.251725673675537, |
|
"kl": 0.009033203125, |
|
"learning_rate": 7.329192546583851e-07, |
|
"loss": 0.0009, |
|
"reward": 1.609375, |
|
"reward_mean": 1.609375, |
|
"reward_std": 0.23144522309303284, |
|
"rewards/accuracy_reward": 0.609375, |
|
"rewards/format_reward": 1.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"advantages": -4.6566128730773926e-09, |
|
"completion_length": 86.4375, |
|
"epoch": 0.2732919254658385, |
|
"grad_norm": 3.8048486709594727, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 7.267080745341615e-07, |
|
"loss": 0.0014, |
|
"reward": 1.765625, |
|
"reward_mean": 1.765625, |
|
"reward_std": 0.17358146607875824, |
|
"rewards/accuracy_reward": 0.765625, |
|
"rewards/format_reward": 1.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 84.21875, |
|
"epoch": 0.2795031055900621, |
|
"grad_norm": 2.5062499046325684, |
|
"kl": 0.00811767578125, |
|
"learning_rate": 7.204968944099379e-07, |
|
"loss": 0.0008, |
|
"reward": 1.796875, |
|
"reward_mean": 1.796875, |
|
"reward_std": 0.11100947856903076, |
|
"rewards/accuracy_reward": 0.8125, |
|
"rewards/format_reward": 0.984375, |
|
"step": 45 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 77.9375, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 3.8415560722351074, |
|
"kl": 0.01165771484375, |
|
"learning_rate": 7.142857142857143e-07, |
|
"loss": 0.0012, |
|
"reward": 1.53125, |
|
"reward_mean": 1.53125, |
|
"reward_std": 0.1462521106004715, |
|
"rewards/accuracy_reward": 0.53125, |
|
"rewards/format_reward": 1.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 82.6875, |
|
"epoch": 0.2919254658385093, |
|
"grad_norm": 2.903069496154785, |
|
"kl": 0.012451171875, |
|
"learning_rate": 7.080745341614906e-07, |
|
"loss": 0.0012, |
|
"reward": 1.578125, |
|
"reward_mean": 1.578125, |
|
"reward_std": 0.11100947856903076, |
|
"rewards/accuracy_reward": 0.59375, |
|
"rewards/format_reward": 0.984375, |
|
"step": 47 |
|
}, |
|
{ |
|
"advantages": -2.7939677238464355e-09, |
|
"completion_length": 75.578125, |
|
"epoch": 0.2981366459627329, |
|
"grad_norm": 11.884781837463379, |
|
"kl": 0.0125732421875, |
|
"learning_rate": 7.018633540372671e-07, |
|
"loss": 0.0013, |
|
"reward": 1.65625, |
|
"reward_mean": 1.65625, |
|
"reward_std": 0.17570312321186066, |
|
"rewards/accuracy_reward": 0.65625, |
|
"rewards/format_reward": 1.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 73.34375, |
|
"epoch": 0.30434782608695654, |
|
"grad_norm": 2.234876871109009, |
|
"kl": 0.0084228515625, |
|
"learning_rate": 6.956521739130434e-07, |
|
"loss": 0.0008, |
|
"reward": 1.484375, |
|
"reward_mean": 1.484375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/accuracy_reward": 0.484375, |
|
"rewards/format_reward": 1.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 81.8125, |
|
"epoch": 0.3105590062111801, |
|
"grad_norm": 4.401739597320557, |
|
"kl": 0.007232666015625, |
|
"learning_rate": 6.894409937888198e-07, |
|
"loss": 0.0007, |
|
"reward": 1.765625, |
|
"reward_mean": 1.765625, |
|
"reward_std": 0.17782479524612427, |
|
"rewards/accuracy_reward": 0.765625, |
|
"rewards/format_reward": 1.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 84.015625, |
|
"epoch": 0.3167701863354037, |
|
"grad_norm": 0.28293830156326294, |
|
"kl": 0.0062255859375, |
|
"learning_rate": 6.832298136645962e-07, |
|
"loss": 0.0006, |
|
"reward": 2.0, |
|
"reward_mean": 2.0, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 1.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 79.125, |
|
"epoch": 0.32298136645962733, |
|
"grad_norm": 2.2039246559143066, |
|
"kl": 0.0106201171875, |
|
"learning_rate": 6.770186335403726e-07, |
|
"loss": 0.0011, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"advantages": 9.313225746154785e-10, |
|
"completion_length": 76.0625, |
|
"epoch": 0.32919254658385094, |
|
"grad_norm": 4.176709175109863, |
|
"kl": 0.01123046875, |
|
"learning_rate": 6.708074534161491e-07, |
|
"loss": 0.0011, |
|
"reward": 1.640625, |
|
"reward_mean": 1.640625, |
|
"reward_std": 0.1530819982290268, |
|
"rewards/accuracy_reward": 0.640625, |
|
"rewards/format_reward": 1.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 81.125, |
|
"epoch": 0.33540372670807456, |
|
"grad_norm": 30.12848663330078, |
|
"kl": 0.099609375, |
|
"learning_rate": 6.645962732919254e-07, |
|
"loss": 0.01, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.1462520956993103, |
|
"rewards/accuracy_reward": 0.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 80.609375, |
|
"epoch": 0.3416149068322981, |
|
"grad_norm": 12.808406829833984, |
|
"kl": 0.01416015625, |
|
"learning_rate": 6.583850931677019e-07, |
|
"loss": 0.0014, |
|
"reward": 1.6875, |
|
"reward_mean": 1.6875, |
|
"reward_std": 0.2238783985376358, |
|
"rewards/accuracy_reward": 0.703125, |
|
"rewards/format_reward": 0.984375, |
|
"step": 55 |
|
}, |
|
{ |
|
"advantages": -5.587935447692871e-09, |
|
"completion_length": 76.15625, |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 5.750046253204346, |
|
"kl": 0.01019287109375, |
|
"learning_rate": 6.521739130434782e-07, |
|
"loss": 0.001, |
|
"reward": 1.5, |
|
"reward_mean": 1.5, |
|
"reward_std": 0.2041158676147461, |
|
"rewards/accuracy_reward": 0.515625, |
|
"rewards/format_reward": 0.984375, |
|
"step": 56 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 76.765625, |
|
"epoch": 0.35403726708074534, |
|
"grad_norm": 4.7853102684021, |
|
"kl": 0.010986328125, |
|
"learning_rate": 6.459627329192546e-07, |
|
"loss": 0.0011, |
|
"reward": 1.328125, |
|
"reward_mean": 1.328125, |
|
"reward_std": 0.19044628739356995, |
|
"rewards/accuracy_reward": 0.34375, |
|
"rewards/format_reward": 0.984375, |
|
"step": 57 |
|
}, |
|
{ |
|
"advantages": 4.6566128730773926e-09, |
|
"completion_length": 88.796875, |
|
"epoch": 0.36024844720496896, |
|
"grad_norm": 1.7344197034835815, |
|
"kl": 0.00982666015625, |
|
"learning_rate": 6.39751552795031e-07, |
|
"loss": 0.001, |
|
"reward": 1.671875, |
|
"reward_mean": 1.671875, |
|
"reward_std": 0.0646936446428299, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 1.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"advantages": -9.313225746154785e-09, |
|
"completion_length": 84.28125, |
|
"epoch": 0.36645962732919257, |
|
"grad_norm": 3.1260499954223633, |
|
"kl": 0.01416015625, |
|
"learning_rate": 6.335403726708074e-07, |
|
"loss": 0.0014, |
|
"reward": 1.6875, |
|
"reward_mean": 1.6875, |
|
"reward_std": 0.1828794628381729, |
|
"rewards/accuracy_reward": 0.703125, |
|
"rewards/format_reward": 0.984375, |
|
"step": 59 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 77.21875, |
|
"epoch": 0.37267080745341613, |
|
"grad_norm": 1.7963190078735352, |
|
"kl": 0.0089111328125, |
|
"learning_rate": 6.273291925465838e-07, |
|
"loss": 0.0009, |
|
"reward": 1.84375, |
|
"reward_mean": 1.84375, |
|
"reward_std": 0.0578637570142746, |
|
"rewards/accuracy_reward": 0.84375, |
|
"rewards/format_reward": 1.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 82.25, |
|
"epoch": 0.37888198757763975, |
|
"grad_norm": 2.5049538612365723, |
|
"kl": 0.00787353515625, |
|
"learning_rate": 6.211180124223601e-07, |
|
"loss": 0.0008, |
|
"reward": 1.625, |
|
"reward_mean": 1.625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 1.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"advantages": -1.210719347000122e-08, |
|
"completion_length": 80.375, |
|
"epoch": 0.38509316770186336, |
|
"grad_norm": 5.739541530609131, |
|
"kl": 0.01312255859375, |
|
"learning_rate": 6.149068322981367e-07, |
|
"loss": 0.0013, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.2177756428718567, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 84.296875, |
|
"epoch": 0.391304347826087, |
|
"grad_norm": 4.335031032562256, |
|
"kl": 0.01116943359375, |
|
"learning_rate": 6.08695652173913e-07, |
|
"loss": 0.0011, |
|
"reward": 1.90625, |
|
"reward_mean": 1.90625, |
|
"reward_std": 0.2041158676147461, |
|
"rewards/accuracy_reward": 0.90625, |
|
"rewards/format_reward": 1.0, |
|
"step": 63 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 86.828125, |
|
"epoch": 0.39751552795031053, |
|
"grad_norm": 4.443232536315918, |
|
"kl": 0.01336669921875, |
|
"learning_rate": 6.024844720496894e-07, |
|
"loss": 0.0013, |
|
"reward": 1.703125, |
|
"reward_mean": 1.703125, |
|
"reward_std": 0.19939783215522766, |
|
"rewards/accuracy_reward": 0.703125, |
|
"rewards/format_reward": 1.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"advantages": 5.587935447692871e-09, |
|
"completion_length": 75.71875, |
|
"epoch": 0.40372670807453415, |
|
"grad_norm": 7.092515468597412, |
|
"kl": 0.01251220703125, |
|
"learning_rate": 5.962732919254659e-07, |
|
"loss": 0.0013, |
|
"reward": 1.65625, |
|
"reward_mean": 1.65625, |
|
"reward_std": 0.23827511072158813, |
|
"rewards/accuracy_reward": 0.65625, |
|
"rewards/format_reward": 1.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"advantages": 4.6566128730773926e-09, |
|
"completion_length": 82.140625, |
|
"epoch": 0.40993788819875776, |
|
"grad_norm": 4.468729496002197, |
|
"kl": 0.0211181640625, |
|
"learning_rate": 5.900621118012422e-07, |
|
"loss": 0.0021, |
|
"reward": 1.796875, |
|
"reward_mean": 1.796875, |
|
"reward_std": 0.0646936446428299, |
|
"rewards/accuracy_reward": 0.796875, |
|
"rewards/format_reward": 1.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"advantages": 4.6566128730773926e-09, |
|
"completion_length": 74.890625, |
|
"epoch": 0.4161490683229814, |
|
"grad_norm": 9.289567947387695, |
|
"kl": 0.01611328125, |
|
"learning_rate": 5.838509316770186e-07, |
|
"loss": 0.0016, |
|
"reward": 1.421875, |
|
"reward_mean": 1.421875, |
|
"reward_std": 0.1983242630958557, |
|
"rewards/accuracy_reward": 0.421875, |
|
"rewards/format_reward": 1.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 77.625, |
|
"epoch": 0.422360248447205, |
|
"grad_norm": 0.4326918125152588, |
|
"kl": 0.0140380859375, |
|
"learning_rate": 5.77639751552795e-07, |
|
"loss": 0.0014, |
|
"reward": 1.875, |
|
"reward_mean": 1.875, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.875, |
|
"rewards/format_reward": 1.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"advantages": -9.313225746154785e-10, |
|
"completion_length": 81.71875, |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 5.539842128753662, |
|
"kl": 0.04296875, |
|
"learning_rate": 5.714285714285714e-07, |
|
"loss": 0.0043, |
|
"reward": 1.4375, |
|
"reward_mean": 1.4375, |
|
"reward_std": 0.34352827072143555, |
|
"rewards/accuracy_reward": 0.453125, |
|
"rewards/format_reward": 0.984375, |
|
"step": 69 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 90.703125, |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 0.46686819195747375, |
|
"kl": 0.0074462890625, |
|
"learning_rate": 5.652173913043477e-07, |
|
"loss": 0.0007, |
|
"reward": 1.875, |
|
"reward_mean": 1.875, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.875, |
|
"rewards/format_reward": 1.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 83.578125, |
|
"epoch": 0.4409937888198758, |
|
"grad_norm": 8.54028606414795, |
|
"kl": 0.00897216796875, |
|
"learning_rate": 5.590062111801241e-07, |
|
"loss": 0.0009, |
|
"reward": 1.765625, |
|
"reward_mean": 1.765625, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/accuracy_reward": 0.765625, |
|
"rewards/format_reward": 1.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"advantages": 7.450580596923828e-09, |
|
"completion_length": 84.25, |
|
"epoch": 0.4472049689440994, |
|
"grad_norm": 12.895256996154785, |
|
"kl": 0.00579833984375, |
|
"learning_rate": 5.527950310559007e-07, |
|
"loss": 0.0006, |
|
"reward": 1.453125, |
|
"reward_mean": 1.453125, |
|
"reward_std": 0.12255740165710449, |
|
"rewards/accuracy_reward": 0.453125, |
|
"rewards/format_reward": 1.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"advantages": -9.313225746154785e-09, |
|
"completion_length": 77.15625, |
|
"epoch": 0.453416149068323, |
|
"grad_norm": 5.548634052276611, |
|
"kl": 0.0123291015625, |
|
"learning_rate": 5.46583850931677e-07, |
|
"loss": 0.0012, |
|
"reward": 1.796875, |
|
"reward_mean": 1.796875, |
|
"reward_std": 0.31983357667922974, |
|
"rewards/accuracy_reward": 0.796875, |
|
"rewards/format_reward": 1.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"advantages": -1.0244548320770264e-08, |
|
"completion_length": 84.75, |
|
"epoch": 0.45962732919254656, |
|
"grad_norm": 3.4154112339019775, |
|
"kl": 0.018798828125, |
|
"learning_rate": 5.403726708074534e-07, |
|
"loss": 0.0019, |
|
"reward": 1.78125, |
|
"reward_mean": 1.78125, |
|
"reward_std": 0.19727616012096405, |
|
"rewards/accuracy_reward": 0.78125, |
|
"rewards/format_reward": 1.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"advantages": -4.6566128730773926e-09, |
|
"completion_length": 83.015625, |
|
"epoch": 0.4658385093167702, |
|
"grad_norm": 3.4328691959381104, |
|
"kl": 0.01275634765625, |
|
"learning_rate": 5.341614906832298e-07, |
|
"loss": 0.0013, |
|
"reward": 1.53125, |
|
"reward_mean": 1.53125, |
|
"reward_std": 0.23356688022613525, |
|
"rewards/accuracy_reward": 0.53125, |
|
"rewards/format_reward": 1.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 78.609375, |
|
"epoch": 0.4720496894409938, |
|
"grad_norm": 3.627190113067627, |
|
"kl": 0.0142822265625, |
|
"learning_rate": 5.279503105590062e-07, |
|
"loss": 0.0014, |
|
"reward": 1.9375, |
|
"reward_mean": 1.9375, |
|
"reward_std": 0.1462521106004715, |
|
"rewards/accuracy_reward": 0.9375, |
|
"rewards/format_reward": 1.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 80.46875, |
|
"epoch": 0.4782608695652174, |
|
"grad_norm": 10.168981552124023, |
|
"kl": 0.01251220703125, |
|
"learning_rate": 5.217391304347825e-07, |
|
"loss": 0.0013, |
|
"reward": 1.515625, |
|
"reward_mean": 1.515625, |
|
"reward_std": 0.2109457552433014, |
|
"rewards/accuracy_reward": 0.515625, |
|
"rewards/format_reward": 1.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 83.421875, |
|
"epoch": 0.484472049689441, |
|
"grad_norm": 20.923242568969727, |
|
"kl": 0.0113525390625, |
|
"learning_rate": 5.15527950310559e-07, |
|
"loss": 0.0011, |
|
"reward": 1.828125, |
|
"reward_mean": 1.828125, |
|
"reward_std": 0.19044628739356995, |
|
"rewards/accuracy_reward": 0.828125, |
|
"rewards/format_reward": 1.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"advantages": 2.7939677238464355e-09, |
|
"completion_length": 75.109375, |
|
"epoch": 0.4906832298136646, |
|
"grad_norm": 3.643770933151245, |
|
"kl": 0.00811767578125, |
|
"learning_rate": 5.093167701863354e-07, |
|
"loss": 0.0008, |
|
"reward": 1.78125, |
|
"reward_mean": 1.78125, |
|
"reward_std": 0.10888782143592834, |
|
"rewards/accuracy_reward": 0.78125, |
|
"rewards/format_reward": 1.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 81.65625, |
|
"epoch": 0.4968944099378882, |
|
"grad_norm": 4.883938312530518, |
|
"kl": 0.015625, |
|
"learning_rate": 5.031055900621117e-07, |
|
"loss": 0.0016, |
|
"reward": 1.25, |
|
"reward_mean": 1.25, |
|
"reward_std": 0.2130674123764038, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 1.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 82.515625, |
|
"epoch": 0.5031055900621118, |
|
"grad_norm": 1.3860398530960083, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 4.968944099378881e-07, |
|
"loss": 0.0008, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.0578637570142746, |
|
"rewards/accuracy_reward": 0.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"advantages": 9.313225746154785e-10, |
|
"completion_length": 77.6875, |
|
"epoch": 0.5093167701863354, |
|
"grad_norm": 3.7328872680664062, |
|
"kl": 0.0181884765625, |
|
"learning_rate": 4.906832298136646e-07, |
|
"loss": 0.0018, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.16675157845020294, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 76.4375, |
|
"epoch": 0.515527950310559, |
|
"grad_norm": 3.6228644847869873, |
|
"kl": 0.01446533203125, |
|
"learning_rate": 4.84472049689441e-07, |
|
"loss": 0.0014, |
|
"reward": 1.46875, |
|
"reward_mean": 1.46875, |
|
"reward_std": 0.1246790662407875, |
|
"rewards/accuracy_reward": 0.46875, |
|
"rewards/format_reward": 1.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 79.140625, |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 5.579171180725098, |
|
"kl": 0.0159912109375, |
|
"learning_rate": 4.782608695652174e-07, |
|
"loss": 0.0016, |
|
"reward": 1.65625, |
|
"reward_mean": 1.65625, |
|
"reward_std": 0.23356688022613525, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 0.984375, |
|
"step": 84 |
|
}, |
|
{ |
|
"advantages": -5.587935447692871e-09, |
|
"completion_length": 80.0, |
|
"epoch": 0.5279503105590062, |
|
"grad_norm": 9.611387252807617, |
|
"kl": 0.01080322265625, |
|
"learning_rate": 4.7204968944099376e-07, |
|
"loss": 0.0011, |
|
"reward": 1.828125, |
|
"reward_mean": 1.828125, |
|
"reward_std": 0.13258251547813416, |
|
"rewards/accuracy_reward": 0.828125, |
|
"rewards/format_reward": 1.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 85.5, |
|
"epoch": 0.5341614906832298, |
|
"grad_norm": 4.1448540687561035, |
|
"kl": 0.01007080078125, |
|
"learning_rate": 4.6583850931677014e-07, |
|
"loss": 0.001, |
|
"reward": 1.859375, |
|
"reward_mean": 1.859375, |
|
"reward_std": 0.17358146607875824, |
|
"rewards/accuracy_reward": 0.859375, |
|
"rewards/format_reward": 1.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 75.5, |
|
"epoch": 0.5403726708074534, |
|
"grad_norm": 5.654483795166016, |
|
"kl": 0.01123046875, |
|
"learning_rate": 4.596273291925465e-07, |
|
"loss": 0.0011, |
|
"reward": 1.796875, |
|
"reward_mean": 1.796875, |
|
"reward_std": 0.1530819982290268, |
|
"rewards/accuracy_reward": 0.796875, |
|
"rewards/format_reward": 1.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 72.671875, |
|
"epoch": 0.546583850931677, |
|
"grad_norm": 2.2370052337646484, |
|
"kl": 0.0137939453125, |
|
"learning_rate": 4.53416149068323e-07, |
|
"loss": 0.0014, |
|
"reward": 1.46875, |
|
"reward_mean": 1.46875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.46875, |
|
"rewards/format_reward": 1.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 81.84375, |
|
"epoch": 0.5527950310559007, |
|
"grad_norm": 1.389394760131836, |
|
"kl": 0.00836181640625, |
|
"learning_rate": 4.472049689440994e-07, |
|
"loss": 0.0008, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.06681530922651291, |
|
"rewards/accuracy_reward": 0.765625, |
|
"rewards/format_reward": 0.984375, |
|
"step": 89 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 74.59375, |
|
"epoch": 0.5590062111801242, |
|
"grad_norm": 2.353760242462158, |
|
"kl": 0.00811767578125, |
|
"learning_rate": 4.4099378881987576e-07, |
|
"loss": 0.0008, |
|
"reward": 1.6875, |
|
"reward_mean": 1.6875, |
|
"reward_std": 0.06681530922651291, |
|
"rewards/accuracy_reward": 0.6875, |
|
"rewards/format_reward": 1.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 85.28125, |
|
"epoch": 0.5652173913043478, |
|
"grad_norm": 1.5767848491668701, |
|
"kl": 0.009765625, |
|
"learning_rate": 4.3478260869565214e-07, |
|
"loss": 0.001, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"advantages": -8.381903171539307e-09, |
|
"completion_length": 81.859375, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 3.835320234298706, |
|
"kl": 0.0181884765625, |
|
"learning_rate": 4.285714285714285e-07, |
|
"loss": 0.0018, |
|
"reward": 1.671875, |
|
"reward_mean": 1.671875, |
|
"reward_std": 0.1530819982290268, |
|
"rewards/accuracy_reward": 0.6875, |
|
"rewards/format_reward": 0.984375, |
|
"step": 92 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 83.671875, |
|
"epoch": 0.577639751552795, |
|
"grad_norm": 9.30271053314209, |
|
"kl": 0.017822265625, |
|
"learning_rate": 4.2236024844720495e-07, |
|
"loss": 0.0018, |
|
"reward": 1.796875, |
|
"reward_mean": 1.796875, |
|
"reward_std": 0.23144522309303284, |
|
"rewards/accuracy_reward": 0.796875, |
|
"rewards/format_reward": 1.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"advantages": -4.6566128730773926e-09, |
|
"completion_length": 77.53125, |
|
"epoch": 0.5838509316770186, |
|
"grad_norm": 6.170975685119629, |
|
"kl": 0.009521484375, |
|
"learning_rate": 4.161490683229814e-07, |
|
"loss": 0.001, |
|
"reward": 1.65625, |
|
"reward_mean": 1.65625, |
|
"reward_std": 0.16675157845020294, |
|
"rewards/accuracy_reward": 0.65625, |
|
"rewards/format_reward": 1.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 85.640625, |
|
"epoch": 0.5900621118012422, |
|
"grad_norm": 4.217593669891357, |
|
"kl": 0.01409912109375, |
|
"learning_rate": 4.0993788819875776e-07, |
|
"loss": 0.0014, |
|
"reward": 1.734375, |
|
"reward_mean": 1.734375, |
|
"reward_std": 0.15992169082164764, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.984375, |
|
"step": 95 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 77.296875, |
|
"epoch": 0.5962732919254659, |
|
"grad_norm": 6.138365268707275, |
|
"kl": 0.0106201171875, |
|
"learning_rate": 4.0372670807453413e-07, |
|
"loss": 0.0011, |
|
"reward": 1.375, |
|
"reward_mean": 1.375, |
|
"reward_std": 0.06681530922651291, |
|
"rewards/accuracy_reward": 0.390625, |
|
"rewards/format_reward": 0.984375, |
|
"step": 96 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 76.484375, |
|
"epoch": 0.6024844720496895, |
|
"grad_norm": 1.2896429300308228, |
|
"kl": 0.00970458984375, |
|
"learning_rate": 3.975155279503105e-07, |
|
"loss": 0.001, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.734375, |
|
"rewards/format_reward": 0.984375, |
|
"step": 97 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 81.71875, |
|
"epoch": 0.6086956521739131, |
|
"grad_norm": 6.941093444824219, |
|
"kl": 0.01165771484375, |
|
"learning_rate": 3.9130434782608694e-07, |
|
"loss": 0.0012, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.0578637570142746, |
|
"rewards/accuracy_reward": 0.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 81.390625, |
|
"epoch": 0.6149068322981367, |
|
"grad_norm": 3.163457155227661, |
|
"kl": 0.00787353515625, |
|
"learning_rate": 3.850931677018633e-07, |
|
"loss": 0.0008, |
|
"reward": 1.96875, |
|
"reward_mean": 1.96875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.96875, |
|
"rewards/format_reward": 1.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"advantages": 4.6566128730773926e-09, |
|
"completion_length": 79.1875, |
|
"epoch": 0.6211180124223602, |
|
"grad_norm": 4.2669830322265625, |
|
"kl": 0.0108642578125, |
|
"learning_rate": 3.7888198757763975e-07, |
|
"loss": 0.0011, |
|
"reward": 1.671875, |
|
"reward_mean": 1.671875, |
|
"reward_std": 0.0646936446428299, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 1.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 77.578125, |
|
"epoch": 0.6273291925465838, |
|
"grad_norm": 6.153615474700928, |
|
"kl": 0.0101318359375, |
|
"learning_rate": 3.7267080745341613e-07, |
|
"loss": 0.001, |
|
"reward": 1.359375, |
|
"reward_mean": 1.359375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/accuracy_reward": 0.359375, |
|
"rewards/format_reward": 1.0, |
|
"step": 101 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 81.53125, |
|
"epoch": 0.6335403726708074, |
|
"grad_norm": 4.077609539031982, |
|
"kl": 0.0181884765625, |
|
"learning_rate": 3.6645962732919256e-07, |
|
"loss": 0.0018, |
|
"reward": 1.84375, |
|
"reward_mean": 1.84375, |
|
"reward_std": 0.2177756428718567, |
|
"rewards/accuracy_reward": 0.84375, |
|
"rewards/format_reward": 1.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 80.375, |
|
"epoch": 0.639751552795031, |
|
"grad_norm": 3.084027051925659, |
|
"kl": 0.01007080078125, |
|
"learning_rate": 3.6024844720496894e-07, |
|
"loss": 0.001, |
|
"reward": 1.53125, |
|
"reward_mean": 1.53125, |
|
"reward_std": 0.1462521106004715, |
|
"rewards/accuracy_reward": 0.53125, |
|
"rewards/format_reward": 1.0, |
|
"step": 103 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 83.953125, |
|
"epoch": 0.6459627329192547, |
|
"grad_norm": 2.0512335300445557, |
|
"kl": 0.007476806640625, |
|
"learning_rate": 3.540372670807453e-07, |
|
"loss": 0.0007, |
|
"reward": 1.453125, |
|
"reward_mean": 1.453125, |
|
"reward_std": 0.0646936446428299, |
|
"rewards/accuracy_reward": 0.453125, |
|
"rewards/format_reward": 1.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 82.5625, |
|
"epoch": 0.6521739130434783, |
|
"grad_norm": 0.5302374362945557, |
|
"kl": 0.00982666015625, |
|
"learning_rate": 3.478260869565217e-07, |
|
"loss": 0.001, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"advantages": -4.6566128730773926e-09, |
|
"completion_length": 75.953125, |
|
"epoch": 0.6583850931677019, |
|
"grad_norm": 6.2678751945495605, |
|
"kl": 0.0111083984375, |
|
"learning_rate": 3.416149068322981e-07, |
|
"loss": 0.0011, |
|
"reward": 1.890625, |
|
"reward_mean": 1.890625, |
|
"reward_std": 0.1315089464187622, |
|
"rewards/accuracy_reward": 0.890625, |
|
"rewards/format_reward": 1.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 78.328125, |
|
"epoch": 0.6645962732919255, |
|
"grad_norm": 1.7859537601470947, |
|
"kl": 0.00946044921875, |
|
"learning_rate": 3.3540372670807456e-07, |
|
"loss": 0.0009, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 91.8125, |
|
"epoch": 0.6708074534161491, |
|
"grad_norm": 2.7167623043060303, |
|
"kl": 0.0081787109375, |
|
"learning_rate": 3.2919254658385094e-07, |
|
"loss": 0.0008, |
|
"reward": 1.65625, |
|
"reward_mean": 1.65625, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 0.984375, |
|
"step": 108 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 76.984375, |
|
"epoch": 0.6770186335403726, |
|
"grad_norm": 5.3628058433532715, |
|
"kl": 0.009033203125, |
|
"learning_rate": 3.229813664596273e-07, |
|
"loss": 0.0009, |
|
"reward": 1.515625, |
|
"reward_mean": 1.515625, |
|
"reward_std": 0.19044628739356995, |
|
"rewards/accuracy_reward": 0.515625, |
|
"rewards/format_reward": 1.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 73.5, |
|
"epoch": 0.6832298136645962, |
|
"grad_norm": 3.2727582454681396, |
|
"kl": 0.0108642578125, |
|
"learning_rate": 3.167701863354037e-07, |
|
"loss": 0.0011, |
|
"reward": 1.609375, |
|
"reward_mean": 1.609375, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/accuracy_reward": 0.609375, |
|
"rewards/format_reward": 1.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 75.75, |
|
"epoch": 0.6894409937888198, |
|
"grad_norm": 11.552366256713867, |
|
"kl": 0.0145263671875, |
|
"learning_rate": 3.105590062111801e-07, |
|
"loss": 0.0015, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.1157275140285492, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 85.734375, |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 6.025736331939697, |
|
"kl": 0.01556396484375, |
|
"learning_rate": 3.043478260869565e-07, |
|
"loss": 0.0016, |
|
"reward": 1.59375, |
|
"reward_mean": 1.59375, |
|
"reward_std": 0.1552036553621292, |
|
"rewards/accuracy_reward": 0.59375, |
|
"rewards/format_reward": 1.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 82.734375, |
|
"epoch": 0.7018633540372671, |
|
"grad_norm": 15.336418151855469, |
|
"kl": 0.057373046875, |
|
"learning_rate": 2.9813664596273294e-07, |
|
"loss": 0.0057, |
|
"reward": 1.84375, |
|
"reward_mean": 1.84375, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.84375, |
|
"rewards/format_reward": 1.0, |
|
"step": 113 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 78.953125, |
|
"epoch": 0.7080745341614907, |
|
"grad_norm": 65.76184844970703, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 2.919254658385093e-07, |
|
"loss": 0.0014, |
|
"reward": 1.90625, |
|
"reward_mean": 1.90625, |
|
"reward_std": 0.1552036553621292, |
|
"rewards/accuracy_reward": 0.90625, |
|
"rewards/format_reward": 1.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 79.765625, |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 3.660456657409668, |
|
"kl": 0.0194091796875, |
|
"learning_rate": 2.857142857142857e-07, |
|
"loss": 0.0019, |
|
"reward": 1.59375, |
|
"reward_mean": 1.59375, |
|
"reward_std": 0.10888782143592834, |
|
"rewards/accuracy_reward": 0.59375, |
|
"rewards/format_reward": 1.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"advantages": -5.587935447692871e-09, |
|
"completion_length": 76.34375, |
|
"epoch": 0.7204968944099379, |
|
"grad_norm": 4.989613056182861, |
|
"kl": 0.00787353515625, |
|
"learning_rate": 2.7950310559006207e-07, |
|
"loss": 0.0008, |
|
"reward": 1.828125, |
|
"reward_mean": 1.828125, |
|
"reward_std": 0.13258251547813416, |
|
"rewards/accuracy_reward": 0.84375, |
|
"rewards/format_reward": 0.984375, |
|
"step": 116 |
|
}, |
|
{ |
|
"advantages": -9.313225746154785e-10, |
|
"completion_length": 77.859375, |
|
"epoch": 0.7267080745341615, |
|
"grad_norm": 2.4932050704956055, |
|
"kl": 0.0081787109375, |
|
"learning_rate": 2.732919254658385e-07, |
|
"loss": 0.0008, |
|
"reward": 1.859375, |
|
"reward_mean": 1.859375, |
|
"reward_std": 0.12255740165710449, |
|
"rewards/accuracy_reward": 0.859375, |
|
"rewards/format_reward": 1.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 84.5, |
|
"epoch": 0.7329192546583851, |
|
"grad_norm": 5.0420732498168945, |
|
"kl": 0.01226806640625, |
|
"learning_rate": 2.670807453416149e-07, |
|
"loss": 0.0012, |
|
"reward": 1.640625, |
|
"reward_mean": 1.640625, |
|
"reward_std": 0.23144522309303284, |
|
"rewards/accuracy_reward": 0.640625, |
|
"rewards/format_reward": 1.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 79.703125, |
|
"epoch": 0.7391304347826086, |
|
"grad_norm": 3.599855899810791, |
|
"kl": 0.0086669921875, |
|
"learning_rate": 2.6086956521739126e-07, |
|
"loss": 0.0009, |
|
"reward": 1.484375, |
|
"reward_mean": 1.484375, |
|
"reward_std": 0.13258251547813416, |
|
"rewards/accuracy_reward": 0.484375, |
|
"rewards/format_reward": 1.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"advantages": -5.587935447692871e-09, |
|
"completion_length": 79.1875, |
|
"epoch": 0.7453416149068323, |
|
"grad_norm": 3.320706605911255, |
|
"kl": 0.0133056640625, |
|
"learning_rate": 2.546583850931677e-07, |
|
"loss": 0.0013, |
|
"reward": 1.828125, |
|
"reward_mean": 1.828125, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/accuracy_reward": 0.828125, |
|
"rewards/format_reward": 1.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"advantages": -8.381903171539307e-09, |
|
"completion_length": 95.203125, |
|
"epoch": 0.7515527950310559, |
|
"grad_norm": 2.8366851806640625, |
|
"kl": 0.0064697265625, |
|
"learning_rate": 2.4844720496894407e-07, |
|
"loss": 0.0006, |
|
"reward": 1.671875, |
|
"reward_mean": 1.671875, |
|
"reward_std": 0.1530819833278656, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 1.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 78.765625, |
|
"epoch": 0.7577639751552795, |
|
"grad_norm": 3.376732587814331, |
|
"kl": 0.00860595703125, |
|
"learning_rate": 2.422360248447205e-07, |
|
"loss": 0.0009, |
|
"reward": 1.640625, |
|
"reward_mean": 1.640625, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/accuracy_reward": 0.640625, |
|
"rewards/format_reward": 1.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 79.453125, |
|
"epoch": 0.7639751552795031, |
|
"grad_norm": 3.5682129859924316, |
|
"kl": 0.018798828125, |
|
"learning_rate": 2.3602484472049688e-07, |
|
"loss": 0.0019, |
|
"reward": 1.671875, |
|
"reward_mean": 1.671875, |
|
"reward_std": 0.1804211586713791, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 1.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"advantages": 4.6566128730773926e-09, |
|
"completion_length": 74.96875, |
|
"epoch": 0.7701863354037267, |
|
"grad_norm": 2.6698434352874756, |
|
"kl": 0.006256103515625, |
|
"learning_rate": 2.2981366459627326e-07, |
|
"loss": 0.0006, |
|
"reward": 1.546875, |
|
"reward_mean": 1.546875, |
|
"reward_std": 0.0646936446428299, |
|
"rewards/accuracy_reward": 0.546875, |
|
"rewards/format_reward": 1.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 78.421875, |
|
"epoch": 0.7763975155279503, |
|
"grad_norm": 3.1063811779022217, |
|
"kl": 0.01214599609375, |
|
"learning_rate": 2.236024844720497e-07, |
|
"loss": 0.0012, |
|
"reward": 1.765625, |
|
"reward_mean": 1.765625, |
|
"reward_std": 0.12255740165710449, |
|
"rewards/accuracy_reward": 0.765625, |
|
"rewards/format_reward": 1.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 86.703125, |
|
"epoch": 0.782608695652174, |
|
"grad_norm": 2.7392446994781494, |
|
"kl": 0.00634765625, |
|
"learning_rate": 2.1739130434782607e-07, |
|
"loss": 0.0006, |
|
"reward": 1.640625, |
|
"reward_mean": 1.640625, |
|
"reward_std": 0.04419417306780815, |
|
"rewards/accuracy_reward": 0.640625, |
|
"rewards/format_reward": 1.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"advantages": 7.450580596923828e-09, |
|
"completion_length": 83.546875, |
|
"epoch": 0.7888198757763976, |
|
"grad_norm": 9.345684051513672, |
|
"kl": 0.008056640625, |
|
"learning_rate": 2.1118012422360247e-07, |
|
"loss": 0.0008, |
|
"reward": 1.296875, |
|
"reward_mean": 1.296875, |
|
"reward_std": 0.19044628739356995, |
|
"rewards/accuracy_reward": 0.296875, |
|
"rewards/format_reward": 1.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 85.84375, |
|
"epoch": 0.7950310559006211, |
|
"grad_norm": 0.22835175693035126, |
|
"kl": 0.0084228515625, |
|
"learning_rate": 2.0496894409937888e-07, |
|
"loss": 0.0008, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 81.828125, |
|
"epoch": 0.8012422360248447, |
|
"grad_norm": 2.44989275932312, |
|
"kl": 0.007171630859375, |
|
"learning_rate": 1.9875776397515526e-07, |
|
"loss": 0.0007, |
|
"reward": 1.5, |
|
"reward_mean": 1.5, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 1.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"advantages": 9.313225746154785e-10, |
|
"completion_length": 83.296875, |
|
"epoch": 0.8074534161490683, |
|
"grad_norm": 26.60379409790039, |
|
"kl": 0.01031494140625, |
|
"learning_rate": 1.9254658385093166e-07, |
|
"loss": 0.001, |
|
"reward": 1.640625, |
|
"reward_mean": 1.640625, |
|
"reward_std": 0.1530819982290268, |
|
"rewards/accuracy_reward": 0.640625, |
|
"rewards/format_reward": 1.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 75.296875, |
|
"epoch": 0.8136645962732919, |
|
"grad_norm": 2.649775981903076, |
|
"kl": 0.00787353515625, |
|
"learning_rate": 1.8633540372670807e-07, |
|
"loss": 0.0008, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.71875, |
|
"rewards/format_reward": 1.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"advantages": -7.450580596923828e-09, |
|
"completion_length": 72.921875, |
|
"epoch": 0.8198757763975155, |
|
"grad_norm": 6.021523952484131, |
|
"kl": 0.017333984375, |
|
"learning_rate": 1.8012422360248447e-07, |
|
"loss": 0.0017, |
|
"reward": 1.546875, |
|
"reward_mean": 1.546875, |
|
"reward_std": 0.17358146607875824, |
|
"rewards/accuracy_reward": 0.546875, |
|
"rewards/format_reward": 1.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 80.203125, |
|
"epoch": 0.8260869565217391, |
|
"grad_norm": 5.850553035736084, |
|
"kl": 0.0118408203125, |
|
"learning_rate": 1.7391304347826085e-07, |
|
"loss": 0.0012, |
|
"reward": 1.671875, |
|
"reward_mean": 1.671875, |
|
"reward_std": 0.25726157426834106, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 1.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"advantages": -5.587935447692871e-09, |
|
"completion_length": 90.234375, |
|
"epoch": 0.8322981366459627, |
|
"grad_norm": 9.700899124145508, |
|
"kl": 0.01422119140625, |
|
"learning_rate": 1.6770186335403728e-07, |
|
"loss": 0.0014, |
|
"reward": 1.78125, |
|
"reward_mean": 1.78125, |
|
"reward_std": 0.2651650309562683, |
|
"rewards/accuracy_reward": 0.828125, |
|
"rewards/format_reward": 0.953125, |
|
"step": 134 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 81.90625, |
|
"epoch": 0.8385093167701864, |
|
"grad_norm": 2.9975473880767822, |
|
"kl": 0.00909423828125, |
|
"learning_rate": 1.6149068322981366e-07, |
|
"loss": 0.0009, |
|
"reward": 1.6875, |
|
"reward_mean": 1.6875, |
|
"reward_std": 0.1552036553621292, |
|
"rewards/accuracy_reward": 0.6875, |
|
"rewards/format_reward": 1.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 79.3125, |
|
"epoch": 0.84472049689441, |
|
"grad_norm": 4.324582099914551, |
|
"kl": 0.01165771484375, |
|
"learning_rate": 1.5527950310559004e-07, |
|
"loss": 0.0012, |
|
"reward": 1.84375, |
|
"reward_mean": 1.84375, |
|
"reward_std": 0.2177756428718567, |
|
"rewards/accuracy_reward": 0.859375, |
|
"rewards/format_reward": 0.984375, |
|
"step": 136 |
|
}, |
|
{ |
|
"advantages": 7.450580596923828e-09, |
|
"completion_length": 80.265625, |
|
"epoch": 0.8509316770186336, |
|
"grad_norm": 3.8911736011505127, |
|
"kl": 0.009521484375, |
|
"learning_rate": 1.4906832298136647e-07, |
|
"loss": 0.001, |
|
"reward": 1.5625, |
|
"reward_mean": 1.5625, |
|
"reward_std": 0.1552036553621292, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 1.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 78.40625, |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 2.864941120147705, |
|
"kl": 0.01129150390625, |
|
"learning_rate": 1.4285714285714285e-07, |
|
"loss": 0.0011, |
|
"reward": 1.765625, |
|
"reward_mean": 1.765625, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/accuracy_reward": 0.765625, |
|
"rewards/format_reward": 1.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 80.21875, |
|
"epoch": 0.8633540372670807, |
|
"grad_norm": 5.788990497589111, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 1.3664596273291925e-07, |
|
"loss": 0.0008, |
|
"reward": 1.5625, |
|
"reward_mean": 1.5625, |
|
"reward_std": 0.1552036553621292, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 1.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 92.90625, |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 4.130926609039307, |
|
"kl": 0.0111083984375, |
|
"learning_rate": 1.3043478260869563e-07, |
|
"loss": 0.0011, |
|
"reward": 1.859375, |
|
"reward_mean": 1.859375, |
|
"reward_std": 0.2198973000049591, |
|
"rewards/accuracy_reward": 0.859375, |
|
"rewards/format_reward": 1.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 79.109375, |
|
"epoch": 0.8757763975155279, |
|
"grad_norm": 3.025212287902832, |
|
"kl": 0.01324462890625, |
|
"learning_rate": 1.2422360248447204e-07, |
|
"loss": 0.0013, |
|
"reward": 1.46875, |
|
"reward_mean": 1.46875, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/accuracy_reward": 0.484375, |
|
"rewards/format_reward": 0.984375, |
|
"step": 141 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 78.25, |
|
"epoch": 0.8819875776397516, |
|
"grad_norm": 6.828762531280518, |
|
"kl": 0.01177978515625, |
|
"learning_rate": 1.1801242236024844e-07, |
|
"loss": 0.0012, |
|
"reward": 1.40625, |
|
"reward_mean": 1.40625, |
|
"reward_std": 0.1462520956993103, |
|
"rewards/accuracy_reward": 0.40625, |
|
"rewards/format_reward": 1.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 73.625, |
|
"epoch": 0.8881987577639752, |
|
"grad_norm": 3.4486515522003174, |
|
"kl": 0.00762939453125, |
|
"learning_rate": 1.1180124223602484e-07, |
|
"loss": 0.0008, |
|
"reward": 1.65625, |
|
"reward_mean": 1.65625, |
|
"reward_std": 0.1552036553621292, |
|
"rewards/accuracy_reward": 0.65625, |
|
"rewards/format_reward": 1.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 80.359375, |
|
"epoch": 0.8944099378881988, |
|
"grad_norm": 8.272978782653809, |
|
"kl": 0.00628662109375, |
|
"learning_rate": 1.0559006211180124e-07, |
|
"loss": 0.0006, |
|
"reward": 1.71875, |
|
"reward_mean": 1.71875, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.734375, |
|
"rewards/format_reward": 0.984375, |
|
"step": 144 |
|
}, |
|
{ |
|
"advantages": -7.450580596923828e-09, |
|
"completion_length": 81.1875, |
|
"epoch": 0.9006211180124224, |
|
"grad_norm": 4.848587512969971, |
|
"kl": 0.017578125, |
|
"learning_rate": 9.937888198757763e-08, |
|
"loss": 0.0018, |
|
"reward": 1.5625, |
|
"reward_mean": 1.5625, |
|
"reward_std": 0.2041158676147461, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 1.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 79.984375, |
|
"epoch": 0.906832298136646, |
|
"grad_norm": 0.3604845702648163, |
|
"kl": 0.00958251953125, |
|
"learning_rate": 9.316770186335403e-08, |
|
"loss": 0.001, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"advantages": 5.587935447692871e-09, |
|
"completion_length": 76.5625, |
|
"epoch": 0.9130434782608695, |
|
"grad_norm": 10.680438995361328, |
|
"kl": 0.01116943359375, |
|
"learning_rate": 8.695652173913042e-08, |
|
"loss": 0.0011, |
|
"reward": 1.671875, |
|
"reward_mean": 1.671875, |
|
"reward_std": 0.19939783215522766, |
|
"rewards/accuracy_reward": 0.671875, |
|
"rewards/format_reward": 1.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"advantages": -3.725290298461914e-09, |
|
"completion_length": 76.984375, |
|
"epoch": 0.9192546583850931, |
|
"grad_norm": 2.091907024383545, |
|
"kl": 0.010498046875, |
|
"learning_rate": 8.074534161490683e-08, |
|
"loss": 0.0011, |
|
"reward": 1.640625, |
|
"reward_mean": 1.640625, |
|
"reward_std": 0.08010874688625336, |
|
"rewards/accuracy_reward": 0.65625, |
|
"rewards/format_reward": 0.984375, |
|
"step": 148 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 76.203125, |
|
"epoch": 0.9254658385093167, |
|
"grad_norm": 0.20045147836208344, |
|
"kl": 0.0078125, |
|
"learning_rate": 7.453416149068323e-08, |
|
"loss": 0.0008, |
|
"reward": 1.75, |
|
"reward_mean": 1.75, |
|
"reward_std": 0.0, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 1.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 84.140625, |
|
"epoch": 0.9316770186335404, |
|
"grad_norm": 3.21720814704895, |
|
"kl": 0.008544921875, |
|
"learning_rate": 6.832298136645963e-08, |
|
"loss": 0.0009, |
|
"reward": 1.6875, |
|
"reward_mean": 1.6875, |
|
"reward_std": 0.06681530922651291, |
|
"rewards/accuracy_reward": 0.6875, |
|
"rewards/format_reward": 1.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"advantages": -1.862645149230957e-09, |
|
"completion_length": 82.734375, |
|
"epoch": 0.937888198757764, |
|
"grad_norm": 7.955801963806152, |
|
"kl": 0.01263427734375, |
|
"learning_rate": 6.211180124223602e-08, |
|
"loss": 0.0013, |
|
"reward": 1.734375, |
|
"reward_mean": 1.734375, |
|
"reward_std": 0.10205793380737305, |
|
"rewards/accuracy_reward": 0.734375, |
|
"rewards/format_reward": 1.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 72.96875, |
|
"epoch": 0.9440993788819876, |
|
"grad_norm": 3.563530921936035, |
|
"kl": 0.0093994140625, |
|
"learning_rate": 5.590062111801242e-08, |
|
"loss": 0.0009, |
|
"reward": 1.90625, |
|
"reward_mean": 1.90625, |
|
"reward_std": 0.0578637570142746, |
|
"rewards/accuracy_reward": 0.90625, |
|
"rewards/format_reward": 1.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"advantages": -9.313225746154785e-09, |
|
"completion_length": 83.125, |
|
"epoch": 0.9503105590062112, |
|
"grad_norm": 9.811988830566406, |
|
"kl": 0.0184326171875, |
|
"learning_rate": 4.9689440993788814e-08, |
|
"loss": 0.0019, |
|
"reward": 1.796875, |
|
"reward_mean": 1.796875, |
|
"reward_std": 0.15992169082164764, |
|
"rewards/accuracy_reward": 0.796875, |
|
"rewards/format_reward": 1.0, |
|
"step": 153 |
|
}, |
|
{ |
|
"advantages": -1.0244548320770264e-08, |
|
"completion_length": 83.515625, |
|
"epoch": 0.9565217391304348, |
|
"grad_norm": 3.8269639015197754, |
|
"kl": 0.0133056640625, |
|
"learning_rate": 4.347826086956521e-08, |
|
"loss": 0.0013, |
|
"reward": 1.78125, |
|
"reward_mean": 1.78125, |
|
"reward_std": 0.16675157845020294, |
|
"rewards/accuracy_reward": 0.78125, |
|
"rewards/format_reward": 1.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"advantages": -7.450580596923828e-09, |
|
"completion_length": 85.671875, |
|
"epoch": 0.9627329192546584, |
|
"grad_norm": 3.470165252685547, |
|
"kl": 0.01300048828125, |
|
"learning_rate": 3.726708074534162e-08, |
|
"loss": 0.0013, |
|
"reward": 1.5625, |
|
"reward_mean": 1.5625, |
|
"reward_std": 0.1462520956993103, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 1.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"advantages": -9.313225746154785e-10, |
|
"completion_length": 84.203125, |
|
"epoch": 0.968944099378882, |
|
"grad_norm": 2.550407648086548, |
|
"kl": 0.00946044921875, |
|
"learning_rate": 3.105590062111801e-08, |
|
"loss": 0.0009, |
|
"reward": 1.625, |
|
"reward_mean": 1.625, |
|
"reward_std": 0.16675157845020294, |
|
"rewards/accuracy_reward": 0.640625, |
|
"rewards/format_reward": 0.984375, |
|
"step": 156 |
|
}, |
|
{ |
|
"advantages": 3.725290298461914e-09, |
|
"completion_length": 76.296875, |
|
"epoch": 0.9751552795031055, |
|
"grad_norm": 3.396425247192383, |
|
"kl": 0.00714111328125, |
|
"learning_rate": 2.4844720496894407e-08, |
|
"loss": 0.0007, |
|
"reward": 1.546875, |
|
"reward_mean": 1.546875, |
|
"reward_std": 0.15992169082164764, |
|
"rewards/accuracy_reward": 0.546875, |
|
"rewards/format_reward": 1.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"advantages": -2.7939677238464355e-09, |
|
"completion_length": 73.859375, |
|
"epoch": 0.9813664596273292, |
|
"grad_norm": 3.776041030883789, |
|
"kl": 0.00921630859375, |
|
"learning_rate": 1.863354037267081e-08, |
|
"loss": 0.0009, |
|
"reward": 1.59375, |
|
"reward_mean": 1.59375, |
|
"reward_std": 0.10888782143592834, |
|
"rewards/accuracy_reward": 0.59375, |
|
"rewards/format_reward": 1.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"advantages": 1.862645149230957e-09, |
|
"completion_length": 77.953125, |
|
"epoch": 0.9875776397515528, |
|
"grad_norm": 3.304471254348755, |
|
"kl": 0.01263427734375, |
|
"learning_rate": 1.2422360248447204e-08, |
|
"loss": 0.0013, |
|
"reward": 1.796875, |
|
"reward_mean": 1.796875, |
|
"reward_std": 0.11100947856903076, |
|
"rewards/accuracy_reward": 0.796875, |
|
"rewards/format_reward": 1.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"advantages": 0.0, |
|
"completion_length": 79.25, |
|
"epoch": 0.9937888198757764, |
|
"grad_norm": 5.823967456817627, |
|
"kl": 0.00897216796875, |
|
"learning_rate": 6.211180124223602e-09, |
|
"loss": 0.0009, |
|
"reward": 1.5, |
|
"reward_mean": 1.5, |
|
"reward_std": 0.0883883461356163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 1.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"advantages": -0.5890890955924988, |
|
"completion_length": 89.33333587646484, |
|
"epoch": 1.0, |
|
"grad_norm": 2.0931286811828613, |
|
"kl": 0.00677490234375, |
|
"learning_rate": 0.0, |
|
"loss": 0.001, |
|
"reward": 1.6666667461395264, |
|
"reward_mean": 1.875, |
|
"reward_std": 0.3535533845424652, |
|
"rewards/accuracy_reward": 0.6666666865348816, |
|
"rewards/format_reward": 1.0, |
|
"step": 161 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 161, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|