|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.633802816901408, |
|
"eval_steps": 500, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 194.03125, |
|
"epoch": 0.028169014084507043, |
|
"grad_norm": 15.590116500854492, |
|
"kl": 67.96075868606567, |
|
"learning_rate": 4.1666666666666667e-07, |
|
"loss": 0.068, |
|
"reward": 18.734252750873566, |
|
"reward_std": 6.550818961113691, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.6794400699436665, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6173125002533197, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 205.28125, |
|
"epoch": 0.056338028169014086, |
|
"grad_norm": 530.9674072265625, |
|
"kl": 265.4313408136368, |
|
"learning_rate": 1.25e-06, |
|
"loss": 0.2654, |
|
"reward": 21.486406087875366, |
|
"reward_std": 6.450116660445929, |
|
"rewards/concensus_correctness_reward_func": 16.35931247472763, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.75, |
|
"rewards/question_recreation_reward_func": 0.8441246300935745, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.171875, |
|
"rewards/xmlcount_reward_func": 0.7360937558114529, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 247.0, |
|
"epoch": 0.08450704225352113, |
|
"grad_norm": 7.194105625152588, |
|
"kl": 10.1644686460495, |
|
"learning_rate": 2.0833333333333334e-06, |
|
"loss": 0.0102, |
|
"reward": 24.599268674850464, |
|
"reward_std": 1.6305736564099789, |
|
"rewards/concensus_correctness_reward_func": 19.375, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.75, |
|
"rewards/question_recreation_reward_func": 0.8394556865096092, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5723125021904707, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 205.78125, |
|
"epoch": 0.11267605633802817, |
|
"grad_norm": 14.453492164611816, |
|
"kl": 29.00755314528942, |
|
"learning_rate": 2.916666666666667e-06, |
|
"loss": 0.029, |
|
"reward": 21.17924928665161, |
|
"reward_std": 7.817702278494835, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.625, |
|
"rewards/question_recreation_reward_func": 0.7514051459729671, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.7715937495231628, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 254.78125, |
|
"epoch": 0.14084507042253522, |
|
"grad_norm": 4.9339776039123535, |
|
"kl": 534.9701635837555, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.535, |
|
"reward": 22.273706436157227, |
|
"reward_std": 5.927923844195902, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.6875, |
|
"rewards/question_recreation_reward_func": 0.7792688012123108, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.4475625064224005, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 223.84375, |
|
"epoch": 0.16901408450704225, |
|
"grad_norm": 126.1221923828125, |
|
"kl": 111.43436747789383, |
|
"learning_rate": 4.583333333333333e-06, |
|
"loss": 0.1114, |
|
"reward": 19.623693704605103, |
|
"reward_std": 9.295258034020662, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.625, |
|
"rewards/question_recreation_reward_func": 0.8043808117508888, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.5849375165998936, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 258.84375, |
|
"epoch": 0.19718309859154928, |
|
"grad_norm": 13.649314880371094, |
|
"kl": 29.206983238458633, |
|
"learning_rate": 4.999918050947891e-06, |
|
"loss": 0.0292, |
|
"reward": 18.92366051673889, |
|
"reward_std": 7.471162365749478, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.851754330098629, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.41565624810755253, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 210.9375, |
|
"epoch": 0.22535211267605634, |
|
"grad_norm": 7.758904457092285, |
|
"kl": 288.0020731687546, |
|
"learning_rate": 4.99926249076577e-06, |
|
"loss": 0.288, |
|
"reward": 18.59817200899124, |
|
"reward_std": 4.863896086812019, |
|
"rewards/concensus_correctness_reward_func": 13.75, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.625, |
|
"rewards/question_recreation_reward_func": 0.6548594255000353, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.203125, |
|
"rewards/xmlcount_reward_func": 0.7870624996721745, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 292.53125, |
|
"epoch": 0.2535211267605634, |
|
"grad_norm": 6.619876384735107, |
|
"kl": 22.940940707921982, |
|
"learning_rate": 4.9979515423108255e-06, |
|
"loss": 0.0229, |
|
"reward": 20.082338631153107, |
|
"reward_std": 4.752222462557256, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.8020261619240046, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.4053125027567148, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 238.28125, |
|
"epoch": 0.28169014084507044, |
|
"grad_norm": 9.172245025634766, |
|
"kl": 60.055853977799416, |
|
"learning_rate": 4.995985549356568e-06, |
|
"loss": 0.0601, |
|
"reward": 16.823714524507523, |
|
"reward_std": 5.831961344927549, |
|
"rewards/concensus_correctness_reward_func": 11.983624935150146, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.7774647548794746, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.1875, |
|
"rewards/xmlcount_reward_func": 0.7501249983906746, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 230.53125, |
|
"epoch": 0.30985915492957744, |
|
"grad_norm": 6.337810516357422, |
|
"kl": 20.89355828613043, |
|
"learning_rate": 4.993365027450576e-06, |
|
"loss": 0.0209, |
|
"reward": 22.543599009513855, |
|
"reward_std": 4.786159439012408, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.75, |
|
"rewards/question_recreation_reward_func": 0.654692716896534, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.1875, |
|
"rewards/xmlcount_reward_func": 0.7014062590897083, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 233.15625, |
|
"epoch": 0.3380281690140845, |
|
"grad_norm": 5.86886739730835, |
|
"kl": 4.685514692217112, |
|
"learning_rate": 4.990090663779305e-06, |
|
"loss": 0.0047, |
|
"reward": 24.771531105041504, |
|
"reward_std": 2.2020363211631775, |
|
"rewards/concensus_correctness_reward_func": 19.375, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.625, |
|
"rewards/question_recreation_reward_func": 0.7914998307824135, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.203125, |
|
"rewards/xmlcount_reward_func": 0.8237812481820583, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 218.1875, |
|
"epoch": 0.36619718309859156, |
|
"grad_norm": 8.693346977233887, |
|
"kl": 22.532070949673653, |
|
"learning_rate": 4.986163316987877e-06, |
|
"loss": 0.0225, |
|
"reward": 23.11518883705139, |
|
"reward_std": 5.961791490204632, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.8125, |
|
"rewards/question_recreation_reward_func": 0.7831886559724808, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.296875, |
|
"rewards/xmlcount_reward_func": 0.9413749948143959, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 199.875, |
|
"epoch": 0.39436619718309857, |
|
"grad_norm": 45.534908294677734, |
|
"kl": 43.07646985352039, |
|
"learning_rate": 4.9815840169549216e-06, |
|
"loss": 0.0431, |
|
"reward": 22.956753134727478, |
|
"reward_std": 3.708504168316722, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.8125, |
|
"rewards/question_recreation_reward_func": 0.8589722141623497, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.203125, |
|
"rewards/xmlcount_reward_func": 0.8009062558412552, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 229.34375, |
|
"epoch": 0.4225352112676056, |
|
"grad_norm": 188.76895141601562, |
|
"kl": 96.50678093731403, |
|
"learning_rate": 4.976353964522509e-06, |
|
"loss": 0.0965, |
|
"reward": 22.132088541984558, |
|
"reward_std": 6.213916528970003, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.8125, |
|
"rewards/question_recreation_reward_func": 0.813995435833931, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.1875, |
|
"rewards/xmlcount_reward_func": 0.7555937431752682, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 251.0, |
|
"epoch": 0.4507042253521127, |
|
"grad_norm": 89.95954895019531, |
|
"kl": 42.50997355952859, |
|
"learning_rate": 4.970474531181245e-06, |
|
"loss": 0.0425, |
|
"reward": 21.101596146821976, |
|
"reward_std": 3.359893566928804, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.8080957010388374, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.5278750080615282, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 270.4375, |
|
"epoch": 0.4788732394366197, |
|
"grad_norm": 7.159509181976318, |
|
"kl": 37.1051784530282, |
|
"learning_rate": 4.963947258710626e-06, |
|
"loss": 0.0371, |
|
"reward": 21.699138522148132, |
|
"reward_std": 2.3117441162467003, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5625, |
|
"rewards/question_recreation_reward_func": 0.7434196844696999, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.5025937538594007, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 234.71875, |
|
"epoch": 0.5070422535211268, |
|
"grad_norm": 8.73612117767334, |
|
"kl": 2501.0413611084223, |
|
"learning_rate": 4.9567738587747314e-06, |
|
"loss": 2.501, |
|
"reward": 16.33679434657097, |
|
"reward_std": 5.108438193798065, |
|
"rewards/concensus_correctness_reward_func": 11.875, |
|
"rewards/consensus_reward_func": 1.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.7808256670832634, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.6809687577188015, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 278.25, |
|
"epoch": 0.5352112676056338, |
|
"grad_norm": 3.27095627784729, |
|
"kl": 10.008030999451876, |
|
"learning_rate": 4.948956212473371e-06, |
|
"loss": 0.01, |
|
"reward": 16.04705312848091, |
|
"reward_std": 2.1642781402915716, |
|
"rewards/concensus_correctness_reward_func": 11.875, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.7825527861714363, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.38950000517070293, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 225.75, |
|
"epoch": 0.5633802816901409, |
|
"grad_norm": 9.338530540466309, |
|
"kl": 121.52913957834244, |
|
"learning_rate": 4.940496369848795e-06, |
|
"loss": 0.1215, |
|
"reward": 14.5195372402668, |
|
"reward_std": 6.823352798819542, |
|
"rewards/concensus_correctness_reward_func": 10.720062494277954, |
|
"rewards/consensus_reward_func": 1.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.6889123450964689, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5324374958872795, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 273.71875, |
|
"epoch": 0.5915492957746479, |
|
"grad_norm": 3.6006832122802734, |
|
"kl": 9.235054649412632, |
|
"learning_rate": 4.931396549348115e-06, |
|
"loss": 0.0092, |
|
"reward": 23.575079202651978, |
|
"reward_std": 3.452487599104643, |
|
"rewards/concensus_correctness_reward_func": 18.75, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.625, |
|
"rewards/question_recreation_reward_func": 0.8597666844725609, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.38718749675899744, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 228.65625, |
|
"epoch": 0.6197183098591549, |
|
"grad_norm": 184.4779510498047, |
|
"kl": 66.61591627448797, |
|
"learning_rate": 4.921659137241544e-06, |
|
"loss": 0.0666, |
|
"reward": 21.40922224521637, |
|
"reward_std": 6.415304251015186, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5625, |
|
"rewards/question_recreation_reward_func": 0.788409948348999, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.4489374943077564, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 261.5625, |
|
"epoch": 0.647887323943662, |
|
"grad_norm": 2.263350009918213, |
|
"kl": 5.652421373873949, |
|
"learning_rate": 4.911286686996648e-06, |
|
"loss": 0.0057, |
|
"reward": 22.376519441604614, |
|
"reward_std": 0.7325459104031324, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5625, |
|
"rewards/question_recreation_reward_func": 0.8018633462488651, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.48090626299381256, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 218.8125, |
|
"epoch": 0.676056338028169, |
|
"grad_norm": 6.410772323608398, |
|
"kl": 17.982181690633297, |
|
"learning_rate": 4.900281918608732e-06, |
|
"loss": 0.018, |
|
"reward": 23.878349542617798, |
|
"reward_std": 3.440878137946129, |
|
"rewards/concensus_correctness_reward_func": 18.75, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.6875, |
|
"rewards/question_recreation_reward_func": 0.8038181811571121, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.6370312627404928, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 216.34375, |
|
"epoch": 0.704225352112676, |
|
"grad_norm": 12.63294506072998, |
|
"kl": 20.948822245001793, |
|
"learning_rate": 4.888647717887582e-06, |
|
"loss": 0.0209, |
|
"reward": 18.739310264587402, |
|
"reward_std": 6.688391337171197, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.669216588139534, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6013437523506582, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 232.8125, |
|
"epoch": 0.7323943661971831, |
|
"grad_norm": 4.447786808013916, |
|
"kl": 24.20983089506626, |
|
"learning_rate": 4.876387135700701e-06, |
|
"loss": 0.0242, |
|
"reward": 23.035974979400635, |
|
"reward_std": 4.954922638833523, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5625, |
|
"rewards/question_recreation_reward_func": 0.8577248528599739, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.5532500119879842, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 219.03125, |
|
"epoch": 0.7605633802816901, |
|
"grad_norm": 5.603798866271973, |
|
"kl": 47.00118863582611, |
|
"learning_rate": 4.863503387173276e-06, |
|
"loss": 0.047, |
|
"reward": 17.631301164627075, |
|
"reward_std": 6.2861207174137235, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5625, |
|
"rewards/question_recreation_reward_func": 0.7153325416147709, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5722187510691583, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 235.59375, |
|
"epoch": 0.7887323943661971, |
|
"grad_norm": 2.9642035961151123, |
|
"kl": 12.14004921168089, |
|
"learning_rate": 4.849999850845066e-06, |
|
"loss": 0.0121, |
|
"reward": 20.853686690330505, |
|
"reward_std": 3.2716546999290586, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.8614683747291565, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.42971874959766865, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 204.15625, |
|
"epoch": 0.8169014084507042, |
|
"grad_norm": 26.085582733154297, |
|
"kl": 55.548476845026016, |
|
"learning_rate": 4.835880067784441e-06, |
|
"loss": 0.0555, |
|
"reward": 22.38655924797058, |
|
"reward_std": 5.256647571921349, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.6875, |
|
"rewards/question_recreation_reward_func": 0.7643403187394142, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.5597187718376517, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 224.78125, |
|
"epoch": 0.8450704225352113, |
|
"grad_norm": 16.9488468170166, |
|
"kl": 40.86702236533165, |
|
"learning_rate": 4.821147740659795e-06, |
|
"loss": 0.0409, |
|
"reward": 21.230697870254517, |
|
"reward_std": 7.334933251142502, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.6201041154563427, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5480937454849482, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 258.875, |
|
"epoch": 0.8732394366197183, |
|
"grad_norm": 20.7008056640625, |
|
"kl": 29.40393216907978, |
|
"learning_rate": 4.805806732768585e-06, |
|
"loss": 0.0294, |
|
"reward": 20.777413338422775, |
|
"reward_std": 3.6380002200603485, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.3125, |
|
"rewards/question_recreation_reward_func": 0.8443503454327583, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.49556251242756844, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 187.5625, |
|
"epoch": 0.9014084507042254, |
|
"grad_norm": 736.5669555664062, |
|
"kl": 481.64947575330734, |
|
"learning_rate": 4.789861067024253e-06, |
|
"loss": 0.4816, |
|
"reward": 21.166576385498047, |
|
"reward_std": 6.786355759948492, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.5792638175189495, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.540437500923872, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 214.96875, |
|
"epoch": 0.9295774647887324, |
|
"grad_norm": 35.92903137207031, |
|
"kl": 103.99411916732788, |
|
"learning_rate": 4.773314924901281e-06, |
|
"loss": 0.104, |
|
"reward": 19.490996658802032, |
|
"reward_std": 7.800664484500885, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.80152777582407, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6425937414169312, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 235.125, |
|
"epoch": 0.9577464788732394, |
|
"grad_norm": 10.029170989990234, |
|
"kl": 18.18737083673477, |
|
"learning_rate": 4.756172645338675e-06, |
|
"loss": 0.0182, |
|
"reward": 19.079940140247345, |
|
"reward_std": 6.256794525776058, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.6954717859625816, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.43134375661611557, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 226.34375, |
|
"epoch": 0.9859154929577465, |
|
"grad_norm": 5.851627349853516, |
|
"kl": 14.625127524137497, |
|
"learning_rate": 4.738438723602154e-06, |
|
"loss": 0.0146, |
|
"reward": 23.478822708129883, |
|
"reward_std": 3.651876477524638, |
|
"rewards/concensus_correctness_reward_func": 18.75, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.625, |
|
"rewards/question_recreation_reward_func": 0.6802600920200348, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5016875043511391, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 227.84375, |
|
"epoch": 1.0140845070422535, |
|
"grad_norm": 9.736303329467773, |
|
"kl": 32.25694251060486, |
|
"learning_rate": 4.720117810105341e-06, |
|
"loss": 0.0323, |
|
"reward": 21.3710036277771, |
|
"reward_std": 7.411025664303452, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.8385977782309055, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.4386562556028366, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 197.375, |
|
"epoch": 1.0422535211267605, |
|
"grad_norm": 29.349096298217773, |
|
"kl": 51.72871816158295, |
|
"learning_rate": 4.701214709190277e-06, |
|
"loss": 0.0517, |
|
"reward": 21.43280816078186, |
|
"reward_std": 7.328175559639931, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.6436204127967358, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6173124928027391, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 245.84375, |
|
"epoch": 1.0704225352112675, |
|
"grad_norm": 43.128273010253906, |
|
"kl": 41.08760707080364, |
|
"learning_rate": 4.681734377867562e-06, |
|
"loss": 0.0411, |
|
"reward": 21.750689268112183, |
|
"reward_std": 2.0954109141603112, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5625, |
|
"rewards/question_recreation_reward_func": 0.8656269088387489, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.4944375103805214, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 218.09375, |
|
"epoch": 1.0985915492957747, |
|
"grad_norm": 96.18486785888672, |
|
"kl": 93.65772761404514, |
|
"learning_rate": 4.661681924516466e-06, |
|
"loss": 0.0937, |
|
"reward": 22.785533666610718, |
|
"reward_std": 4.547428795136511, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.7730333730578423, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5124999992549419, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 189.1875, |
|
"epoch": 1.1267605633802817, |
|
"grad_norm": 13.744595527648926, |
|
"kl": 41.68859389424324, |
|
"learning_rate": 4.641062607545347e-06, |
|
"loss": 0.0417, |
|
"reward": 19.403677821159363, |
|
"reward_std": 8.246666595339775, |
|
"rewards/concensus_correctness_reward_func": 15.735687494277954, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.5538652390241623, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.47350000962615013, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 241.0, |
|
"epoch": 1.1549295774647887, |
|
"grad_norm": 3.5648226737976074, |
|
"kl": 11.036557964980602, |
|
"learning_rate": 4.61988183401272e-06, |
|
"loss": 0.011, |
|
"reward": 20.90431860089302, |
|
"reward_std": 3.5142957847565413, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.8119121938943863, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5142812412232161, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 207.3125, |
|
"epoch": 1.1830985915492958, |
|
"grad_norm": 10.732950210571289, |
|
"kl": 52.33975350856781, |
|
"learning_rate": 4.598145158209356e-06, |
|
"loss": 0.0523, |
|
"reward": 18.125176668167114, |
|
"reward_std": 10.32908346131444, |
|
"rewards/concensus_correctness_reward_func": 13.75, |
|
"rewards/consensus_reward_func": 1.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.6875, |
|
"rewards/question_recreation_reward_func": 0.6812697537243366, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5689062438905239, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 252.25, |
|
"epoch": 1.2112676056338028, |
|
"grad_norm": 23.430219650268555, |
|
"kl": 31.533756278455257, |
|
"learning_rate": 4.575858280201761e-06, |
|
"loss": 0.0315, |
|
"reward": 14.606143146753311, |
|
"reward_std": 3.439208870753646, |
|
"rewards/concensus_correctness_reward_func": 10.625, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.7599556222558022, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.34618750773370266, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 226.8125, |
|
"epoch": 1.2394366197183098, |
|
"grad_norm": 6.766458034515381, |
|
"kl": 19.861644983291626, |
|
"learning_rate": 4.5530270443374305e-06, |
|
"loss": 0.0199, |
|
"reward": 21.676340103149414, |
|
"reward_std": 6.460458487272263, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.3125, |
|
"rewards/question_recreation_reward_func": 0.6662770844995975, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.40068749710917473, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 206.5, |
|
"epoch": 1.267605633802817, |
|
"grad_norm": 14.366485595703125, |
|
"kl": 24.39778110384941, |
|
"learning_rate": 4.5296574377122765e-06, |
|
"loss": 0.0244, |
|
"reward": 23.023337364196777, |
|
"reward_std": 4.633859112858772, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.75, |
|
"rewards/question_recreation_reward_func": 0.6368377842009068, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6208749823272228, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 271.75, |
|
"epoch": 1.295774647887324, |
|
"grad_norm": 6.00218391418457, |
|
"kl": 18.09646901488304, |
|
"learning_rate": 4.505755588600613e-06, |
|
"loss": 0.0181, |
|
"reward": 22.362977981567383, |
|
"reward_std": 3.771769030485302, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.6680713146924973, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.28865625197067857, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 221.40625, |
|
"epoch": 1.323943661971831, |
|
"grad_norm": 260.9346008300781, |
|
"kl": 85.69054782390594, |
|
"learning_rate": 4.481327764848118e-06, |
|
"loss": 0.0857, |
|
"reward": 21.98690915107727, |
|
"reward_std": 6.104137388058007, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.6875, |
|
"rewards/question_recreation_reward_func": 0.6172217763960361, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.4009375046007335, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 242.46875, |
|
"epoch": 1.352112676056338, |
|
"grad_norm": 3.6559009552001953, |
|
"kl": 16.68967443704605, |
|
"learning_rate": 4.456380372228208e-06, |
|
"loss": 0.0167, |
|
"reward": 20.938228607177734, |
|
"reward_std": 3.3481120225042105, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.7461661994457245, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5201875008642673, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 245.59375, |
|
"epoch": 1.380281690140845, |
|
"grad_norm": 46.14374542236328, |
|
"kl": 73.46418565511703, |
|
"learning_rate": 4.430919952762226e-06, |
|
"loss": 0.0735, |
|
"reward": 19.775842905044556, |
|
"reward_std": 7.6586533188819885, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.6588428560644388, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.38262501033023, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 151.8125, |
|
"epoch": 1.408450704225352, |
|
"grad_norm": 11.883987426757812, |
|
"kl": 99.90218496322632, |
|
"learning_rate": 4.404953183003916e-06, |
|
"loss": 0.0999, |
|
"reward": 20.656064450740814, |
|
"reward_std": 5.514936912804842, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.6430017780512571, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5911874920129776, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 217.5625, |
|
"epoch": 1.436619718309859, |
|
"grad_norm": 48.88766098022461, |
|
"kl": 109.4358594417572, |
|
"learning_rate": 4.378486872288611e-06, |
|
"loss": 0.1094, |
|
"reward": 18.41537481546402, |
|
"reward_std": 6.753879874944687, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.694999773055315, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5172499939799309, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 221.125, |
|
"epoch": 1.4647887323943662, |
|
"grad_norm": 46.86466598510742, |
|
"kl": 40.41808983683586, |
|
"learning_rate": 4.3515279609476e-06, |
|
"loss": 0.0404, |
|
"reward": 22.9285147190094, |
|
"reward_std": 4.827527537941933, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.6875, |
|
"rewards/question_recreation_reward_func": 0.7598273046314716, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.5124375112354755, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 188.25, |
|
"epoch": 1.4929577464788732, |
|
"grad_norm": 17.044279098510742, |
|
"kl": 67.19291111826897, |
|
"learning_rate": 4.324083518488151e-06, |
|
"loss": 0.0672, |
|
"reward": 19.629344820976257, |
|
"reward_std": 8.029563069343567, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.5852201916277409, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5753749944269657, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 260.375, |
|
"epoch": 1.5211267605633803, |
|
"grad_norm": 3.7830276489257812, |
|
"kl": 26.501030012965202, |
|
"learning_rate": 4.296160741739652e-06, |
|
"loss": 0.0265, |
|
"reward": 21.420883417129517, |
|
"reward_std": 2.4508758764714003, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.7897899299860001, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.5060937535017729, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 236.4375, |
|
"epoch": 1.5492957746478875, |
|
"grad_norm": 6.825228691101074, |
|
"kl": 20.585920438170433, |
|
"learning_rate": 4.267766952966369e-06, |
|
"loss": 0.0206, |
|
"reward": 20.2924707531929, |
|
"reward_std": 4.762420322746038, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.679470956325531, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.5505000110715628, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 204.46875, |
|
"epoch": 1.5774647887323945, |
|
"grad_norm": 13.651640892028809, |
|
"kl": 29.064430966973305, |
|
"learning_rate": 4.238909597947307e-06, |
|
"loss": 0.0291, |
|
"reward": 20.76398205757141, |
|
"reward_std": 3.667446758598089, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.7045135274529457, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5125937461853027, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 202.5625, |
|
"epoch": 1.6056338028169015, |
|
"grad_norm": 7.285125732421875, |
|
"kl": 31.416974440217018, |
|
"learning_rate": 4.2095962440236846e-06, |
|
"loss": 0.0314, |
|
"reward": 22.71331763267517, |
|
"reward_std": 4.953943386673927, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.6919735632836819, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5838437452912331, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 202.8125, |
|
"epoch": 1.6338028169014085, |
|
"grad_norm": 13.096663475036621, |
|
"kl": 27.956041753292084, |
|
"learning_rate": 4.179834578114531e-06, |
|
"loss": 0.028, |
|
"reward": 20.11071002483368, |
|
"reward_std": 4.602979902178049, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.625, |
|
"rewards/question_recreation_reward_func": 0.7899289727210999, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.7895312570035458, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 230.1875, |
|
"epoch": 1.6619718309859155, |
|
"grad_norm": 6.774753570556641, |
|
"kl": 9.80746340751648, |
|
"learning_rate": 4.149632404700925e-06, |
|
"loss": 0.0098, |
|
"reward": 21.431988835334778, |
|
"reward_std": 2.1543020214885473, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.7171135507524014, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5742499995976686, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 259.09375, |
|
"epoch": 1.6901408450704225, |
|
"grad_norm": 2.689204216003418, |
|
"kl": 17.400323942303658, |
|
"learning_rate": 4.118997643779401e-06, |
|
"loss": 0.0174, |
|
"reward": 22.90558958053589, |
|
"reward_std": 4.9271611254662275, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.6875, |
|
"rewards/question_recreation_reward_func": 0.7364025376737118, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.4504375047981739, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 247.6875, |
|
"epoch": 1.7183098591549295, |
|
"grad_norm": 28.990882873535156, |
|
"kl": 22.44525107741356, |
|
"learning_rate": 4.087938328785071e-06, |
|
"loss": 0.0224, |
|
"reward": 21.130041539669037, |
|
"reward_std": 3.9246082678437233, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.7531667724251747, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.6425000079907477, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 214.5625, |
|
"epoch": 1.7464788732394365, |
|
"grad_norm": 17.202999114990234, |
|
"kl": 93.57761958241463, |
|
"learning_rate": 4.056462604484998e-06, |
|
"loss": 0.0936, |
|
"reward": 19.359357565641403, |
|
"reward_std": 5.1793545708060265, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.3125, |
|
"rewards/question_recreation_reward_func": 0.6580452099442482, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.5919375065714121, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 249.71875, |
|
"epoch": 1.7746478873239435, |
|
"grad_norm": 12.416897773742676, |
|
"kl": 13.260371595621109, |
|
"learning_rate": 4.0245787248423614e-06, |
|
"loss": 0.0133, |
|
"reward": 22.547950267791748, |
|
"reward_std": 4.811736276373267, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.6626068912446499, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.44784374348819256, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 253.0, |
|
"epoch": 1.8028169014084507, |
|
"grad_norm": 13.114068031311035, |
|
"kl": 28.459499150514603, |
|
"learning_rate": 3.992295050852013e-06, |
|
"loss": 0.0285, |
|
"reward": 22.122711896896362, |
|
"reward_std": 5.9889209773391485, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.6875, |
|
"rewards/question_recreation_reward_func": 0.7654621824622154, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.3885000068694353, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 231.3125, |
|
"epoch": 1.8309859154929577, |
|
"grad_norm": 14.931551933288574, |
|
"kl": 38.51357202231884, |
|
"learning_rate": 3.959620048347938e-06, |
|
"loss": 0.0385, |
|
"reward": 19.09564107656479, |
|
"reward_std": 5.354119001887739, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.686109896749258, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.140625, |
|
"rewards/xmlcount_reward_func": 0.7064062356948853, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 224.78125, |
|
"epoch": 1.8591549295774648, |
|
"grad_norm": 168.16796875, |
|
"kl": 134.45312885940075, |
|
"learning_rate": 3.9265622857832455e-06, |
|
"loss": 0.1345, |
|
"reward": 19.73101794719696, |
|
"reward_std": 4.9529455080628395, |
|
"rewards/concensus_correctness_reward_func": 15.10393750667572, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5625, |
|
"rewards/question_recreation_reward_func": 0.7391119040548801, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5598437488079071, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 191.3125, |
|
"epoch": 1.887323943661972, |
|
"grad_norm": 144.23934936523438, |
|
"kl": 160.06939086318016, |
|
"learning_rate": 3.893130431983234e-06, |
|
"loss": 0.1601, |
|
"reward": 20.553396463394165, |
|
"reward_std": 7.810650005936623, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.6331466361880302, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5765000004321337, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 218.75, |
|
"epoch": 1.915492957746479, |
|
"grad_norm": 30.639406204223633, |
|
"kl": 59.116331934928894, |
|
"learning_rate": 3.8593332538721465e-06, |
|
"loss": 0.0591, |
|
"reward": 19.54647660255432, |
|
"reward_std": 8.99451743811369, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.6372890621423721, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.4404374985024333, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 196.875, |
|
"epoch": 1.943661971830986, |
|
"grad_norm": 57.319923400878906, |
|
"kl": 38.51234859973192, |
|
"learning_rate": 3.825179614174195e-06, |
|
"loss": 0.0385, |
|
"reward": 21.283878982067108, |
|
"reward_std": 2.0606489591300488, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.4702228233218193, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6105312407016754, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 212.4375, |
|
"epoch": 1.971830985915493, |
|
"grad_norm": 8.84663200378418, |
|
"kl": 27.299983263015747, |
|
"learning_rate": 3.790678469089465e-06, |
|
"loss": 0.0273, |
|
"reward": 14.866298139095306, |
|
"reward_std": 6.414532793685794, |
|
"rewards/concensus_correctness_reward_func": 11.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.7097668498754501, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.48465626407414675, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 183.71875, |
|
"epoch": 2.0, |
|
"grad_norm": 20.581981658935547, |
|
"kl": 2941.267915993929, |
|
"learning_rate": 3.7558388659453052e-06, |
|
"loss": 2.9413, |
|
"reward": 22.23113512992859, |
|
"reward_std": 6.688113525509834, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5625, |
|
"rewards/question_recreation_reward_func": 0.6908227056264877, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6340624932199717, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 229.9375, |
|
"epoch": 2.028169014084507, |
|
"grad_norm": 4.256288528442383, |
|
"kl": 450.68155094981194, |
|
"learning_rate": 3.720669940823827e-06, |
|
"loss": 0.4507, |
|
"reward": 19.878115504980087, |
|
"reward_std": 4.9296186761930585, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.5736463665962219, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.47634374257177114, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 199.96875, |
|
"epoch": 2.056338028169014, |
|
"grad_norm": 134.83917236328125, |
|
"kl": 53.38550490140915, |
|
"learning_rate": 3.6851809161661206e-06, |
|
"loss": 0.0534, |
|
"reward": 23.20932626724243, |
|
"reward_std": 4.892109964042902, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.75, |
|
"rewards/question_recreation_reward_func": 0.7212322875857353, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.6912187561392784, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 171.8125, |
|
"epoch": 2.084507042253521, |
|
"grad_norm": 15.478096008300781, |
|
"kl": 43.08566951751709, |
|
"learning_rate": 3.649381098353834e-06, |
|
"loss": 0.0431, |
|
"reward": 21.71215844154358, |
|
"reward_std": 6.466344892978668, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.46212736517190933, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.7031562626361847, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 158.28125, |
|
"epoch": 2.112676056338028, |
|
"grad_norm": 356.259033203125, |
|
"kl": 216.86275094747543, |
|
"learning_rate": 3.613279875268731e-06, |
|
"loss": 0.2169, |
|
"reward": 17.928013503551483, |
|
"reward_std": 8.778308073699009, |
|
"rewards/concensus_correctness_reward_func": 13.75, |
|
"rewards/consensus_reward_func": 1.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.568356541916728, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.140625, |
|
"rewards/xmlcount_reward_func": 0.7190312705934048, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 182.75, |
|
"epoch": 2.140845070422535, |
|
"grad_norm": 6.1676530838012695, |
|
"kl": 86.0618257522583, |
|
"learning_rate": 3.5768867138308872e-06, |
|
"loss": 0.0861, |
|
"reward": 16.761336520314217, |
|
"reward_std": 5.004373461008072, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.5807425025850534, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5555937569588423, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 216.46875, |
|
"epoch": 2.169014084507042, |
|
"grad_norm": 7.545609474182129, |
|
"kl": 17.122317761182785, |
|
"learning_rate": 3.540211157516149e-06, |
|
"loss": 0.0171, |
|
"reward": 21.572845339775085, |
|
"reward_std": 5.354643169790506, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.6150016514584422, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.45784375444054604, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 217.90625, |
|
"epoch": 2.1971830985915495, |
|
"grad_norm": 4.116415977478027, |
|
"kl": 25.3076790869236, |
|
"learning_rate": 3.503262823853527e-06, |
|
"loss": 0.0253, |
|
"reward": 21.371933221817017, |
|
"reward_std": 7.111734602600336, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5625, |
|
"rewards/question_recreation_reward_func": 0.6787459887564182, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5369375087320805, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 223.84375, |
|
"epoch": 2.2253521126760565, |
|
"grad_norm": 4.25111198425293, |
|
"kl": 8.69906596839428, |
|
"learning_rate": 3.466051401903162e-06, |
|
"loss": 0.0087, |
|
"reward": 19.322344303131104, |
|
"reward_std": 0.9026554934680462, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.7088755983859301, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6447187531739473, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 241.375, |
|
"epoch": 2.2535211267605635, |
|
"grad_norm": 72.4315414428711, |
|
"kl": 48.62941300868988, |
|
"learning_rate": 3.428586649715542e-06, |
|
"loss": 0.0486, |
|
"reward": 21.37134912610054, |
|
"reward_std": 2.193010773509741, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.6839431263506413, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.4686562530696392, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 214.46875, |
|
"epoch": 2.2816901408450705, |
|
"grad_norm": 26.874465942382812, |
|
"kl": 46.34515926241875, |
|
"learning_rate": 3.3908783917726123e-06, |
|
"loss": 0.0463, |
|
"reward": 18.52027067542076, |
|
"reward_std": 6.44943779706955, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.5618643239140511, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.552156250923872, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 200.21875, |
|
"epoch": 2.3098591549295775, |
|
"grad_norm": 43.46574783325195, |
|
"kl": 67.54259772598743, |
|
"learning_rate": 3.3529365164114903e-06, |
|
"loss": 0.0675, |
|
"reward": 21.3968608379364, |
|
"reward_std": 7.711536236514803, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.6364228781312704, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5885625015944242, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 193.875, |
|
"epoch": 2.3380281690140845, |
|
"grad_norm": 14.524393081665039, |
|
"kl": 96.5325955748558, |
|
"learning_rate": 3.314770973231408e-06, |
|
"loss": 0.0965, |
|
"reward": 18.271313101053238, |
|
"reward_std": 7.526230916380882, |
|
"rewards/concensus_correctness_reward_func": 14.480687499046326, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.4511258793063462, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5426250174641609, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 163.3125, |
|
"epoch": 2.3661971830985915, |
|
"grad_norm": 1158.546630859375, |
|
"kl": 972.9655037075281, |
|
"learning_rate": 3.276391770484606e-06, |
|
"loss": 0.973, |
|
"reward": 16.986168384552002, |
|
"reward_std": 9.901727393269539, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.125, |
|
"rewards/question_recreation_reward_func": 0.5950123034417629, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.140625, |
|
"rewards/xmlcount_reward_func": 0.6880312561988831, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 199.28125, |
|
"epoch": 2.3943661971830985, |
|
"grad_norm": 15.329193115234375, |
|
"kl": 43.722544223070145, |
|
"learning_rate": 3.2378089724518464e-06, |
|
"loss": 0.0437, |
|
"reward": 21.943373203277588, |
|
"reward_std": 5.313693807460368, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.7041860446333885, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.6454374976456165, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 197.15625, |
|
"epoch": 2.4225352112676055, |
|
"grad_norm": 13.730399131774902, |
|
"kl": 15.106778889894485, |
|
"learning_rate": 3.1990326968032225e-06, |
|
"loss": 0.0151, |
|
"reward": 22.867514848709106, |
|
"reward_std": 4.801642283797264, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.6767030283808708, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.6751875132322311, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 169.75, |
|
"epoch": 2.4507042253521125, |
|
"grad_norm": 6.74544095993042, |
|
"kl": 42.9027735888958, |
|
"learning_rate": 3.160073111944983e-06, |
|
"loss": 0.0429, |
|
"reward": 17.841490387916565, |
|
"reward_std": 7.046665458008647, |
|
"rewards/concensus_correctness_reward_func": 13.75, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.5759594012051821, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6249062493443489, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 238.6875, |
|
"epoch": 2.4788732394366195, |
|
"grad_norm": 42.492393493652344, |
|
"kl": 50.04497802257538, |
|
"learning_rate": 3.1209404343530374e-06, |
|
"loss": 0.05, |
|
"reward": 19.691066712141037, |
|
"reward_std": 3.8624095655977726, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.6070978939533234, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5058437511324883, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 221.40625, |
|
"epoch": 2.507042253521127, |
|
"grad_norm": 4.4059295654296875, |
|
"kl": 15.536839783191681, |
|
"learning_rate": 3.081644925893866e-06, |
|
"loss": 0.0155, |
|
"reward": 17.259127408266068, |
|
"reward_std": 4.80191726796329, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.7241274192929268, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5037499945610762, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 220.0625, |
|
"epoch": 2.535211267605634, |
|
"grad_norm": 51.83406066894531, |
|
"kl": 44.841193079948425, |
|
"learning_rate": 3.0421968911335196e-06, |
|
"loss": 0.0448, |
|
"reward": 20.607652366161346, |
|
"reward_std": 2.3559402879327536, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.3125, |
|
"rewards/question_recreation_reward_func": 0.669371597468853, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5632812529802322, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 207.96875, |
|
"epoch": 2.563380281690141, |
|
"grad_norm": 20.143775939941406, |
|
"kl": 44.13751931488514, |
|
"learning_rate": 3.002606674635432e-06, |
|
"loss": 0.0441, |
|
"reward": 21.463626384735107, |
|
"reward_std": 6.160396963357925, |
|
"rewards/concensus_correctness_reward_func": 17.59318745136261, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.5963759236037731, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.4771875059232116, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 222.28125, |
|
"epoch": 2.591549295774648, |
|
"grad_norm": 11.883401870727539, |
|
"kl": 20.277920335531235, |
|
"learning_rate": 2.9628846582477305e-06, |
|
"loss": 0.0203, |
|
"reward": 19.031234979629517, |
|
"reward_std": 6.665901035070419, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.3125, |
|
"rewards/question_recreation_reward_func": 0.5251727998256683, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.3966874983161688, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 181.90625, |
|
"epoch": 2.619718309859155, |
|
"grad_norm": 8.755311965942383, |
|
"kl": 25.77407142519951, |
|
"learning_rate": 2.923041258380779e-06, |
|
"loss": 0.0258, |
|
"reward": 20.905356884002686, |
|
"reward_std": 6.6025552824139595, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.5499190725386143, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5585625115782022, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 184.28125, |
|
"epoch": 2.647887323943662, |
|
"grad_norm": 16.596654891967773, |
|
"kl": 47.27063727378845, |
|
"learning_rate": 2.883086923275658e-06, |
|
"loss": 0.0473, |
|
"reward": 20.340314149856567, |
|
"reward_std": 8.395816408097744, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.625, |
|
"rewards/question_recreation_reward_func": 0.7232826687395573, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.6639062352478504, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 245.90625, |
|
"epoch": 2.676056338028169, |
|
"grad_norm": 7.123233318328857, |
|
"kl": 11.423286348581314, |
|
"learning_rate": 2.8430321302642887e-06, |
|
"loss": 0.0114, |
|
"reward": 24.368746280670166, |
|
"reward_std": 2.0759390871971846, |
|
"rewards/concensus_correctness_reward_func": 19.375, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.6875, |
|
"rewards/question_recreation_reward_func": 0.6851216927170753, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5898749995976686, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 195.46875, |
|
"epoch": 2.704225352112676, |
|
"grad_norm": 18.82990074157715, |
|
"kl": 58.429032266139984, |
|
"learning_rate": 2.8028873830219373e-06, |
|
"loss": 0.0584, |
|
"reward": 16.25366249680519, |
|
"reward_std": 5.261145170778036, |
|
"rewards/concensus_correctness_reward_func": 12.5, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.5097565241158009, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5564062632620335, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 213.34375, |
|
"epoch": 2.732394366197183, |
|
"grad_norm": 14.168484687805176, |
|
"kl": 110.85996335744858, |
|
"learning_rate": 2.76266320881281e-06, |
|
"loss": 0.1109, |
|
"reward": 21.16573476791382, |
|
"reward_std": 6.517005235888064, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.6182350441813469, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.6568750068545341, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 267.03125, |
|
"epoch": 2.76056338028169, |
|
"grad_norm": 18.253278732299805, |
|
"kl": 138.1692279279232, |
|
"learning_rate": 2.7223701557294574e-06, |
|
"loss": 0.1382, |
|
"reward": 18.324547559022903, |
|
"reward_std": 6.471590518951416, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.6808598078787327, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.48743749409914017, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 204.375, |
|
"epoch": 2.788732394366197, |
|
"grad_norm": 15.189406394958496, |
|
"kl": 26.855069160461426, |
|
"learning_rate": 2.6820187899267203e-06, |
|
"loss": 0.0269, |
|
"reward": 19.546295881271362, |
|
"reward_std": 8.134475693106651, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.5997647196054459, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.44653125666081905, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 160.71875, |
|
"epoch": 2.816901408450704, |
|
"grad_norm": 8.699529647827148, |
|
"kl": 49.28822618722916, |
|
"learning_rate": 2.641619692850941e-06, |
|
"loss": 0.0493, |
|
"reward": 19.428176522254944, |
|
"reward_std": 7.277277044951916, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.6063638776540756, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.6811874993145466, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 200.40625, |
|
"epoch": 2.845070422535211, |
|
"grad_norm": 19.380477905273438, |
|
"kl": 95.13921695947647, |
|
"learning_rate": 2.6011834584651597e-06, |
|
"loss": 0.0951, |
|
"reward": 18.081178903579712, |
|
"reward_std": 8.80796305835247, |
|
"rewards/concensus_correctness_reward_func": 14.467000007629395, |
|
"rewards/consensus_reward_func": 1.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.7127412371337414, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.47956248791888356, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 185.75, |
|
"epoch": 2.873239436619718, |
|
"grad_norm": 21.92093276977539, |
|
"kl": 70.87698519229889, |
|
"learning_rate": 2.560720690471033e-06, |
|
"loss": 0.0709, |
|
"reward": 19.574060678482056, |
|
"reward_std": 7.992755997925997, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.6473107412457466, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6611249968409538, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 197.0, |
|
"epoch": 2.9014084507042255, |
|
"grad_norm": 38.08381271362305, |
|
"kl": 111.72683045268059, |
|
"learning_rate": 2.5202419995281966e-06, |
|
"loss": 0.1117, |
|
"reward": 17.2239950299263, |
|
"reward_std": 7.70870977640152, |
|
"rewards/concensus_correctness_reward_func": 13.75, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.6894014775753021, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.48771873861551285, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 221.0, |
|
"epoch": 2.9295774647887325, |
|
"grad_norm": 12.506996154785156, |
|
"kl": 839.4179282784462, |
|
"learning_rate": 2.4797580004718038e-06, |
|
"loss": 0.8394, |
|
"reward": 19.14959552884102, |
|
"reward_std": 6.2375103905797005, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.125, |
|
"rewards/question_recreation_reward_func": 0.7049080766737461, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.507187508046627, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 184.46875, |
|
"epoch": 2.9577464788732395, |
|
"grad_norm": 33.34077453613281, |
|
"kl": 28.567565202713013, |
|
"learning_rate": 2.4392793095289677e-06, |
|
"loss": 0.0286, |
|
"reward": 20.221239745616913, |
|
"reward_std": 3.982272831723094, |
|
"rewards/concensus_correctness_reward_func": 15.729062557220459, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.6523019410669804, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6680000089108944, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 199.375, |
|
"epoch": 2.9859154929577465, |
|
"grad_norm": 15.138080596923828, |
|
"kl": 101.21163132786751, |
|
"learning_rate": 2.3988165415348416e-06, |
|
"loss": 0.1012, |
|
"reward": 18.773864269256592, |
|
"reward_std": 7.821951035410166, |
|
"rewards/concensus_correctness_reward_func": 15.10393750667572, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.5722080431878567, |
|
"rewards/soft_format_reward_func": 0.0625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5039687566459179, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 200.09375, |
|
"epoch": 3.0140845070422535, |
|
"grad_norm": 27.673036575317383, |
|
"kl": 81.34071454405785, |
|
"learning_rate": 2.358380307149059e-06, |
|
"loss": 0.0813, |
|
"reward": 19.80060565471649, |
|
"reward_std": 6.877108983695507, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.6550744473934174, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.5830312483012676, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 197.53125, |
|
"epoch": 3.0422535211267605, |
|
"grad_norm": 14.81261157989502, |
|
"kl": 53.70039749145508, |
|
"learning_rate": 2.31798121007328e-06, |
|
"loss": 0.0537, |
|
"reward": 20.988556504249573, |
|
"reward_std": 6.804740943014622, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.5584000945091248, |
|
"rewards/soft_format_reward_func": 0.046875, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5082812439650297, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 198.4375, |
|
"epoch": 3.0704225352112675, |
|
"grad_norm": 10.039848327636719, |
|
"kl": 47.19896852970123, |
|
"learning_rate": 2.2776298442705434e-06, |
|
"loss": 0.0472, |
|
"reward": 20.60348665714264, |
|
"reward_std": 8.125480651855469, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.4375, |
|
"rewards/question_recreation_reward_func": 0.7180799320340157, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5104062519967556, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 177.53125, |
|
"epoch": 3.0985915492957745, |
|
"grad_norm": 16.117830276489258, |
|
"kl": 72.61402860283852, |
|
"learning_rate": 2.2373367911871904e-06, |
|
"loss": 0.0726, |
|
"reward": 19.261219561100006, |
|
"reward_std": 4.884680893737823, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.522126174531877, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.6140937507152557, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 191.625, |
|
"epoch": 3.1267605633802815, |
|
"grad_norm": 17.173952102661133, |
|
"kl": 478.0856115221977, |
|
"learning_rate": 2.1971126169780636e-06, |
|
"loss": 0.4781, |
|
"reward": 16.67675158381462, |
|
"reward_std": 7.77602906152606, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.641875833272934, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.31612499337643385, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 237.59375, |
|
"epoch": 3.1549295774647885, |
|
"grad_norm": 10.077739715576172, |
|
"kl": 40.82688173651695, |
|
"learning_rate": 2.1569678697357126e-06, |
|
"loss": 0.0408, |
|
"reward": 20.142229437828064, |
|
"reward_std": 3.685103869996965, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.561760637909174, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.43984376545995474, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 199.59375, |
|
"epoch": 3.183098591549296, |
|
"grad_norm": 5.315944671630859, |
|
"kl": 17.577760875225067, |
|
"learning_rate": 2.1169130767243424e-06, |
|
"loss": 0.0176, |
|
"reward": 18.28103494644165, |
|
"reward_std": 7.611427519470453, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.6480660997331142, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5704687498509884, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 185.5625, |
|
"epoch": 3.211267605633803, |
|
"grad_norm": 8.0359525680542, |
|
"kl": 58.93609178066254, |
|
"learning_rate": 2.0769587416192212e-06, |
|
"loss": 0.0589, |
|
"reward": 20.35518643260002, |
|
"reward_std": 3.533026445657015, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.527998685836792, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.4990624990314245, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 189.3125, |
|
"epoch": 3.23943661971831, |
|
"grad_norm": 26.13204574584961, |
|
"kl": 53.234640538692474, |
|
"learning_rate": 2.0371153417522703e-06, |
|
"loss": 0.0532, |
|
"reward": 18.698314785957336, |
|
"reward_std": 9.453598627820611, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.125, |
|
"rewards/question_recreation_reward_func": 0.4973145886324346, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5291249938309193, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 190.59375, |
|
"epoch": 3.267605633802817, |
|
"grad_norm": 13.507369995117188, |
|
"kl": 39.84981770813465, |
|
"learning_rate": 1.9973933253645684e-06, |
|
"loss": 0.0398, |
|
"reward": 19.292374074459076, |
|
"reward_std": 6.246133454144001, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.6897490993142128, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.6026250012218952, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 201.1875, |
|
"epoch": 3.295774647887324, |
|
"grad_norm": 8.261385917663574, |
|
"kl": 47.00834572315216, |
|
"learning_rate": 1.9578031088664812e-06, |
|
"loss": 0.047, |
|
"reward": 18.45159488916397, |
|
"reward_std": 7.484951298683882, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.659157432615757, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6361875031143427, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 202.21875, |
|
"epoch": 3.323943661971831, |
|
"grad_norm": 38.87091064453125, |
|
"kl": 38.5699297785759, |
|
"learning_rate": 1.9183550741061354e-06, |
|
"loss": 0.0386, |
|
"reward": 19.85414829850197, |
|
"reward_std": 2.194763625971973, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.5745858605951071, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5295624881982803, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 181.5625, |
|
"epoch": 3.352112676056338, |
|
"grad_norm": 3900.5302734375, |
|
"kl": 2038.339742898941, |
|
"learning_rate": 1.8790595656469628e-06, |
|
"loss": 2.0383, |
|
"reward": 16.795432448387146, |
|
"reward_std": 7.932508982717991, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.5185572430491447, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.140625, |
|
"rewards/xmlcount_reward_func": 0.6987500065006316, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 212.875, |
|
"epoch": 3.380281690140845, |
|
"grad_norm": 15.849955558776855, |
|
"kl": 66.65974473953247, |
|
"learning_rate": 1.8399268880550174e-06, |
|
"loss": 0.0667, |
|
"reward": 21.17803716659546, |
|
"reward_std": 6.859333042055368, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.7103495234623551, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.5145624913275242, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 226.53125, |
|
"epoch": 3.408450704225352, |
|
"grad_norm": 15.772849082946777, |
|
"kl": 89.09097853302956, |
|
"learning_rate": 1.800967303196778e-06, |
|
"loss": 0.0891, |
|
"reward": 22.494590759277344, |
|
"reward_std": 5.302706576883793, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.668371744453907, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.5762187307700515, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 164.46875, |
|
"epoch": 3.436619718309859, |
|
"grad_norm": 71.73149108886719, |
|
"kl": 298.8348106145859, |
|
"learning_rate": 1.7621910275481544e-06, |
|
"loss": 0.2988, |
|
"reward": 18.83104568719864, |
|
"reward_std": 7.157097928225994, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.5665771998465061, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.514468751847744, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 176.375, |
|
"epoch": 3.464788732394366, |
|
"grad_norm": 8.527463912963867, |
|
"kl": 49.205901980400085, |
|
"learning_rate": 1.7236082295153948e-06, |
|
"loss": 0.0492, |
|
"reward": 16.045627415180206, |
|
"reward_std": 9.77073048055172, |
|
"rewards/concensus_correctness_reward_func": 12.5, |
|
"rewards/consensus_reward_func": 1.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.5426900889724493, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5498124919831753, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 172.0625, |
|
"epoch": 3.492957746478873, |
|
"grad_norm": 17.22605323791504, |
|
"kl": 81.85796847939491, |
|
"learning_rate": 1.685229026768593e-06, |
|
"loss": 0.0819, |
|
"reward": 21.084633946418762, |
|
"reward_std": 5.291271213442087, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.5807589311152697, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5195000041276217, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 201.03125, |
|
"epoch": 3.52112676056338, |
|
"grad_norm": 11.301921844482422, |
|
"kl": 69.70909404754639, |
|
"learning_rate": 1.6470634835885097e-06, |
|
"loss": 0.0697, |
|
"reward": 14.430978834629059, |
|
"reward_std": 11.139893352985382, |
|
"rewards/concensus_correctness_reward_func": 11.25, |
|
"rewards/consensus_reward_func": 1.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.6216037422418594, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3093750001862645, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 175.59375, |
|
"epoch": 3.5492957746478875, |
|
"grad_norm": 10.406546592712402, |
|
"kl": 58.96156430244446, |
|
"learning_rate": 1.6091216082273875e-06, |
|
"loss": 0.059, |
|
"reward": 20.337380409240723, |
|
"reward_std": 7.825136856175959, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.5557556711137295, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5941249951720238, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 198.15625, |
|
"epoch": 3.5774647887323945, |
|
"grad_norm": 9.735517501831055, |
|
"kl": 53.85496670007706, |
|
"learning_rate": 1.5714133502844591e-06, |
|
"loss": 0.0539, |
|
"reward": 19.757203698158264, |
|
"reward_std": 6.970965705811977, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.45339070353657007, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.678812500089407, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 203.59375, |
|
"epoch": 3.6056338028169015, |
|
"grad_norm": 21.339231491088867, |
|
"kl": 40.247629791498184, |
|
"learning_rate": 1.5339485980968383e-06, |
|
"loss": 0.0402, |
|
"reward": 17.9938845038414, |
|
"reward_std": 5.043777231127024, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.6552903726696968, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.4323437474668026, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 195.59375, |
|
"epoch": 3.6338028169014085, |
|
"grad_norm": 14.654716491699219, |
|
"kl": 22.20736539363861, |
|
"learning_rate": 1.4967371761464738e-06, |
|
"loss": 0.0222, |
|
"reward": 22.278719663619995, |
|
"reward_std": 3.614749798551202, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.125, |
|
"rewards/question_recreation_reward_func": 0.6758448034524918, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5403749942779541, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 190.34375, |
|
"epoch": 3.6619718309859155, |
|
"grad_norm": 29.445497512817383, |
|
"kl": 109.69821217656136, |
|
"learning_rate": 1.4597888424838519e-06, |
|
"loss": 0.1097, |
|
"reward": 19.653862476348877, |
|
"reward_std": 7.802595116198063, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.5701128906803206, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.33375000255182385, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 218.8125, |
|
"epoch": 3.6901408450704225, |
|
"grad_norm": 9.643898010253906, |
|
"kl": 33.315061807632446, |
|
"learning_rate": 1.4231132861691128e-06, |
|
"loss": 0.0333, |
|
"reward": 21.43911099433899, |
|
"reward_std": 6.217341110110283, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.6770484782755375, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.4339374974370003, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 216.78125, |
|
"epoch": 3.7183098591549295, |
|
"grad_norm": 14.2260103225708, |
|
"kl": 37.71351116895676, |
|
"learning_rate": 1.3867201247312697e-06, |
|
"loss": 0.0377, |
|
"reward": 19.555387258529663, |
|
"reward_std": 7.710960239171982, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.5077626127749681, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.4069999912753701, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 194.875, |
|
"epoch": 3.7464788732394365, |
|
"grad_norm": 9.09065055847168, |
|
"kl": 14.421021282672882, |
|
"learning_rate": 1.3506189016461674e-06, |
|
"loss": 0.0144, |
|
"reward": 20.84186053276062, |
|
"reward_std": 4.1744231805205345, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.25, |
|
"rewards/question_recreation_reward_func": 0.6275481916964054, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.6674374938011169, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 217.53125, |
|
"epoch": 3.7746478873239435, |
|
"grad_norm": 5.975257873535156, |
|
"kl": 37.89040416479111, |
|
"learning_rate": 1.3148190838338804e-06, |
|
"loss": 0.0379, |
|
"reward": 19.20604932308197, |
|
"reward_std": 5.332434967160225, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.125, |
|
"rewards/question_recreation_reward_func": 0.6709554120898247, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5350937426555902, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 181.125, |
|
"epoch": 3.802816901408451, |
|
"grad_norm": 30.6917667388916, |
|
"kl": 1376.993093073368, |
|
"learning_rate": 1.2793300591761742e-06, |
|
"loss": 1.377, |
|
"reward": 18.036881029605865, |
|
"reward_std": 6.742033764719963, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.5031619630753994, |
|
"rewards/soft_format_reward_func": 0.046875, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.3930937433615327, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 176.6875, |
|
"epoch": 3.830985915492958, |
|
"grad_norm": 20.22711944580078, |
|
"kl": 30287.698419213295, |
|
"learning_rate": 1.2441611340546958e-06, |
|
"loss": 30.2877, |
|
"reward": 16.03737948834896, |
|
"reward_std": 5.3479601461440325, |
|
"rewards/concensus_correctness_reward_func": 12.5, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.5265668611973524, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5420624995604157, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 200.90625, |
|
"epoch": 3.859154929577465, |
|
"grad_norm": 39.15205764770508, |
|
"kl": 100.48482239246368, |
|
"learning_rate": 1.2093215309105352e-06, |
|
"loss": 0.1005, |
|
"reward": 19.245975971221924, |
|
"reward_std": 9.297940351068974, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.4970390796661377, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.48331249598413706, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 183.25, |
|
"epoch": 3.887323943661972, |
|
"grad_norm": 22.197479248046875, |
|
"kl": 61.005871653556824, |
|
"learning_rate": 1.1748203858258056e-06, |
|
"loss": 0.061, |
|
"reward": 17.168922126293182, |
|
"reward_std": 7.946846023201942, |
|
"rewards/concensus_correctness_reward_func": 13.75, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.4897347055375576, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.47606249898672104, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 202.65625, |
|
"epoch": 3.915492957746479, |
|
"grad_norm": 11.439094543457031, |
|
"kl": 89.05792760848999, |
|
"learning_rate": 1.140666746127854e-06, |
|
"loss": 0.0891, |
|
"reward": 19.30165010690689, |
|
"reward_std": 5.133161583915353, |
|
"rewards/concensus_correctness_reward_func": 15.103812456130981, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.5, |
|
"rewards/question_recreation_reward_func": 0.5337746478617191, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5859374962747097, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 216.40625, |
|
"epoch": 3.943661971830986, |
|
"grad_norm": 27.769229888916016, |
|
"kl": 141.26176762580872, |
|
"learning_rate": 1.1068695680167665e-06, |
|
"loss": 0.1413, |
|
"reward": 18.381448954343796, |
|
"reward_std": 6.32744001224637, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.41957394033670425, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.43062499538064003, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 203.59375, |
|
"epoch": 3.971830985915493, |
|
"grad_norm": 25.721193313598633, |
|
"kl": 42.546354830265045, |
|
"learning_rate": 1.0734377142167549e-06, |
|
"loss": 0.0425, |
|
"reward": 17.76998621225357, |
|
"reward_std": 6.668866345658898, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.585392065346241, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.49709375388920307, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 177.125, |
|
"epoch": 4.0, |
|
"grad_norm": 15.914278984069824, |
|
"kl": 69.96249252557755, |
|
"learning_rate": 1.0403799516520619e-06, |
|
"loss": 0.07, |
|
"reward": 18.440218448638916, |
|
"reward_std": 6.883068062365055, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.6001559719443321, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.48068750463426113, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 191.46875, |
|
"epoch": 4.028169014084507, |
|
"grad_norm": 18.65995979309082, |
|
"kl": 66.47544574737549, |
|
"learning_rate": 1.0077049491479874e-06, |
|
"loss": 0.0665, |
|
"reward": 22.233580589294434, |
|
"reward_std": 4.90995267778635, |
|
"rewards/concensus_correctness_reward_func": 18.125, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.5808308683335781, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5589999947696924, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 167.0, |
|
"epoch": 4.056338028169014, |
|
"grad_norm": 21.48002815246582, |
|
"kl": 208.1701105237007, |
|
"learning_rate": 9.754212751576386e-07, |
|
"loss": 0.2082, |
|
"reward": 16.22745645046234, |
|
"reward_std": 10.876746878027916, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.6086752116680145, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.49378124438226223, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 214.75, |
|
"epoch": 4.084507042253521, |
|
"grad_norm": 12.544981002807617, |
|
"kl": 36.315159887075424, |
|
"learning_rate": 9.435373955150032e-07, |
|
"loss": 0.0363, |
|
"reward": 21.26410162448883, |
|
"reward_std": 5.238973140716553, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.6026639565825462, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5051874946802855, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 178.8125, |
|
"epoch": 4.112676056338028, |
|
"grad_norm": 145.86680603027344, |
|
"kl": 139.64250326156616, |
|
"learning_rate": 9.120616712149291e-07, |
|
"loss": 0.1396, |
|
"reward": 18.312809228897095, |
|
"reward_std": 10.484295897185802, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.5245908284559846, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.49134374037384987, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 190.4375, |
|
"epoch": 4.140845070422535, |
|
"grad_norm": 18.74555778503418, |
|
"kl": 50.41434487700462, |
|
"learning_rate": 8.810023562206e-07, |
|
"loss": 0.0504, |
|
"reward": 17.838220357894897, |
|
"reward_std": 9.871636435389519, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.5950643494725227, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.39940624311566353, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 205.09375, |
|
"epoch": 4.169014084507042, |
|
"grad_norm": 15.046422004699707, |
|
"kl": 29.03540551662445, |
|
"learning_rate": 8.503675952990756e-07, |
|
"loss": 0.029, |
|
"reward": 21.843222498893738, |
|
"reward_std": 5.1342647187411785, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.375, |
|
"rewards/question_recreation_reward_func": 0.6643783301115036, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5069687478244305, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 165.78125, |
|
"epoch": 4.197183098591549, |
|
"grad_norm": 17.604740142822266, |
|
"kl": 51.230829417705536, |
|
"learning_rate": 8.20165421885469e-07, |
|
"loss": 0.0512, |
|
"reward": 17.63426423072815, |
|
"reward_std": 6.369495037943125, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.4940767101943493, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.43706249073147774, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 235.375, |
|
"epoch": 4.225352112676056, |
|
"grad_norm": 6.080423355102539, |
|
"kl": 13.085180699825287, |
|
"learning_rate": 7.904037559763162e-07, |
|
"loss": 0.0131, |
|
"reward": 19.109760493040085, |
|
"reward_std": 1.1591962296515703, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.6586043164134026, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5605312576517463, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 194.9375, |
|
"epoch": 4.253521126760563, |
|
"grad_norm": 11.369367599487305, |
|
"kl": 44.47002148628235, |
|
"learning_rate": 7.610904020526938e-07, |
|
"loss": 0.0445, |
|
"reward": 16.67498344182968, |
|
"reward_std": 9.237527802586555, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.5777959898114204, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.6753124929964542, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 189.65625, |
|
"epoch": 4.28169014084507, |
|
"grad_norm": 21.256389617919922, |
|
"kl": 71.40908312797546, |
|
"learning_rate": 7.322330470336314e-07, |
|
"loss": 0.0714, |
|
"reward": 20.69408369064331, |
|
"reward_std": 6.768543675541878, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.6492089293897152, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.49800000712275505, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 195.875, |
|
"epoch": 4.309859154929577, |
|
"grad_norm": 7.1469597816467285, |
|
"kl": 33.71372902393341, |
|
"learning_rate": 7.038392582603481e-07, |
|
"loss": 0.0337, |
|
"reward": 20.510346174240112, |
|
"reward_std": 6.731610176153481, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.5321269854903221, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.4938437454402447, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 177.1875, |
|
"epoch": 4.338028169014084, |
|
"grad_norm": 292.14501953125, |
|
"kl": 235.51378059387207, |
|
"learning_rate": 6.759164815118493e-07, |
|
"loss": 0.2355, |
|
"reward": 16.430650651454926, |
|
"reward_std": 5.297965854406357, |
|
"rewards/concensus_correctness_reward_func": 12.5, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.616213109344244, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.6581875011324883, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 216.96875, |
|
"epoch": 4.366197183098592, |
|
"grad_norm": 6.305967807769775, |
|
"kl": 341.7059046626091, |
|
"learning_rate": 6.484720390524008e-07, |
|
"loss": 0.3417, |
|
"reward": 21.65046501159668, |
|
"reward_std": 6.0755148604512215, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.5807776674628258, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5540624931454659, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 173.625, |
|
"epoch": 4.394366197183099, |
|
"grad_norm": 102.45231628417969, |
|
"kl": 167.21936225891113, |
|
"learning_rate": 6.2151312771139e-07, |
|
"loss": 0.1672, |
|
"reward": 18.467550039291382, |
|
"reward_std": 6.471917539834976, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.3949254211038351, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.5257500112056732, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 151.34375, |
|
"epoch": 4.422535211267606, |
|
"grad_norm": 75.45765686035156, |
|
"kl": 130.43919348716736, |
|
"learning_rate": 5.950468169960846e-07, |
|
"loss": 0.1304, |
|
"reward": 17.570589900016785, |
|
"reward_std": 9.95434544980526, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.4281215965747833, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.579968761652708, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 194.28125, |
|
"epoch": 4.450704225352113, |
|
"grad_norm": 150.11294555664062, |
|
"kl": 140.0380249619484, |
|
"learning_rate": 5.690800472377747e-07, |
|
"loss": 0.14, |
|
"reward": 19.77779531478882, |
|
"reward_std": 8.302450500428677, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.48335786536335945, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.4975624978542328, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 175.59375, |
|
"epoch": 4.47887323943662, |
|
"grad_norm": 18.749839782714844, |
|
"kl": 35.669027864933014, |
|
"learning_rate": 5.436196277717928e-07, |
|
"loss": 0.0357, |
|
"reward": 20.80094337463379, |
|
"reward_std": 6.281726138666272, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.5344437230378389, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5477499924600124, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 250.53125, |
|
"epoch": 4.507042253521127, |
|
"grad_norm": 20.69882583618164, |
|
"kl": 60.95956812798977, |
|
"learning_rate": 5.186722351518822e-07, |
|
"loss": 0.061, |
|
"reward": 16.334015995264053, |
|
"reward_std": 5.083772074431181, |
|
"rewards/concensus_correctness_reward_func": 12.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.6849223002791405, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.6334687490016222, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 163.90625, |
|
"epoch": 4.535211267605634, |
|
"grad_norm": 25.738649368286133, |
|
"kl": 61.802970230579376, |
|
"learning_rate": 4.94244411399388e-07, |
|
"loss": 0.0618, |
|
"reward": 21.02661967277527, |
|
"reward_std": 6.551919437944889, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.497026052325964, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.592093750834465, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 164.8125, |
|
"epoch": 4.563380281690141, |
|
"grad_norm": 256.33447265625, |
|
"kl": 241.42089343070984, |
|
"learning_rate": 4.703425622877239e-07, |
|
"loss": 0.2414, |
|
"reward": 17.63269305229187, |
|
"reward_std": 10.951422438025475, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.450318006798625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.6042499914765358, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 165.125, |
|
"epoch": 4.591549295774648, |
|
"grad_norm": 31.652605056762695, |
|
"kl": 43.60177397727966, |
|
"learning_rate": 4.469729556625704e-07, |
|
"loss": 0.0436, |
|
"reward": 19.482061743736267, |
|
"reward_std": 8.377652376890182, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.125, |
|
"rewards/question_recreation_reward_func": 0.5043744444847107, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.6183125004172325, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 175.96875, |
|
"epoch": 4.619718309859155, |
|
"grad_norm": 10.242480278015137, |
|
"kl": 21.965113878250122, |
|
"learning_rate": 4.2414171979824e-07, |
|
"loss": 0.022, |
|
"reward": 20.588115096092224, |
|
"reward_std": 6.254963330924511, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.47283417731523514, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5996562596410513, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 206.90625, |
|
"epoch": 4.647887323943662, |
|
"grad_norm": 114.71504211425781, |
|
"kl": 82.849600315094, |
|
"learning_rate": 4.0185484179064427e-07, |
|
"loss": 0.0828, |
|
"reward": 16.668977200984955, |
|
"reward_std": 6.640440072864294, |
|
"rewards/concensus_correctness_reward_func": 13.75, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.6327583584934473, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.3799687549471855, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 193.78125, |
|
"epoch": 4.676056338028169, |
|
"grad_norm": 58.762516021728516, |
|
"kl": 94.41752421855927, |
|
"learning_rate": 3.801181659872805e-07, |
|
"loss": 0.0944, |
|
"reward": 17.93424743413925, |
|
"reward_std": 7.9948363825678825, |
|
"rewards/concensus_correctness_reward_func": 13.75, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.615435041487217, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.6625625044107437, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 205.65625, |
|
"epoch": 4.704225352112676, |
|
"grad_norm": 16.824317932128906, |
|
"kl": 88.85696315765381, |
|
"learning_rate": 3.5893739245465465e-07, |
|
"loss": 0.0889, |
|
"reward": 13.142742186784744, |
|
"reward_std": 8.422066152095795, |
|
"rewards/concensus_correctness_reward_func": 10.0, |
|
"rewards/consensus_reward_func": 1.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.5604296084493399, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.551062498241663, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 191.28125, |
|
"epoch": 4.732394366197183, |
|
"grad_norm": 28.791719436645508, |
|
"kl": 14.142897069454193, |
|
"learning_rate": 3.383180754835344e-07, |
|
"loss": 0.0141, |
|
"reward": 15.803372830152512, |
|
"reward_std": 6.4000239027664065, |
|
"rewards/concensus_correctness_reward_func": 12.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.3909041713923216, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.4905937425792217, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 184.5625, |
|
"epoch": 4.76056338028169, |
|
"grad_norm": 16.70457649230957, |
|
"kl": 52.15749150514603, |
|
"learning_rate": 3.182656221324384e-07, |
|
"loss": 0.0522, |
|
"reward": 20.181472659111023, |
|
"reward_std": 5.007015394046903, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.125, |
|
"rewards/question_recreation_reward_func": 0.5429101679474115, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.6073125060647726, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 194.09375, |
|
"epoch": 4.788732394366197, |
|
"grad_norm": 9.146194458007812, |
|
"kl": 70.11677631735802, |
|
"learning_rate": 2.98785290809723e-07, |
|
"loss": 0.0701, |
|
"reward": 18.03154420852661, |
|
"reward_std": 6.602232605218887, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.5392944887280464, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5547500140964985, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 201.0625, |
|
"epoch": 4.816901408450704, |
|
"grad_norm": 10.930744171142578, |
|
"kl": 49.3902553319931, |
|
"learning_rate": 2.798821898946588e-07, |
|
"loss": 0.0494, |
|
"reward": 17.96576575934887, |
|
"reward_std": 6.204115567728877, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.875, |
|
"rewards/question_recreation_reward_func": 0.5690469006076455, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.5685937367379665, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 183.75, |
|
"epoch": 4.845070422535211, |
|
"grad_norm": 11.14244556427002, |
|
"kl": 530.0989896059036, |
|
"learning_rate": 2.615612763978462e-07, |
|
"loss": 0.5301, |
|
"reward": 20.103400349617004, |
|
"reward_std": 8.350226640701294, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.5339003503322601, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.109375, |
|
"rewards/xmlcount_reward_func": 0.647624995559454, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 188.6875, |
|
"epoch": 4.873239436619718, |
|
"grad_norm": 77.03536987304688, |
|
"kl": 119.32902491092682, |
|
"learning_rate": 2.438273546613257e-07, |
|
"loss": 0.1193, |
|
"reward": 15.51081308722496, |
|
"reward_std": 6.711298692971468, |
|
"rewards/concensus_correctness_reward_func": 11.875, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.6048446670174599, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5622187480330467, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 173.5, |
|
"epoch": 4.901408450704225, |
|
"grad_norm": 10.03520679473877, |
|
"kl": 680.569248855114, |
|
"learning_rate": 2.2668507509871957e-07, |
|
"loss": 0.6806, |
|
"reward": 20.39792001247406, |
|
"reward_std": 7.9413245394825935, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.6092325560748577, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.6480624973773956, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 211.9375, |
|
"epoch": 4.929577464788732, |
|
"grad_norm": 9.544289588928223, |
|
"kl": 67.18813559412956, |
|
"learning_rate": 2.1013893297574777e-07, |
|
"loss": 0.0672, |
|
"reward": 18.139901995658875, |
|
"reward_std": 7.58577387034893, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.5312769636511803, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.48362499848008156, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 158.875, |
|
"epoch": 4.957746478873239, |
|
"grad_norm": 1220.3043212890625, |
|
"kl": 410.9177169203758, |
|
"learning_rate": 1.9419326723141534e-07, |
|
"loss": 0.4109, |
|
"reward": 20.147993981838226, |
|
"reward_std": 6.549997612833977, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.4576812032610178, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.5496875047683716, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 189.53125, |
|
"epoch": 4.985915492957746, |
|
"grad_norm": 107952.3671875, |
|
"kl": 50374.548123419285, |
|
"learning_rate": 1.788522593402059e-07, |
|
"loss": 50.3746, |
|
"reward": 19.29557180404663, |
|
"reward_std": 9.341163873672485, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.4647905360907316, |
|
"rewards/soft_format_reward_func": 0.046875, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.47140624839812517, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 173.28125, |
|
"epoch": 5.014084507042254, |
|
"grad_norm": 20.18059730529785, |
|
"kl": 56.19126904010773, |
|
"learning_rate": 1.6411993221555928e-07, |
|
"loss": 0.0562, |
|
"reward": 18.442645728588104, |
|
"reward_std": 7.794232741463929, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.4293326549232006, |
|
"rewards/soft_format_reward_func": 0.03125, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.5133124887943268, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 197.28125, |
|
"epoch": 5.042253521126761, |
|
"grad_norm": 98472.9296875, |
|
"kl": 42849.533732414246, |
|
"learning_rate": 1.5000014915493467e-07, |
|
"loss": 42.8495, |
|
"reward": 18.569206684827805, |
|
"reward_std": 6.2984634116292, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.6072378233075142, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.49321874510496855, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 180.75, |
|
"epoch": 5.070422535211268, |
|
"grad_norm": 17.634912490844727, |
|
"kl": 57.633513152599335, |
|
"learning_rate": 1.3649661282672478e-07, |
|
"loss": 0.0576, |
|
"reward": 16.21313813328743, |
|
"reward_std": 6.453056633472443, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.5248567461967468, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.46953125298023224, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 181.9375, |
|
"epoch": 5.098591549295775, |
|
"grad_norm": 6.022618293762207, |
|
"kl": 55.502021461725235, |
|
"learning_rate": 1.2361286429929953e-07, |
|
"loss": 0.0555, |
|
"reward": 21.646337032318115, |
|
"reward_std": 4.948778457939625, |
|
"rewards/concensus_correctness_reward_func": 17.5, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.1875, |
|
"rewards/question_recreation_reward_func": 0.500399325042963, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.6146875005215406, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 218.21875, |
|
"epoch": 5.126760563380282, |
|
"grad_norm": 24.87447166442871, |
|
"kl": 70.74051466584206, |
|
"learning_rate": 1.1135228211241827e-07, |
|
"loss": 0.0707, |
|
"reward": 14.870485126972198, |
|
"reward_std": 8.201804894953966, |
|
"rewards/concensus_correctness_reward_func": 11.875, |
|
"rewards/consensus_reward_func": 1.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.6112355031073093, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.4311249917373061, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 200.875, |
|
"epoch": 5.154929577464789, |
|
"grad_norm": 17.180267333984375, |
|
"kl": 47.48751229047775, |
|
"learning_rate": 9.97180813912682e-08, |
|
"loss": 0.0475, |
|
"reward": 16.328749358654022, |
|
"reward_std": 8.015842709690332, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.43224936723709106, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5215000063180923, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 160.96875, |
|
"epoch": 5.183098591549296, |
|
"grad_norm": 22.877119064331055, |
|
"kl": 52.39796483516693, |
|
"learning_rate": 8.871331300335322e-08, |
|
"loss": 0.0524, |
|
"reward": 20.07635807991028, |
|
"reward_std": 5.351797789335251, |
|
"rewards/concensus_correctness_reward_func": 16.875, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.4254519008100033, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.5259062610566616, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 187.78125, |
|
"epoch": 5.211267605633803, |
|
"grad_norm": 13.897777557373047, |
|
"kl": 37.57750755548477, |
|
"learning_rate": 7.834086275845587e-08, |
|
"loss": 0.0376, |
|
"reward": 18.33556878566742, |
|
"reward_std": 9.281399443745613, |
|
"rewards/concensus_correctness_reward_func": 15.0, |
|
"rewards/consensus_reward_func": 1.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.4743501963093877, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.5487187346443534, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 191.625, |
|
"epoch": 5.23943661971831, |
|
"grad_norm": 50.67697525024414, |
|
"kl": 90.23001140356064, |
|
"learning_rate": 6.860345065188512e-08, |
|
"loss": 0.0902, |
|
"reward": 20.43578866124153, |
|
"reward_std": 3.805197238922119, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.5983824506402016, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.125, |
|
"rewards/xmlcount_reward_func": 0.634281262755394, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 148.75, |
|
"epoch": 5.267605633802817, |
|
"grad_norm": 24.013933181762695, |
|
"kl": 979.8315967023373, |
|
"learning_rate": 5.9503630151205025e-08, |
|
"loss": 0.9798, |
|
"reward": 14.10513174533844, |
|
"reward_std": 8.168088547885418, |
|
"rewards/concensus_correctness_reward_func": 11.25, |
|
"rewards/consensus_reward_func": 1.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.42822520434856415, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.36440624482929707, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 200.625, |
|
"epoch": 5.295774647887324, |
|
"grad_norm": 10426.9599609375, |
|
"kl": 5901.08397424221, |
|
"learning_rate": 5.104378752663008e-08, |
|
"loss": 5.9011, |
|
"reward": 19.324188113212585, |
|
"reward_std": 8.048752292990685, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.5999692752957344, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5367187485098839, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 227.375, |
|
"epoch": 5.323943661971831, |
|
"grad_norm": 14.172988891601562, |
|
"kl": 192.83756294846535, |
|
"learning_rate": 4.3226141225268804e-08, |
|
"loss": 0.1928, |
|
"reward": 16.749698162078857, |
|
"reward_std": 10.971801988780499, |
|
"rewards/concensus_correctness_reward_func": 13.75, |
|
"rewards/consensus_reward_func": 1.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.45266704447567463, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.4376562498509884, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 175.5, |
|
"epoch": 5.352112676056338, |
|
"grad_norm": 226.3981170654297, |
|
"kl": 166.27400428056717, |
|
"learning_rate": 3.605274128937464e-08, |
|
"loss": 0.1663, |
|
"reward": 17.65935444831848, |
|
"reward_std": 5.609901927411556, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.43151059560477734, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5403437651693821, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 177.84375, |
|
"epoch": 5.380281690140845, |
|
"grad_norm": 11.149412155151367, |
|
"kl": 77.37025237083435, |
|
"learning_rate": 2.9525468818755455e-08, |
|
"loss": 0.0774, |
|
"reward": 16.605177223682404, |
|
"reward_std": 7.419661745429039, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.6875, |
|
"rewards/question_recreation_reward_func": 0.6258334219455719, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5418437719345093, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 209.125, |
|
"epoch": 5.408450704225352, |
|
"grad_norm": 211.39231872558594, |
|
"kl": 465.8537292480469, |
|
"learning_rate": 2.3646035477491726e-08, |
|
"loss": 0.4659, |
|
"reward": 18.196722507476807, |
|
"reward_std": 2.805102661252022, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.5984417237341404, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5670312382280827, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 202.34375, |
|
"epoch": 5.436619718309859, |
|
"grad_norm": 1496.8240966796875, |
|
"kl": 205.22770684957504, |
|
"learning_rate": 1.841598304507891e-08, |
|
"loss": 0.2052, |
|
"reward": 14.782392874360085, |
|
"reward_std": 6.89620116353035, |
|
"rewards/concensus_correctness_reward_func": 11.875, |
|
"rewards/consensus_reward_func": 1.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.47601788584142923, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.3532499959692359, |
|
"step": 386 |
|
}, |
|
{ |
|
"completion_length": 167.96875, |
|
"epoch": 5.464788732394366, |
|
"grad_norm": 20.663803100585938, |
|
"kl": 39.010592728853226, |
|
"learning_rate": 1.383668301212393e-08, |
|
"loss": 0.039, |
|
"reward": 18.894488275051117, |
|
"reward_std": 7.656329156830907, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.48589482717216015, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.4242187514901161, |
|
"step": 388 |
|
}, |
|
{ |
|
"completion_length": 165.15625, |
|
"epoch": 5.492957746478873, |
|
"grad_norm": 565.16455078125, |
|
"kl": 197.70873486995697, |
|
"learning_rate": 9.90933622069562e-09, |
|
"loss": 0.1977, |
|
"reward": 17.981685161590576, |
|
"reward_std": 6.837831487879157, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.5238723792135715, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5828124936670065, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 182.0625, |
|
"epoch": 5.52112676056338, |
|
"grad_norm": 8.814075469970703, |
|
"kl": 79.46024709939957, |
|
"learning_rate": 6.634972549423857e-09, |
|
"loss": 0.0795, |
|
"reward": 16.639443710446358, |
|
"reward_std": 6.897542349994183, |
|
"rewards/concensus_correctness_reward_func": 13.125, |
|
"rewards/consensus_reward_func": 1.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.9375, |
|
"rewards/question_recreation_reward_func": 0.6201934851706028, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5348749901168048, |
|
"step": 392 |
|
}, |
|
{ |
|
"completion_length": 155.0, |
|
"epoch": 5.549295774647887, |
|
"grad_norm": 509.7919006347656, |
|
"kl": 377.71353951096535, |
|
"learning_rate": 4.01445064343281e-09, |
|
"loss": 0.3777, |
|
"reward": 19.832693457603455, |
|
"reward_std": 6.739683650434017, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.48700585681945086, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.6425625011324883, |
|
"step": 394 |
|
}, |
|
{ |
|
"completion_length": 200.125, |
|
"epoch": 5.577464788732394, |
|
"grad_norm": 38.70952224731445, |
|
"kl": 128.81593072414398, |
|
"learning_rate": 2.048457689174943e-09, |
|
"loss": 0.1288, |
|
"reward": 19.608071088790894, |
|
"reward_std": 9.304894164204597, |
|
"rewards/concensus_correctness_reward_func": 15.625, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0625, |
|
"rewards/question_recreation_reward_func": 0.6501020789146423, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.09375, |
|
"rewards/xmlcount_reward_func": 0.5985937379300594, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 146.4375, |
|
"epoch": 5.605633802816901, |
|
"grad_norm": 10.314447402954102, |
|
"kl": 102.41072046756744, |
|
"learning_rate": 7.375092342298828e-10, |
|
"loss": 0.1024, |
|
"reward": 17.96188724040985, |
|
"reward_std": 8.251108340919018, |
|
"rewards/concensus_correctness_reward_func": 14.375, |
|
"rewards/consensus_reward_func": 1.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 1.0, |
|
"rewards/question_recreation_reward_func": 0.5060124294832349, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.5652500111609697, |
|
"step": 398 |
|
}, |
|
{ |
|
"completion_length": 160.90625, |
|
"epoch": 5.633802816901408, |
|
"grad_norm": 83.75353240966797, |
|
"kl": 102.39953392744064, |
|
"learning_rate": 8.194905210923143e-11, |
|
"loss": 0.1024, |
|
"reward": 19.95249879360199, |
|
"reward_std": 8.20958011969924, |
|
"rewards/concensus_correctness_reward_func": 16.25, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.8125, |
|
"rewards/question_recreation_reward_func": 0.5820612944662571, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.078125, |
|
"rewards/xmlcount_reward_func": 0.6048125065863132, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.633802816901408, |
|
"step": 400, |
|
"total_flos": 0.0, |
|
"train_loss": 0.7875715676811523, |
|
"train_runtime": 3812.444, |
|
"train_samples_per_second": 1.679, |
|
"train_steps_per_second": 0.105 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|