itufilum's picture
End of training
0c509be verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.633802816901408,
"eval_steps": 500,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 194.03125,
"epoch": 0.028169014084507043,
"grad_norm": 15.590116500854492,
"kl": 67.96075868606567,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.068,
"reward": 18.734252750873566,
"reward_std": 6.550818961113691,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.6794400699436665,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6173125002533197,
"step": 2
},
{
"completion_length": 205.28125,
"epoch": 0.056338028169014086,
"grad_norm": 530.9674072265625,
"kl": 265.4313408136368,
"learning_rate": 1.25e-06,
"loss": 0.2654,
"reward": 21.486406087875366,
"reward_std": 6.450116660445929,
"rewards/concensus_correctness_reward_func": 16.35931247472763,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.75,
"rewards/question_recreation_reward_func": 0.8441246300935745,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.171875,
"rewards/xmlcount_reward_func": 0.7360937558114529,
"step": 4
},
{
"completion_length": 247.0,
"epoch": 0.08450704225352113,
"grad_norm": 7.194105625152588,
"kl": 10.1644686460495,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.0102,
"reward": 24.599268674850464,
"reward_std": 1.6305736564099789,
"rewards/concensus_correctness_reward_func": 19.375,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.75,
"rewards/question_recreation_reward_func": 0.8394556865096092,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5723125021904707,
"step": 6
},
{
"completion_length": 205.78125,
"epoch": 0.11267605633802817,
"grad_norm": 14.453492164611816,
"kl": 29.00755314528942,
"learning_rate": 2.916666666666667e-06,
"loss": 0.029,
"reward": 21.17924928665161,
"reward_std": 7.817702278494835,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.625,
"rewards/question_recreation_reward_func": 0.7514051459729671,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.7715937495231628,
"step": 8
},
{
"completion_length": 254.78125,
"epoch": 0.14084507042253522,
"grad_norm": 4.9339776039123535,
"kl": 534.9701635837555,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.535,
"reward": 22.273706436157227,
"reward_std": 5.927923844195902,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.6875,
"rewards/question_recreation_reward_func": 0.7792688012123108,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.4475625064224005,
"step": 10
},
{
"completion_length": 223.84375,
"epoch": 0.16901408450704225,
"grad_norm": 126.1221923828125,
"kl": 111.43436747789383,
"learning_rate": 4.583333333333333e-06,
"loss": 0.1114,
"reward": 19.623693704605103,
"reward_std": 9.295258034020662,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.625,
"rewards/question_recreation_reward_func": 0.8043808117508888,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.5849375165998936,
"step": 12
},
{
"completion_length": 258.84375,
"epoch": 0.19718309859154928,
"grad_norm": 13.649314880371094,
"kl": 29.206983238458633,
"learning_rate": 4.999918050947891e-06,
"loss": 0.0292,
"reward": 18.92366051673889,
"reward_std": 7.471162365749478,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.851754330098629,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.41565624810755253,
"step": 14
},
{
"completion_length": 210.9375,
"epoch": 0.22535211267605634,
"grad_norm": 7.758904457092285,
"kl": 288.0020731687546,
"learning_rate": 4.99926249076577e-06,
"loss": 0.288,
"reward": 18.59817200899124,
"reward_std": 4.863896086812019,
"rewards/concensus_correctness_reward_func": 13.75,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.625,
"rewards/question_recreation_reward_func": 0.6548594255000353,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.203125,
"rewards/xmlcount_reward_func": 0.7870624996721745,
"step": 16
},
{
"completion_length": 292.53125,
"epoch": 0.2535211267605634,
"grad_norm": 6.619876384735107,
"kl": 22.940940707921982,
"learning_rate": 4.9979515423108255e-06,
"loss": 0.0229,
"reward": 20.082338631153107,
"reward_std": 4.752222462557256,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.8020261619240046,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.4053125027567148,
"step": 18
},
{
"completion_length": 238.28125,
"epoch": 0.28169014084507044,
"grad_norm": 9.172245025634766,
"kl": 60.055853977799416,
"learning_rate": 4.995985549356568e-06,
"loss": 0.0601,
"reward": 16.823714524507523,
"reward_std": 5.831961344927549,
"rewards/concensus_correctness_reward_func": 11.983624935150146,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.7774647548794746,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.1875,
"rewards/xmlcount_reward_func": 0.7501249983906746,
"step": 20
},
{
"completion_length": 230.53125,
"epoch": 0.30985915492957744,
"grad_norm": 6.337810516357422,
"kl": 20.89355828613043,
"learning_rate": 4.993365027450576e-06,
"loss": 0.0209,
"reward": 22.543599009513855,
"reward_std": 4.786159439012408,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.75,
"rewards/question_recreation_reward_func": 0.654692716896534,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.1875,
"rewards/xmlcount_reward_func": 0.7014062590897083,
"step": 22
},
{
"completion_length": 233.15625,
"epoch": 0.3380281690140845,
"grad_norm": 5.86886739730835,
"kl": 4.685514692217112,
"learning_rate": 4.990090663779305e-06,
"loss": 0.0047,
"reward": 24.771531105041504,
"reward_std": 2.2020363211631775,
"rewards/concensus_correctness_reward_func": 19.375,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.625,
"rewards/question_recreation_reward_func": 0.7914998307824135,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.203125,
"rewards/xmlcount_reward_func": 0.8237812481820583,
"step": 24
},
{
"completion_length": 218.1875,
"epoch": 0.36619718309859156,
"grad_norm": 8.693346977233887,
"kl": 22.532070949673653,
"learning_rate": 4.986163316987877e-06,
"loss": 0.0225,
"reward": 23.11518883705139,
"reward_std": 5.961791490204632,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.8125,
"rewards/question_recreation_reward_func": 0.7831886559724808,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.296875,
"rewards/xmlcount_reward_func": 0.9413749948143959,
"step": 26
},
{
"completion_length": 199.875,
"epoch": 0.39436619718309857,
"grad_norm": 45.534908294677734,
"kl": 43.07646985352039,
"learning_rate": 4.9815840169549216e-06,
"loss": 0.0431,
"reward": 22.956753134727478,
"reward_std": 3.708504168316722,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.8125,
"rewards/question_recreation_reward_func": 0.8589722141623497,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.203125,
"rewards/xmlcount_reward_func": 0.8009062558412552,
"step": 28
},
{
"completion_length": 229.34375,
"epoch": 0.4225352112676056,
"grad_norm": 188.76895141601562,
"kl": 96.50678093731403,
"learning_rate": 4.976353964522509e-06,
"loss": 0.0965,
"reward": 22.132088541984558,
"reward_std": 6.213916528970003,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.8125,
"rewards/question_recreation_reward_func": 0.813995435833931,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.1875,
"rewards/xmlcount_reward_func": 0.7555937431752682,
"step": 30
},
{
"completion_length": 251.0,
"epoch": 0.4507042253521127,
"grad_norm": 89.95954895019531,
"kl": 42.50997355952859,
"learning_rate": 4.970474531181245e-06,
"loss": 0.0425,
"reward": 21.101596146821976,
"reward_std": 3.359893566928804,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.8080957010388374,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.5278750080615282,
"step": 32
},
{
"completion_length": 270.4375,
"epoch": 0.4788732394366197,
"grad_norm": 7.159509181976318,
"kl": 37.1051784530282,
"learning_rate": 4.963947258710626e-06,
"loss": 0.0371,
"reward": 21.699138522148132,
"reward_std": 2.3117441162467003,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5625,
"rewards/question_recreation_reward_func": 0.7434196844696999,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.5025937538594007,
"step": 34
},
{
"completion_length": 234.71875,
"epoch": 0.5070422535211268,
"grad_norm": 8.73612117767334,
"kl": 2501.0413611084223,
"learning_rate": 4.9567738587747314e-06,
"loss": 2.501,
"reward": 16.33679434657097,
"reward_std": 5.108438193798065,
"rewards/concensus_correctness_reward_func": 11.875,
"rewards/consensus_reward_func": 1.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.7808256670832634,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.6809687577188015,
"step": 36
},
{
"completion_length": 278.25,
"epoch": 0.5352112676056338,
"grad_norm": 3.27095627784729,
"kl": 10.008030999451876,
"learning_rate": 4.948956212473371e-06,
"loss": 0.01,
"reward": 16.04705312848091,
"reward_std": 2.1642781402915716,
"rewards/concensus_correctness_reward_func": 11.875,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.7825527861714363,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.38950000517070293,
"step": 38
},
{
"completion_length": 225.75,
"epoch": 0.5633802816901409,
"grad_norm": 9.338530540466309,
"kl": 121.52913957834244,
"learning_rate": 4.940496369848795e-06,
"loss": 0.1215,
"reward": 14.5195372402668,
"reward_std": 6.823352798819542,
"rewards/concensus_correctness_reward_func": 10.720062494277954,
"rewards/consensus_reward_func": 1.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.6889123450964689,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5324374958872795,
"step": 40
},
{
"completion_length": 273.71875,
"epoch": 0.5915492957746479,
"grad_norm": 3.6006832122802734,
"kl": 9.235054649412632,
"learning_rate": 4.931396549348115e-06,
"loss": 0.0092,
"reward": 23.575079202651978,
"reward_std": 3.452487599104643,
"rewards/concensus_correctness_reward_func": 18.75,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.625,
"rewards/question_recreation_reward_func": 0.8597666844725609,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.38718749675899744,
"step": 42
},
{
"completion_length": 228.65625,
"epoch": 0.6197183098591549,
"grad_norm": 184.4779510498047,
"kl": 66.61591627448797,
"learning_rate": 4.921659137241544e-06,
"loss": 0.0666,
"reward": 21.40922224521637,
"reward_std": 6.415304251015186,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5625,
"rewards/question_recreation_reward_func": 0.788409948348999,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.4489374943077564,
"step": 44
},
{
"completion_length": 261.5625,
"epoch": 0.647887323943662,
"grad_norm": 2.263350009918213,
"kl": 5.652421373873949,
"learning_rate": 4.911286686996648e-06,
"loss": 0.0057,
"reward": 22.376519441604614,
"reward_std": 0.7325459104031324,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5625,
"rewards/question_recreation_reward_func": 0.8018633462488651,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.48090626299381256,
"step": 46
},
{
"completion_length": 218.8125,
"epoch": 0.676056338028169,
"grad_norm": 6.410772323608398,
"kl": 17.982181690633297,
"learning_rate": 4.900281918608732e-06,
"loss": 0.018,
"reward": 23.878349542617798,
"reward_std": 3.440878137946129,
"rewards/concensus_correctness_reward_func": 18.75,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.6875,
"rewards/question_recreation_reward_func": 0.8038181811571121,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.6370312627404928,
"step": 48
},
{
"completion_length": 216.34375,
"epoch": 0.704225352112676,
"grad_norm": 12.63294506072998,
"kl": 20.948822245001793,
"learning_rate": 4.888647717887582e-06,
"loss": 0.0209,
"reward": 18.739310264587402,
"reward_std": 6.688391337171197,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.669216588139534,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6013437523506582,
"step": 50
},
{
"completion_length": 232.8125,
"epoch": 0.7323943661971831,
"grad_norm": 4.447786808013916,
"kl": 24.20983089506626,
"learning_rate": 4.876387135700701e-06,
"loss": 0.0242,
"reward": 23.035974979400635,
"reward_std": 4.954922638833523,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5625,
"rewards/question_recreation_reward_func": 0.8577248528599739,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.5532500119879842,
"step": 52
},
{
"completion_length": 219.03125,
"epoch": 0.7605633802816901,
"grad_norm": 5.603798866271973,
"kl": 47.00118863582611,
"learning_rate": 4.863503387173276e-06,
"loss": 0.047,
"reward": 17.631301164627075,
"reward_std": 6.2861207174137235,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5625,
"rewards/question_recreation_reward_func": 0.7153325416147709,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5722187510691583,
"step": 54
},
{
"completion_length": 235.59375,
"epoch": 0.7887323943661971,
"grad_norm": 2.9642035961151123,
"kl": 12.14004921168089,
"learning_rate": 4.849999850845066e-06,
"loss": 0.0121,
"reward": 20.853686690330505,
"reward_std": 3.2716546999290586,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.8614683747291565,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.42971874959766865,
"step": 56
},
{
"completion_length": 204.15625,
"epoch": 0.8169014084507042,
"grad_norm": 26.085582733154297,
"kl": 55.548476845026016,
"learning_rate": 4.835880067784441e-06,
"loss": 0.0555,
"reward": 22.38655924797058,
"reward_std": 5.256647571921349,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.6875,
"rewards/question_recreation_reward_func": 0.7643403187394142,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.5597187718376517,
"step": 58
},
{
"completion_length": 224.78125,
"epoch": 0.8450704225352113,
"grad_norm": 16.9488468170166,
"kl": 40.86702236533165,
"learning_rate": 4.821147740659795e-06,
"loss": 0.0409,
"reward": 21.230697870254517,
"reward_std": 7.334933251142502,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.6201041154563427,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5480937454849482,
"step": 60
},
{
"completion_length": 258.875,
"epoch": 0.8732394366197183,
"grad_norm": 20.7008056640625,
"kl": 29.40393216907978,
"learning_rate": 4.805806732768585e-06,
"loss": 0.0294,
"reward": 20.777413338422775,
"reward_std": 3.6380002200603485,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.3125,
"rewards/question_recreation_reward_func": 0.8443503454327583,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.49556251242756844,
"step": 62
},
{
"completion_length": 187.5625,
"epoch": 0.9014084507042254,
"grad_norm": 736.5669555664062,
"kl": 481.64947575330734,
"learning_rate": 4.789861067024253e-06,
"loss": 0.4816,
"reward": 21.166576385498047,
"reward_std": 6.786355759948492,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.5792638175189495,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.540437500923872,
"step": 64
},
{
"completion_length": 214.96875,
"epoch": 0.9295774647887324,
"grad_norm": 35.92903137207031,
"kl": 103.99411916732788,
"learning_rate": 4.773314924901281e-06,
"loss": 0.104,
"reward": 19.490996658802032,
"reward_std": 7.800664484500885,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.80152777582407,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6425937414169312,
"step": 66
},
{
"completion_length": 235.125,
"epoch": 0.9577464788732394,
"grad_norm": 10.029170989990234,
"kl": 18.18737083673477,
"learning_rate": 4.756172645338675e-06,
"loss": 0.0182,
"reward": 19.079940140247345,
"reward_std": 6.256794525776058,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.6954717859625816,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.43134375661611557,
"step": 68
},
{
"completion_length": 226.34375,
"epoch": 0.9859154929577465,
"grad_norm": 5.851627349853516,
"kl": 14.625127524137497,
"learning_rate": 4.738438723602154e-06,
"loss": 0.0146,
"reward": 23.478822708129883,
"reward_std": 3.651876477524638,
"rewards/concensus_correctness_reward_func": 18.75,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.625,
"rewards/question_recreation_reward_func": 0.6802600920200348,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5016875043511391,
"step": 70
},
{
"completion_length": 227.84375,
"epoch": 1.0140845070422535,
"grad_norm": 9.736303329467773,
"kl": 32.25694251060486,
"learning_rate": 4.720117810105341e-06,
"loss": 0.0323,
"reward": 21.3710036277771,
"reward_std": 7.411025664303452,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.8385977782309055,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.4386562556028366,
"step": 72
},
{
"completion_length": 197.375,
"epoch": 1.0422535211267605,
"grad_norm": 29.349096298217773,
"kl": 51.72871816158295,
"learning_rate": 4.701214709190277e-06,
"loss": 0.0517,
"reward": 21.43280816078186,
"reward_std": 7.328175559639931,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.6436204127967358,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6173124928027391,
"step": 74
},
{
"completion_length": 245.84375,
"epoch": 1.0704225352112675,
"grad_norm": 43.128273010253906,
"kl": 41.08760707080364,
"learning_rate": 4.681734377867562e-06,
"loss": 0.0411,
"reward": 21.750689268112183,
"reward_std": 2.0954109141603112,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5625,
"rewards/question_recreation_reward_func": 0.8656269088387489,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.4944375103805214,
"step": 76
},
{
"completion_length": 218.09375,
"epoch": 1.0985915492957747,
"grad_norm": 96.18486785888672,
"kl": 93.65772761404514,
"learning_rate": 4.661681924516466e-06,
"loss": 0.0937,
"reward": 22.785533666610718,
"reward_std": 4.547428795136511,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.7730333730578423,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5124999992549419,
"step": 78
},
{
"completion_length": 189.1875,
"epoch": 1.1267605633802817,
"grad_norm": 13.744595527648926,
"kl": 41.68859389424324,
"learning_rate": 4.641062607545347e-06,
"loss": 0.0417,
"reward": 19.403677821159363,
"reward_std": 8.246666595339775,
"rewards/concensus_correctness_reward_func": 15.735687494277954,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.5538652390241623,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.47350000962615013,
"step": 80
},
{
"completion_length": 241.0,
"epoch": 1.1549295774647887,
"grad_norm": 3.5648226737976074,
"kl": 11.036557964980602,
"learning_rate": 4.61988183401272e-06,
"loss": 0.011,
"reward": 20.90431860089302,
"reward_std": 3.5142957847565413,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.8119121938943863,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5142812412232161,
"step": 82
},
{
"completion_length": 207.3125,
"epoch": 1.1830985915492958,
"grad_norm": 10.732950210571289,
"kl": 52.33975350856781,
"learning_rate": 4.598145158209356e-06,
"loss": 0.0523,
"reward": 18.125176668167114,
"reward_std": 10.32908346131444,
"rewards/concensus_correctness_reward_func": 13.75,
"rewards/consensus_reward_func": 1.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.6875,
"rewards/question_recreation_reward_func": 0.6812697537243366,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5689062438905239,
"step": 84
},
{
"completion_length": 252.25,
"epoch": 1.2112676056338028,
"grad_norm": 23.430219650268555,
"kl": 31.533756278455257,
"learning_rate": 4.575858280201761e-06,
"loss": 0.0315,
"reward": 14.606143146753311,
"reward_std": 3.439208870753646,
"rewards/concensus_correctness_reward_func": 10.625,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.7599556222558022,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.34618750773370266,
"step": 86
},
{
"completion_length": 226.8125,
"epoch": 1.2394366197183098,
"grad_norm": 6.766458034515381,
"kl": 19.861644983291626,
"learning_rate": 4.5530270443374305e-06,
"loss": 0.0199,
"reward": 21.676340103149414,
"reward_std": 6.460458487272263,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.3125,
"rewards/question_recreation_reward_func": 0.6662770844995975,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.40068749710917473,
"step": 88
},
{
"completion_length": 206.5,
"epoch": 1.267605633802817,
"grad_norm": 14.366485595703125,
"kl": 24.39778110384941,
"learning_rate": 4.5296574377122765e-06,
"loss": 0.0244,
"reward": 23.023337364196777,
"reward_std": 4.633859112858772,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.75,
"rewards/question_recreation_reward_func": 0.6368377842009068,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6208749823272228,
"step": 90
},
{
"completion_length": 271.75,
"epoch": 1.295774647887324,
"grad_norm": 6.00218391418457,
"kl": 18.09646901488304,
"learning_rate": 4.505755588600613e-06,
"loss": 0.0181,
"reward": 22.362977981567383,
"reward_std": 3.771769030485302,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.6680713146924973,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.28865625197067857,
"step": 92
},
{
"completion_length": 221.40625,
"epoch": 1.323943661971831,
"grad_norm": 260.9346008300781,
"kl": 85.69054782390594,
"learning_rate": 4.481327764848118e-06,
"loss": 0.0857,
"reward": 21.98690915107727,
"reward_std": 6.104137388058007,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.6875,
"rewards/question_recreation_reward_func": 0.6172217763960361,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.4009375046007335,
"step": 94
},
{
"completion_length": 242.46875,
"epoch": 1.352112676056338,
"grad_norm": 3.6559009552001953,
"kl": 16.68967443704605,
"learning_rate": 4.456380372228208e-06,
"loss": 0.0167,
"reward": 20.938228607177734,
"reward_std": 3.3481120225042105,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.7461661994457245,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5201875008642673,
"step": 96
},
{
"completion_length": 245.59375,
"epoch": 1.380281690140845,
"grad_norm": 46.14374542236328,
"kl": 73.46418565511703,
"learning_rate": 4.430919952762226e-06,
"loss": 0.0735,
"reward": 19.775842905044556,
"reward_std": 7.6586533188819885,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.6588428560644388,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.38262501033023,
"step": 98
},
{
"completion_length": 151.8125,
"epoch": 1.408450704225352,
"grad_norm": 11.883987426757812,
"kl": 99.90218496322632,
"learning_rate": 4.404953183003916e-06,
"loss": 0.0999,
"reward": 20.656064450740814,
"reward_std": 5.514936912804842,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.6430017780512571,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5911874920129776,
"step": 100
},
{
"completion_length": 217.5625,
"epoch": 1.436619718309859,
"grad_norm": 48.88766098022461,
"kl": 109.4358594417572,
"learning_rate": 4.378486872288611e-06,
"loss": 0.1094,
"reward": 18.41537481546402,
"reward_std": 6.753879874944687,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.694999773055315,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5172499939799309,
"step": 102
},
{
"completion_length": 221.125,
"epoch": 1.4647887323943662,
"grad_norm": 46.86466598510742,
"kl": 40.41808983683586,
"learning_rate": 4.3515279609476e-06,
"loss": 0.0404,
"reward": 22.9285147190094,
"reward_std": 4.827527537941933,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.6875,
"rewards/question_recreation_reward_func": 0.7598273046314716,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.5124375112354755,
"step": 104
},
{
"completion_length": 188.25,
"epoch": 1.4929577464788732,
"grad_norm": 17.044279098510742,
"kl": 67.19291111826897,
"learning_rate": 4.324083518488151e-06,
"loss": 0.0672,
"reward": 19.629344820976257,
"reward_std": 8.029563069343567,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.5852201916277409,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5753749944269657,
"step": 106
},
{
"completion_length": 260.375,
"epoch": 1.5211267605633803,
"grad_norm": 3.7830276489257812,
"kl": 26.501030012965202,
"learning_rate": 4.296160741739652e-06,
"loss": 0.0265,
"reward": 21.420883417129517,
"reward_std": 2.4508758764714003,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.7897899299860001,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.5060937535017729,
"step": 108
},
{
"completion_length": 236.4375,
"epoch": 1.5492957746478875,
"grad_norm": 6.825228691101074,
"kl": 20.585920438170433,
"learning_rate": 4.267766952966369e-06,
"loss": 0.0206,
"reward": 20.2924707531929,
"reward_std": 4.762420322746038,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.679470956325531,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.5505000110715628,
"step": 110
},
{
"completion_length": 204.46875,
"epoch": 1.5774647887323945,
"grad_norm": 13.651640892028809,
"kl": 29.064430966973305,
"learning_rate": 4.238909597947307e-06,
"loss": 0.0291,
"reward": 20.76398205757141,
"reward_std": 3.667446758598089,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.7045135274529457,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5125937461853027,
"step": 112
},
{
"completion_length": 202.5625,
"epoch": 1.6056338028169015,
"grad_norm": 7.285125732421875,
"kl": 31.416974440217018,
"learning_rate": 4.2095962440236846e-06,
"loss": 0.0314,
"reward": 22.71331763267517,
"reward_std": 4.953943386673927,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.6919735632836819,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5838437452912331,
"step": 114
},
{
"completion_length": 202.8125,
"epoch": 1.6338028169014085,
"grad_norm": 13.096663475036621,
"kl": 27.956041753292084,
"learning_rate": 4.179834578114531e-06,
"loss": 0.028,
"reward": 20.11071002483368,
"reward_std": 4.602979902178049,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.625,
"rewards/question_recreation_reward_func": 0.7899289727210999,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.7895312570035458,
"step": 116
},
{
"completion_length": 230.1875,
"epoch": 1.6619718309859155,
"grad_norm": 6.774753570556641,
"kl": 9.80746340751648,
"learning_rate": 4.149632404700925e-06,
"loss": 0.0098,
"reward": 21.431988835334778,
"reward_std": 2.1543020214885473,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.7171135507524014,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5742499995976686,
"step": 118
},
{
"completion_length": 259.09375,
"epoch": 1.6901408450704225,
"grad_norm": 2.689204216003418,
"kl": 17.400323942303658,
"learning_rate": 4.118997643779401e-06,
"loss": 0.0174,
"reward": 22.90558958053589,
"reward_std": 4.9271611254662275,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.6875,
"rewards/question_recreation_reward_func": 0.7364025376737118,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.4504375047981739,
"step": 120
},
{
"completion_length": 247.6875,
"epoch": 1.7183098591549295,
"grad_norm": 28.990882873535156,
"kl": 22.44525107741356,
"learning_rate": 4.087938328785071e-06,
"loss": 0.0224,
"reward": 21.130041539669037,
"reward_std": 3.9246082678437233,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.7531667724251747,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.6425000079907477,
"step": 122
},
{
"completion_length": 214.5625,
"epoch": 1.7464788732394365,
"grad_norm": 17.202999114990234,
"kl": 93.57761958241463,
"learning_rate": 4.056462604484998e-06,
"loss": 0.0936,
"reward": 19.359357565641403,
"reward_std": 5.1793545708060265,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.3125,
"rewards/question_recreation_reward_func": 0.6580452099442482,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.5919375065714121,
"step": 124
},
{
"completion_length": 249.71875,
"epoch": 1.7746478873239435,
"grad_norm": 12.416897773742676,
"kl": 13.260371595621109,
"learning_rate": 4.0245787248423614e-06,
"loss": 0.0133,
"reward": 22.547950267791748,
"reward_std": 4.811736276373267,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.6626068912446499,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.44784374348819256,
"step": 126
},
{
"completion_length": 253.0,
"epoch": 1.8028169014084507,
"grad_norm": 13.114068031311035,
"kl": 28.459499150514603,
"learning_rate": 3.992295050852013e-06,
"loss": 0.0285,
"reward": 22.122711896896362,
"reward_std": 5.9889209773391485,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.6875,
"rewards/question_recreation_reward_func": 0.7654621824622154,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.3885000068694353,
"step": 128
},
{
"completion_length": 231.3125,
"epoch": 1.8309859154929577,
"grad_norm": 14.931551933288574,
"kl": 38.51357202231884,
"learning_rate": 3.959620048347938e-06,
"loss": 0.0385,
"reward": 19.09564107656479,
"reward_std": 5.354119001887739,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.686109896749258,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.140625,
"rewards/xmlcount_reward_func": 0.7064062356948853,
"step": 130
},
{
"completion_length": 224.78125,
"epoch": 1.8591549295774648,
"grad_norm": 168.16796875,
"kl": 134.45312885940075,
"learning_rate": 3.9265622857832455e-06,
"loss": 0.1345,
"reward": 19.73101794719696,
"reward_std": 4.9529455080628395,
"rewards/concensus_correctness_reward_func": 15.10393750667572,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5625,
"rewards/question_recreation_reward_func": 0.7391119040548801,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5598437488079071,
"step": 132
},
{
"completion_length": 191.3125,
"epoch": 1.887323943661972,
"grad_norm": 144.23934936523438,
"kl": 160.06939086318016,
"learning_rate": 3.893130431983234e-06,
"loss": 0.1601,
"reward": 20.553396463394165,
"reward_std": 7.810650005936623,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.6331466361880302,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5765000004321337,
"step": 134
},
{
"completion_length": 218.75,
"epoch": 1.915492957746479,
"grad_norm": 30.639406204223633,
"kl": 59.116331934928894,
"learning_rate": 3.8593332538721465e-06,
"loss": 0.0591,
"reward": 19.54647660255432,
"reward_std": 8.99451743811369,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.6372890621423721,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.4404374985024333,
"step": 136
},
{
"completion_length": 196.875,
"epoch": 1.943661971830986,
"grad_norm": 57.319923400878906,
"kl": 38.51234859973192,
"learning_rate": 3.825179614174195e-06,
"loss": 0.0385,
"reward": 21.283878982067108,
"reward_std": 2.0606489591300488,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.4702228233218193,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6105312407016754,
"step": 138
},
{
"completion_length": 212.4375,
"epoch": 1.971830985915493,
"grad_norm": 8.84663200378418,
"kl": 27.299983263015747,
"learning_rate": 3.790678469089465e-06,
"loss": 0.0273,
"reward": 14.866298139095306,
"reward_std": 6.414532793685794,
"rewards/concensus_correctness_reward_func": 11.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.7097668498754501,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.48465626407414675,
"step": 140
},
{
"completion_length": 183.71875,
"epoch": 2.0,
"grad_norm": 20.581981658935547,
"kl": 2941.267915993929,
"learning_rate": 3.7558388659453052e-06,
"loss": 2.9413,
"reward": 22.23113512992859,
"reward_std": 6.688113525509834,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5625,
"rewards/question_recreation_reward_func": 0.6908227056264877,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6340624932199717,
"step": 142
},
{
"completion_length": 229.9375,
"epoch": 2.028169014084507,
"grad_norm": 4.256288528442383,
"kl": 450.68155094981194,
"learning_rate": 3.720669940823827e-06,
"loss": 0.4507,
"reward": 19.878115504980087,
"reward_std": 4.9296186761930585,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.5736463665962219,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.47634374257177114,
"step": 144
},
{
"completion_length": 199.96875,
"epoch": 2.056338028169014,
"grad_norm": 134.83917236328125,
"kl": 53.38550490140915,
"learning_rate": 3.6851809161661206e-06,
"loss": 0.0534,
"reward": 23.20932626724243,
"reward_std": 4.892109964042902,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.75,
"rewards/question_recreation_reward_func": 0.7212322875857353,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.6912187561392784,
"step": 146
},
{
"completion_length": 171.8125,
"epoch": 2.084507042253521,
"grad_norm": 15.478096008300781,
"kl": 43.08566951751709,
"learning_rate": 3.649381098353834e-06,
"loss": 0.0431,
"reward": 21.71215844154358,
"reward_std": 6.466344892978668,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.46212736517190933,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.7031562626361847,
"step": 148
},
{
"completion_length": 158.28125,
"epoch": 2.112676056338028,
"grad_norm": 356.259033203125,
"kl": 216.86275094747543,
"learning_rate": 3.613279875268731e-06,
"loss": 0.2169,
"reward": 17.928013503551483,
"reward_std": 8.778308073699009,
"rewards/concensus_correctness_reward_func": 13.75,
"rewards/consensus_reward_func": 1.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.568356541916728,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.140625,
"rewards/xmlcount_reward_func": 0.7190312705934048,
"step": 150
},
{
"completion_length": 182.75,
"epoch": 2.140845070422535,
"grad_norm": 6.1676530838012695,
"kl": 86.0618257522583,
"learning_rate": 3.5768867138308872e-06,
"loss": 0.0861,
"reward": 16.761336520314217,
"reward_std": 5.004373461008072,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.5807425025850534,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5555937569588423,
"step": 152
},
{
"completion_length": 216.46875,
"epoch": 2.169014084507042,
"grad_norm": 7.545609474182129,
"kl": 17.122317761182785,
"learning_rate": 3.540211157516149e-06,
"loss": 0.0171,
"reward": 21.572845339775085,
"reward_std": 5.354643169790506,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.6150016514584422,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.45784375444054604,
"step": 154
},
{
"completion_length": 217.90625,
"epoch": 2.1971830985915495,
"grad_norm": 4.116415977478027,
"kl": 25.3076790869236,
"learning_rate": 3.503262823853527e-06,
"loss": 0.0253,
"reward": 21.371933221817017,
"reward_std": 7.111734602600336,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5625,
"rewards/question_recreation_reward_func": 0.6787459887564182,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5369375087320805,
"step": 156
},
{
"completion_length": 223.84375,
"epoch": 2.2253521126760565,
"grad_norm": 4.25111198425293,
"kl": 8.69906596839428,
"learning_rate": 3.466051401903162e-06,
"loss": 0.0087,
"reward": 19.322344303131104,
"reward_std": 0.9026554934680462,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.7088755983859301,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6447187531739473,
"step": 158
},
{
"completion_length": 241.375,
"epoch": 2.2535211267605635,
"grad_norm": 72.4315414428711,
"kl": 48.62941300868988,
"learning_rate": 3.428586649715542e-06,
"loss": 0.0486,
"reward": 21.37134912610054,
"reward_std": 2.193010773509741,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.6839431263506413,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.4686562530696392,
"step": 160
},
{
"completion_length": 214.46875,
"epoch": 2.2816901408450705,
"grad_norm": 26.874465942382812,
"kl": 46.34515926241875,
"learning_rate": 3.3908783917726123e-06,
"loss": 0.0463,
"reward": 18.52027067542076,
"reward_std": 6.44943779706955,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.5618643239140511,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.552156250923872,
"step": 162
},
{
"completion_length": 200.21875,
"epoch": 2.3098591549295775,
"grad_norm": 43.46574783325195,
"kl": 67.54259772598743,
"learning_rate": 3.3529365164114903e-06,
"loss": 0.0675,
"reward": 21.3968608379364,
"reward_std": 7.711536236514803,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.6364228781312704,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5885625015944242,
"step": 164
},
{
"completion_length": 193.875,
"epoch": 2.3380281690140845,
"grad_norm": 14.524393081665039,
"kl": 96.5325955748558,
"learning_rate": 3.314770973231408e-06,
"loss": 0.0965,
"reward": 18.271313101053238,
"reward_std": 7.526230916380882,
"rewards/concensus_correctness_reward_func": 14.480687499046326,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.4511258793063462,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5426250174641609,
"step": 166
},
{
"completion_length": 163.3125,
"epoch": 2.3661971830985915,
"grad_norm": 1158.546630859375,
"kl": 972.9655037075281,
"learning_rate": 3.276391770484606e-06,
"loss": 0.973,
"reward": 16.986168384552002,
"reward_std": 9.901727393269539,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.125,
"rewards/question_recreation_reward_func": 0.5950123034417629,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.140625,
"rewards/xmlcount_reward_func": 0.6880312561988831,
"step": 168
},
{
"completion_length": 199.28125,
"epoch": 2.3943661971830985,
"grad_norm": 15.329193115234375,
"kl": 43.722544223070145,
"learning_rate": 3.2378089724518464e-06,
"loss": 0.0437,
"reward": 21.943373203277588,
"reward_std": 5.313693807460368,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.7041860446333885,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.6454374976456165,
"step": 170
},
{
"completion_length": 197.15625,
"epoch": 2.4225352112676055,
"grad_norm": 13.730399131774902,
"kl": 15.106778889894485,
"learning_rate": 3.1990326968032225e-06,
"loss": 0.0151,
"reward": 22.867514848709106,
"reward_std": 4.801642283797264,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.6767030283808708,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.6751875132322311,
"step": 172
},
{
"completion_length": 169.75,
"epoch": 2.4507042253521125,
"grad_norm": 6.74544095993042,
"kl": 42.9027735888958,
"learning_rate": 3.160073111944983e-06,
"loss": 0.0429,
"reward": 17.841490387916565,
"reward_std": 7.046665458008647,
"rewards/concensus_correctness_reward_func": 13.75,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.5759594012051821,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6249062493443489,
"step": 174
},
{
"completion_length": 238.6875,
"epoch": 2.4788732394366195,
"grad_norm": 42.492393493652344,
"kl": 50.04497802257538,
"learning_rate": 3.1209404343530374e-06,
"loss": 0.05,
"reward": 19.691066712141037,
"reward_std": 3.8624095655977726,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.6070978939533234,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5058437511324883,
"step": 176
},
{
"completion_length": 221.40625,
"epoch": 2.507042253521127,
"grad_norm": 4.4059295654296875,
"kl": 15.536839783191681,
"learning_rate": 3.081644925893866e-06,
"loss": 0.0155,
"reward": 17.259127408266068,
"reward_std": 4.80191726796329,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.7241274192929268,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5037499945610762,
"step": 178
},
{
"completion_length": 220.0625,
"epoch": 2.535211267605634,
"grad_norm": 51.83406066894531,
"kl": 44.841193079948425,
"learning_rate": 3.0421968911335196e-06,
"loss": 0.0448,
"reward": 20.607652366161346,
"reward_std": 2.3559402879327536,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.3125,
"rewards/question_recreation_reward_func": 0.669371597468853,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5632812529802322,
"step": 180
},
{
"completion_length": 207.96875,
"epoch": 2.563380281690141,
"grad_norm": 20.143775939941406,
"kl": 44.13751931488514,
"learning_rate": 3.002606674635432e-06,
"loss": 0.0441,
"reward": 21.463626384735107,
"reward_std": 6.160396963357925,
"rewards/concensus_correctness_reward_func": 17.59318745136261,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.5963759236037731,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.4771875059232116,
"step": 182
},
{
"completion_length": 222.28125,
"epoch": 2.591549295774648,
"grad_norm": 11.883401870727539,
"kl": 20.277920335531235,
"learning_rate": 2.9628846582477305e-06,
"loss": 0.0203,
"reward": 19.031234979629517,
"reward_std": 6.665901035070419,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.3125,
"rewards/question_recreation_reward_func": 0.5251727998256683,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.3966874983161688,
"step": 184
},
{
"completion_length": 181.90625,
"epoch": 2.619718309859155,
"grad_norm": 8.755311965942383,
"kl": 25.77407142519951,
"learning_rate": 2.923041258380779e-06,
"loss": 0.0258,
"reward": 20.905356884002686,
"reward_std": 6.6025552824139595,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.5499190725386143,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5585625115782022,
"step": 186
},
{
"completion_length": 184.28125,
"epoch": 2.647887323943662,
"grad_norm": 16.596654891967773,
"kl": 47.27063727378845,
"learning_rate": 2.883086923275658e-06,
"loss": 0.0473,
"reward": 20.340314149856567,
"reward_std": 8.395816408097744,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.625,
"rewards/question_recreation_reward_func": 0.7232826687395573,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.6639062352478504,
"step": 188
},
{
"completion_length": 245.90625,
"epoch": 2.676056338028169,
"grad_norm": 7.123233318328857,
"kl": 11.423286348581314,
"learning_rate": 2.8430321302642887e-06,
"loss": 0.0114,
"reward": 24.368746280670166,
"reward_std": 2.0759390871971846,
"rewards/concensus_correctness_reward_func": 19.375,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.6875,
"rewards/question_recreation_reward_func": 0.6851216927170753,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5898749995976686,
"step": 190
},
{
"completion_length": 195.46875,
"epoch": 2.704225352112676,
"grad_norm": 18.82990074157715,
"kl": 58.429032266139984,
"learning_rate": 2.8028873830219373e-06,
"loss": 0.0584,
"reward": 16.25366249680519,
"reward_std": 5.261145170778036,
"rewards/concensus_correctness_reward_func": 12.5,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.5097565241158009,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5564062632620335,
"step": 192
},
{
"completion_length": 213.34375,
"epoch": 2.732394366197183,
"grad_norm": 14.168484687805176,
"kl": 110.85996335744858,
"learning_rate": 2.76266320881281e-06,
"loss": 0.1109,
"reward": 21.16573476791382,
"reward_std": 6.517005235888064,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.6182350441813469,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.6568750068545341,
"step": 194
},
{
"completion_length": 267.03125,
"epoch": 2.76056338028169,
"grad_norm": 18.253278732299805,
"kl": 138.1692279279232,
"learning_rate": 2.7223701557294574e-06,
"loss": 0.1382,
"reward": 18.324547559022903,
"reward_std": 6.471590518951416,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.6808598078787327,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.48743749409914017,
"step": 196
},
{
"completion_length": 204.375,
"epoch": 2.788732394366197,
"grad_norm": 15.189406394958496,
"kl": 26.855069160461426,
"learning_rate": 2.6820187899267203e-06,
"loss": 0.0269,
"reward": 19.546295881271362,
"reward_std": 8.134475693106651,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.5997647196054459,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.44653125666081905,
"step": 198
},
{
"completion_length": 160.71875,
"epoch": 2.816901408450704,
"grad_norm": 8.699529647827148,
"kl": 49.28822618722916,
"learning_rate": 2.641619692850941e-06,
"loss": 0.0493,
"reward": 19.428176522254944,
"reward_std": 7.277277044951916,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.6063638776540756,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.6811874993145466,
"step": 200
},
{
"completion_length": 200.40625,
"epoch": 2.845070422535211,
"grad_norm": 19.380477905273438,
"kl": 95.13921695947647,
"learning_rate": 2.6011834584651597e-06,
"loss": 0.0951,
"reward": 18.081178903579712,
"reward_std": 8.80796305835247,
"rewards/concensus_correctness_reward_func": 14.467000007629395,
"rewards/consensus_reward_func": 1.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.7127412371337414,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.47956248791888356,
"step": 202
},
{
"completion_length": 185.75,
"epoch": 2.873239436619718,
"grad_norm": 21.92093276977539,
"kl": 70.87698519229889,
"learning_rate": 2.560720690471033e-06,
"loss": 0.0709,
"reward": 19.574060678482056,
"reward_std": 7.992755997925997,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.6473107412457466,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6611249968409538,
"step": 204
},
{
"completion_length": 197.0,
"epoch": 2.9014084507042255,
"grad_norm": 38.08381271362305,
"kl": 111.72683045268059,
"learning_rate": 2.5202419995281966e-06,
"loss": 0.1117,
"reward": 17.2239950299263,
"reward_std": 7.70870977640152,
"rewards/concensus_correctness_reward_func": 13.75,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.6894014775753021,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.48771873861551285,
"step": 206
},
{
"completion_length": 221.0,
"epoch": 2.9295774647887325,
"grad_norm": 12.506996154785156,
"kl": 839.4179282784462,
"learning_rate": 2.4797580004718038e-06,
"loss": 0.8394,
"reward": 19.14959552884102,
"reward_std": 6.2375103905797005,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.125,
"rewards/question_recreation_reward_func": 0.7049080766737461,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.507187508046627,
"step": 208
},
{
"completion_length": 184.46875,
"epoch": 2.9577464788732395,
"grad_norm": 33.34077453613281,
"kl": 28.567565202713013,
"learning_rate": 2.4392793095289677e-06,
"loss": 0.0286,
"reward": 20.221239745616913,
"reward_std": 3.982272831723094,
"rewards/concensus_correctness_reward_func": 15.729062557220459,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.6523019410669804,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6680000089108944,
"step": 210
},
{
"completion_length": 199.375,
"epoch": 2.9859154929577465,
"grad_norm": 15.138080596923828,
"kl": 101.21163132786751,
"learning_rate": 2.3988165415348416e-06,
"loss": 0.1012,
"reward": 18.773864269256592,
"reward_std": 7.821951035410166,
"rewards/concensus_correctness_reward_func": 15.10393750667572,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.5722080431878567,
"rewards/soft_format_reward_func": 0.0625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5039687566459179,
"step": 212
},
{
"completion_length": 200.09375,
"epoch": 3.0140845070422535,
"grad_norm": 27.673036575317383,
"kl": 81.34071454405785,
"learning_rate": 2.358380307149059e-06,
"loss": 0.0813,
"reward": 19.80060565471649,
"reward_std": 6.877108983695507,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.6550744473934174,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.5830312483012676,
"step": 214
},
{
"completion_length": 197.53125,
"epoch": 3.0422535211267605,
"grad_norm": 14.81261157989502,
"kl": 53.70039749145508,
"learning_rate": 2.31798121007328e-06,
"loss": 0.0537,
"reward": 20.988556504249573,
"reward_std": 6.804740943014622,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.5584000945091248,
"rewards/soft_format_reward_func": 0.046875,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5082812439650297,
"step": 216
},
{
"completion_length": 198.4375,
"epoch": 3.0704225352112675,
"grad_norm": 10.039848327636719,
"kl": 47.19896852970123,
"learning_rate": 2.2776298442705434e-06,
"loss": 0.0472,
"reward": 20.60348665714264,
"reward_std": 8.125480651855469,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.4375,
"rewards/question_recreation_reward_func": 0.7180799320340157,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5104062519967556,
"step": 218
},
{
"completion_length": 177.53125,
"epoch": 3.0985915492957745,
"grad_norm": 16.117830276489258,
"kl": 72.61402860283852,
"learning_rate": 2.2373367911871904e-06,
"loss": 0.0726,
"reward": 19.261219561100006,
"reward_std": 4.884680893737823,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.522126174531877,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.6140937507152557,
"step": 220
},
{
"completion_length": 191.625,
"epoch": 3.1267605633802815,
"grad_norm": 17.173952102661133,
"kl": 478.0856115221977,
"learning_rate": 2.1971126169780636e-06,
"loss": 0.4781,
"reward": 16.67675158381462,
"reward_std": 7.77602906152606,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.641875833272934,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.31612499337643385,
"step": 222
},
{
"completion_length": 237.59375,
"epoch": 3.1549295774647885,
"grad_norm": 10.077739715576172,
"kl": 40.82688173651695,
"learning_rate": 2.1569678697357126e-06,
"loss": 0.0408,
"reward": 20.142229437828064,
"reward_std": 3.685103869996965,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.561760637909174,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.43984376545995474,
"step": 224
},
{
"completion_length": 199.59375,
"epoch": 3.183098591549296,
"grad_norm": 5.315944671630859,
"kl": 17.577760875225067,
"learning_rate": 2.1169130767243424e-06,
"loss": 0.0176,
"reward": 18.28103494644165,
"reward_std": 7.611427519470453,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.6480660997331142,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5704687498509884,
"step": 226
},
{
"completion_length": 185.5625,
"epoch": 3.211267605633803,
"grad_norm": 8.0359525680542,
"kl": 58.93609178066254,
"learning_rate": 2.0769587416192212e-06,
"loss": 0.0589,
"reward": 20.35518643260002,
"reward_std": 3.533026445657015,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.527998685836792,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.4990624990314245,
"step": 228
},
{
"completion_length": 189.3125,
"epoch": 3.23943661971831,
"grad_norm": 26.13204574584961,
"kl": 53.234640538692474,
"learning_rate": 2.0371153417522703e-06,
"loss": 0.0532,
"reward": 18.698314785957336,
"reward_std": 9.453598627820611,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.125,
"rewards/question_recreation_reward_func": 0.4973145886324346,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5291249938309193,
"step": 230
},
{
"completion_length": 190.59375,
"epoch": 3.267605633802817,
"grad_norm": 13.507369995117188,
"kl": 39.84981770813465,
"learning_rate": 1.9973933253645684e-06,
"loss": 0.0398,
"reward": 19.292374074459076,
"reward_std": 6.246133454144001,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.6897490993142128,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.6026250012218952,
"step": 232
},
{
"completion_length": 201.1875,
"epoch": 3.295774647887324,
"grad_norm": 8.261385917663574,
"kl": 47.00834572315216,
"learning_rate": 1.9578031088664812e-06,
"loss": 0.047,
"reward": 18.45159488916397,
"reward_std": 7.484951298683882,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.659157432615757,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6361875031143427,
"step": 234
},
{
"completion_length": 202.21875,
"epoch": 3.323943661971831,
"grad_norm": 38.87091064453125,
"kl": 38.5699297785759,
"learning_rate": 1.9183550741061354e-06,
"loss": 0.0386,
"reward": 19.85414829850197,
"reward_std": 2.194763625971973,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.5745858605951071,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5295624881982803,
"step": 236
},
{
"completion_length": 181.5625,
"epoch": 3.352112676056338,
"grad_norm": 3900.5302734375,
"kl": 2038.339742898941,
"learning_rate": 1.8790595656469628e-06,
"loss": 2.0383,
"reward": 16.795432448387146,
"reward_std": 7.932508982717991,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.5185572430491447,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.140625,
"rewards/xmlcount_reward_func": 0.6987500065006316,
"step": 238
},
{
"completion_length": 212.875,
"epoch": 3.380281690140845,
"grad_norm": 15.849955558776855,
"kl": 66.65974473953247,
"learning_rate": 1.8399268880550174e-06,
"loss": 0.0667,
"reward": 21.17803716659546,
"reward_std": 6.859333042055368,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.7103495234623551,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.5145624913275242,
"step": 240
},
{
"completion_length": 226.53125,
"epoch": 3.408450704225352,
"grad_norm": 15.772849082946777,
"kl": 89.09097853302956,
"learning_rate": 1.800967303196778e-06,
"loss": 0.0891,
"reward": 22.494590759277344,
"reward_std": 5.302706576883793,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.668371744453907,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.5762187307700515,
"step": 242
},
{
"completion_length": 164.46875,
"epoch": 3.436619718309859,
"grad_norm": 71.73149108886719,
"kl": 298.8348106145859,
"learning_rate": 1.7621910275481544e-06,
"loss": 0.2988,
"reward": 18.83104568719864,
"reward_std": 7.157097928225994,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.5665771998465061,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.514468751847744,
"step": 244
},
{
"completion_length": 176.375,
"epoch": 3.464788732394366,
"grad_norm": 8.527463912963867,
"kl": 49.205901980400085,
"learning_rate": 1.7236082295153948e-06,
"loss": 0.0492,
"reward": 16.045627415180206,
"reward_std": 9.77073048055172,
"rewards/concensus_correctness_reward_func": 12.5,
"rewards/consensus_reward_func": 1.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.5426900889724493,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5498124919831753,
"step": 246
},
{
"completion_length": 172.0625,
"epoch": 3.492957746478873,
"grad_norm": 17.22605323791504,
"kl": 81.85796847939491,
"learning_rate": 1.685229026768593e-06,
"loss": 0.0819,
"reward": 21.084633946418762,
"reward_std": 5.291271213442087,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.5807589311152697,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5195000041276217,
"step": 248
},
{
"completion_length": 201.03125,
"epoch": 3.52112676056338,
"grad_norm": 11.301921844482422,
"kl": 69.70909404754639,
"learning_rate": 1.6470634835885097e-06,
"loss": 0.0697,
"reward": 14.430978834629059,
"reward_std": 11.139893352985382,
"rewards/concensus_correctness_reward_func": 11.25,
"rewards/consensus_reward_func": 1.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.6216037422418594,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3093750001862645,
"step": 250
},
{
"completion_length": 175.59375,
"epoch": 3.5492957746478875,
"grad_norm": 10.406546592712402,
"kl": 58.96156430244446,
"learning_rate": 1.6091216082273875e-06,
"loss": 0.059,
"reward": 20.337380409240723,
"reward_std": 7.825136856175959,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.5557556711137295,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5941249951720238,
"step": 252
},
{
"completion_length": 198.15625,
"epoch": 3.5774647887323945,
"grad_norm": 9.735517501831055,
"kl": 53.85496670007706,
"learning_rate": 1.5714133502844591e-06,
"loss": 0.0539,
"reward": 19.757203698158264,
"reward_std": 6.970965705811977,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.45339070353657007,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.678812500089407,
"step": 254
},
{
"completion_length": 203.59375,
"epoch": 3.6056338028169015,
"grad_norm": 21.339231491088867,
"kl": 40.247629791498184,
"learning_rate": 1.5339485980968383e-06,
"loss": 0.0402,
"reward": 17.9938845038414,
"reward_std": 5.043777231127024,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.6552903726696968,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.4323437474668026,
"step": 256
},
{
"completion_length": 195.59375,
"epoch": 3.6338028169014085,
"grad_norm": 14.654716491699219,
"kl": 22.20736539363861,
"learning_rate": 1.4967371761464738e-06,
"loss": 0.0222,
"reward": 22.278719663619995,
"reward_std": 3.614749798551202,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.125,
"rewards/question_recreation_reward_func": 0.6758448034524918,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5403749942779541,
"step": 258
},
{
"completion_length": 190.34375,
"epoch": 3.6619718309859155,
"grad_norm": 29.445497512817383,
"kl": 109.69821217656136,
"learning_rate": 1.4597888424838519e-06,
"loss": 0.1097,
"reward": 19.653862476348877,
"reward_std": 7.802595116198063,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.5701128906803206,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.33375000255182385,
"step": 260
},
{
"completion_length": 218.8125,
"epoch": 3.6901408450704225,
"grad_norm": 9.643898010253906,
"kl": 33.315061807632446,
"learning_rate": 1.4231132861691128e-06,
"loss": 0.0333,
"reward": 21.43911099433899,
"reward_std": 6.217341110110283,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.6770484782755375,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.4339374974370003,
"step": 262
},
{
"completion_length": 216.78125,
"epoch": 3.7183098591549295,
"grad_norm": 14.2260103225708,
"kl": 37.71351116895676,
"learning_rate": 1.3867201247312697e-06,
"loss": 0.0377,
"reward": 19.555387258529663,
"reward_std": 7.710960239171982,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.5077626127749681,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.4069999912753701,
"step": 264
},
{
"completion_length": 194.875,
"epoch": 3.7464788732394365,
"grad_norm": 9.09065055847168,
"kl": 14.421021282672882,
"learning_rate": 1.3506189016461674e-06,
"loss": 0.0144,
"reward": 20.84186053276062,
"reward_std": 4.1744231805205345,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.25,
"rewards/question_recreation_reward_func": 0.6275481916964054,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.6674374938011169,
"step": 266
},
{
"completion_length": 217.53125,
"epoch": 3.7746478873239435,
"grad_norm": 5.975257873535156,
"kl": 37.89040416479111,
"learning_rate": 1.3148190838338804e-06,
"loss": 0.0379,
"reward": 19.20604932308197,
"reward_std": 5.332434967160225,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.125,
"rewards/question_recreation_reward_func": 0.6709554120898247,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5350937426555902,
"step": 268
},
{
"completion_length": 181.125,
"epoch": 3.802816901408451,
"grad_norm": 30.6917667388916,
"kl": 1376.993093073368,
"learning_rate": 1.2793300591761742e-06,
"loss": 1.377,
"reward": 18.036881029605865,
"reward_std": 6.742033764719963,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.5031619630753994,
"rewards/soft_format_reward_func": 0.046875,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.3930937433615327,
"step": 270
},
{
"completion_length": 176.6875,
"epoch": 3.830985915492958,
"grad_norm": 20.22711944580078,
"kl": 30287.698419213295,
"learning_rate": 1.2441611340546958e-06,
"loss": 30.2877,
"reward": 16.03737948834896,
"reward_std": 5.3479601461440325,
"rewards/concensus_correctness_reward_func": 12.5,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.5265668611973524,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5420624995604157,
"step": 272
},
{
"completion_length": 200.90625,
"epoch": 3.859154929577465,
"grad_norm": 39.15205764770508,
"kl": 100.48482239246368,
"learning_rate": 1.2093215309105352e-06,
"loss": 0.1005,
"reward": 19.245975971221924,
"reward_std": 9.297940351068974,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.4970390796661377,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.48331249598413706,
"step": 274
},
{
"completion_length": 183.25,
"epoch": 3.887323943661972,
"grad_norm": 22.197479248046875,
"kl": 61.005871653556824,
"learning_rate": 1.1748203858258056e-06,
"loss": 0.061,
"reward": 17.168922126293182,
"reward_std": 7.946846023201942,
"rewards/concensus_correctness_reward_func": 13.75,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.4897347055375576,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.47606249898672104,
"step": 276
},
{
"completion_length": 202.65625,
"epoch": 3.915492957746479,
"grad_norm": 11.439094543457031,
"kl": 89.05792760848999,
"learning_rate": 1.140666746127854e-06,
"loss": 0.0891,
"reward": 19.30165010690689,
"reward_std": 5.133161583915353,
"rewards/concensus_correctness_reward_func": 15.103812456130981,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.5,
"rewards/question_recreation_reward_func": 0.5337746478617191,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5859374962747097,
"step": 278
},
{
"completion_length": 216.40625,
"epoch": 3.943661971830986,
"grad_norm": 27.769229888916016,
"kl": 141.26176762580872,
"learning_rate": 1.1068695680167665e-06,
"loss": 0.1413,
"reward": 18.381448954343796,
"reward_std": 6.32744001224637,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.41957394033670425,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.43062499538064003,
"step": 280
},
{
"completion_length": 203.59375,
"epoch": 3.971830985915493,
"grad_norm": 25.721193313598633,
"kl": 42.546354830265045,
"learning_rate": 1.0734377142167549e-06,
"loss": 0.0425,
"reward": 17.76998621225357,
"reward_std": 6.668866345658898,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.585392065346241,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.49709375388920307,
"step": 282
},
{
"completion_length": 177.125,
"epoch": 4.0,
"grad_norm": 15.914278984069824,
"kl": 69.96249252557755,
"learning_rate": 1.0403799516520619e-06,
"loss": 0.07,
"reward": 18.440218448638916,
"reward_std": 6.883068062365055,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.6001559719443321,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.48068750463426113,
"step": 284
},
{
"completion_length": 191.46875,
"epoch": 4.028169014084507,
"grad_norm": 18.65995979309082,
"kl": 66.47544574737549,
"learning_rate": 1.0077049491479874e-06,
"loss": 0.0665,
"reward": 22.233580589294434,
"reward_std": 4.90995267778635,
"rewards/concensus_correctness_reward_func": 18.125,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.5808308683335781,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5589999947696924,
"step": 286
},
{
"completion_length": 167.0,
"epoch": 4.056338028169014,
"grad_norm": 21.48002815246582,
"kl": 208.1701105237007,
"learning_rate": 9.754212751576386e-07,
"loss": 0.2082,
"reward": 16.22745645046234,
"reward_std": 10.876746878027916,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.6086752116680145,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.49378124438226223,
"step": 288
},
{
"completion_length": 214.75,
"epoch": 4.084507042253521,
"grad_norm": 12.544981002807617,
"kl": 36.315159887075424,
"learning_rate": 9.435373955150032e-07,
"loss": 0.0363,
"reward": 21.26410162448883,
"reward_std": 5.238973140716553,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.6026639565825462,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5051874946802855,
"step": 290
},
{
"completion_length": 178.8125,
"epoch": 4.112676056338028,
"grad_norm": 145.86680603027344,
"kl": 139.64250326156616,
"learning_rate": 9.120616712149291e-07,
"loss": 0.1396,
"reward": 18.312809228897095,
"reward_std": 10.484295897185802,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.5245908284559846,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.49134374037384987,
"step": 292
},
{
"completion_length": 190.4375,
"epoch": 4.140845070422535,
"grad_norm": 18.74555778503418,
"kl": 50.41434487700462,
"learning_rate": 8.810023562206e-07,
"loss": 0.0504,
"reward": 17.838220357894897,
"reward_std": 9.871636435389519,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.5950643494725227,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.39940624311566353,
"step": 294
},
{
"completion_length": 205.09375,
"epoch": 4.169014084507042,
"grad_norm": 15.046422004699707,
"kl": 29.03540551662445,
"learning_rate": 8.503675952990756e-07,
"loss": 0.029,
"reward": 21.843222498893738,
"reward_std": 5.1342647187411785,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.375,
"rewards/question_recreation_reward_func": 0.6643783301115036,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5069687478244305,
"step": 296
},
{
"completion_length": 165.78125,
"epoch": 4.197183098591549,
"grad_norm": 17.604740142822266,
"kl": 51.230829417705536,
"learning_rate": 8.20165421885469e-07,
"loss": 0.0512,
"reward": 17.63426423072815,
"reward_std": 6.369495037943125,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.4940767101943493,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.43706249073147774,
"step": 298
},
{
"completion_length": 235.375,
"epoch": 4.225352112676056,
"grad_norm": 6.080423355102539,
"kl": 13.085180699825287,
"learning_rate": 7.904037559763162e-07,
"loss": 0.0131,
"reward": 19.109760493040085,
"reward_std": 1.1591962296515703,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.6586043164134026,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5605312576517463,
"step": 300
},
{
"completion_length": 194.9375,
"epoch": 4.253521126760563,
"grad_norm": 11.369367599487305,
"kl": 44.47002148628235,
"learning_rate": 7.610904020526938e-07,
"loss": 0.0445,
"reward": 16.67498344182968,
"reward_std": 9.237527802586555,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.5777959898114204,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.6753124929964542,
"step": 302
},
{
"completion_length": 189.65625,
"epoch": 4.28169014084507,
"grad_norm": 21.256389617919922,
"kl": 71.40908312797546,
"learning_rate": 7.322330470336314e-07,
"loss": 0.0714,
"reward": 20.69408369064331,
"reward_std": 6.768543675541878,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.6492089293897152,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.49800000712275505,
"step": 304
},
{
"completion_length": 195.875,
"epoch": 4.309859154929577,
"grad_norm": 7.1469597816467285,
"kl": 33.71372902393341,
"learning_rate": 7.038392582603481e-07,
"loss": 0.0337,
"reward": 20.510346174240112,
"reward_std": 6.731610176153481,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.5321269854903221,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.4938437454402447,
"step": 306
},
{
"completion_length": 177.1875,
"epoch": 4.338028169014084,
"grad_norm": 292.14501953125,
"kl": 235.51378059387207,
"learning_rate": 6.759164815118493e-07,
"loss": 0.2355,
"reward": 16.430650651454926,
"reward_std": 5.297965854406357,
"rewards/concensus_correctness_reward_func": 12.5,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.616213109344244,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.6581875011324883,
"step": 308
},
{
"completion_length": 216.96875,
"epoch": 4.366197183098592,
"grad_norm": 6.305967807769775,
"kl": 341.7059046626091,
"learning_rate": 6.484720390524008e-07,
"loss": 0.3417,
"reward": 21.65046501159668,
"reward_std": 6.0755148604512215,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.5807776674628258,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5540624931454659,
"step": 310
},
{
"completion_length": 173.625,
"epoch": 4.394366197183099,
"grad_norm": 102.45231628417969,
"kl": 167.21936225891113,
"learning_rate": 6.2151312771139e-07,
"loss": 0.1672,
"reward": 18.467550039291382,
"reward_std": 6.471917539834976,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.3949254211038351,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.5257500112056732,
"step": 312
},
{
"completion_length": 151.34375,
"epoch": 4.422535211267606,
"grad_norm": 75.45765686035156,
"kl": 130.43919348716736,
"learning_rate": 5.950468169960846e-07,
"loss": 0.1304,
"reward": 17.570589900016785,
"reward_std": 9.95434544980526,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.4281215965747833,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.579968761652708,
"step": 314
},
{
"completion_length": 194.28125,
"epoch": 4.450704225352113,
"grad_norm": 150.11294555664062,
"kl": 140.0380249619484,
"learning_rate": 5.690800472377747e-07,
"loss": 0.14,
"reward": 19.77779531478882,
"reward_std": 8.302450500428677,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.48335786536335945,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.4975624978542328,
"step": 316
},
{
"completion_length": 175.59375,
"epoch": 4.47887323943662,
"grad_norm": 18.749839782714844,
"kl": 35.669027864933014,
"learning_rate": 5.436196277717928e-07,
"loss": 0.0357,
"reward": 20.80094337463379,
"reward_std": 6.281726138666272,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.5344437230378389,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5477499924600124,
"step": 318
},
{
"completion_length": 250.53125,
"epoch": 4.507042253521127,
"grad_norm": 20.69882583618164,
"kl": 60.95956812798977,
"learning_rate": 5.186722351518822e-07,
"loss": 0.061,
"reward": 16.334015995264053,
"reward_std": 5.083772074431181,
"rewards/concensus_correctness_reward_func": 12.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.6849223002791405,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.6334687490016222,
"step": 320
},
{
"completion_length": 163.90625,
"epoch": 4.535211267605634,
"grad_norm": 25.738649368286133,
"kl": 61.802970230579376,
"learning_rate": 4.94244411399388e-07,
"loss": 0.0618,
"reward": 21.02661967277527,
"reward_std": 6.551919437944889,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.497026052325964,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.592093750834465,
"step": 322
},
{
"completion_length": 164.8125,
"epoch": 4.563380281690141,
"grad_norm": 256.33447265625,
"kl": 241.42089343070984,
"learning_rate": 4.703425622877239e-07,
"loss": 0.2414,
"reward": 17.63269305229187,
"reward_std": 10.951422438025475,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.450318006798625,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.6042499914765358,
"step": 324
},
{
"completion_length": 165.125,
"epoch": 4.591549295774648,
"grad_norm": 31.652605056762695,
"kl": 43.60177397727966,
"learning_rate": 4.469729556625704e-07,
"loss": 0.0436,
"reward": 19.482061743736267,
"reward_std": 8.377652376890182,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.125,
"rewards/question_recreation_reward_func": 0.5043744444847107,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.6183125004172325,
"step": 326
},
{
"completion_length": 175.96875,
"epoch": 4.619718309859155,
"grad_norm": 10.242480278015137,
"kl": 21.965113878250122,
"learning_rate": 4.2414171979824e-07,
"loss": 0.022,
"reward": 20.588115096092224,
"reward_std": 6.254963330924511,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.47283417731523514,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5996562596410513,
"step": 328
},
{
"completion_length": 206.90625,
"epoch": 4.647887323943662,
"grad_norm": 114.71504211425781,
"kl": 82.849600315094,
"learning_rate": 4.0185484179064427e-07,
"loss": 0.0828,
"reward": 16.668977200984955,
"reward_std": 6.640440072864294,
"rewards/concensus_correctness_reward_func": 13.75,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.6327583584934473,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.3799687549471855,
"step": 330
},
{
"completion_length": 193.78125,
"epoch": 4.676056338028169,
"grad_norm": 58.762516021728516,
"kl": 94.41752421855927,
"learning_rate": 3.801181659872805e-07,
"loss": 0.0944,
"reward": 17.93424743413925,
"reward_std": 7.9948363825678825,
"rewards/concensus_correctness_reward_func": 13.75,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.615435041487217,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.6625625044107437,
"step": 332
},
{
"completion_length": 205.65625,
"epoch": 4.704225352112676,
"grad_norm": 16.824317932128906,
"kl": 88.85696315765381,
"learning_rate": 3.5893739245465465e-07,
"loss": 0.0889,
"reward": 13.142742186784744,
"reward_std": 8.422066152095795,
"rewards/concensus_correctness_reward_func": 10.0,
"rewards/consensus_reward_func": 1.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.5604296084493399,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.551062498241663,
"step": 334
},
{
"completion_length": 191.28125,
"epoch": 4.732394366197183,
"grad_norm": 28.791719436645508,
"kl": 14.142897069454193,
"learning_rate": 3.383180754835344e-07,
"loss": 0.0141,
"reward": 15.803372830152512,
"reward_std": 6.4000239027664065,
"rewards/concensus_correctness_reward_func": 12.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.3909041713923216,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.4905937425792217,
"step": 336
},
{
"completion_length": 184.5625,
"epoch": 4.76056338028169,
"grad_norm": 16.70457649230957,
"kl": 52.15749150514603,
"learning_rate": 3.182656221324384e-07,
"loss": 0.0522,
"reward": 20.181472659111023,
"reward_std": 5.007015394046903,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.125,
"rewards/question_recreation_reward_func": 0.5429101679474115,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.6073125060647726,
"step": 338
},
{
"completion_length": 194.09375,
"epoch": 4.788732394366197,
"grad_norm": 9.146194458007812,
"kl": 70.11677631735802,
"learning_rate": 2.98785290809723e-07,
"loss": 0.0701,
"reward": 18.03154420852661,
"reward_std": 6.602232605218887,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.5392944887280464,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5547500140964985,
"step": 340
},
{
"completion_length": 201.0625,
"epoch": 4.816901408450704,
"grad_norm": 10.930744171142578,
"kl": 49.3902553319931,
"learning_rate": 2.798821898946588e-07,
"loss": 0.0494,
"reward": 17.96576575934887,
"reward_std": 6.204115567728877,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.875,
"rewards/question_recreation_reward_func": 0.5690469006076455,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.5685937367379665,
"step": 342
},
{
"completion_length": 183.75,
"epoch": 4.845070422535211,
"grad_norm": 11.14244556427002,
"kl": 530.0989896059036,
"learning_rate": 2.615612763978462e-07,
"loss": 0.5301,
"reward": 20.103400349617004,
"reward_std": 8.350226640701294,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.5339003503322601,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.109375,
"rewards/xmlcount_reward_func": 0.647624995559454,
"step": 344
},
{
"completion_length": 188.6875,
"epoch": 4.873239436619718,
"grad_norm": 77.03536987304688,
"kl": 119.32902491092682,
"learning_rate": 2.438273546613257e-07,
"loss": 0.1193,
"reward": 15.51081308722496,
"reward_std": 6.711298692971468,
"rewards/concensus_correctness_reward_func": 11.875,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.6048446670174599,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5622187480330467,
"step": 346
},
{
"completion_length": 173.5,
"epoch": 4.901408450704225,
"grad_norm": 10.03520679473877,
"kl": 680.569248855114,
"learning_rate": 2.2668507509871957e-07,
"loss": 0.6806,
"reward": 20.39792001247406,
"reward_std": 7.9413245394825935,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.6092325560748577,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.6480624973773956,
"step": 348
},
{
"completion_length": 211.9375,
"epoch": 4.929577464788732,
"grad_norm": 9.544289588928223,
"kl": 67.18813559412956,
"learning_rate": 2.1013893297574777e-07,
"loss": 0.0672,
"reward": 18.139901995658875,
"reward_std": 7.58577387034893,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.5312769636511803,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.48362499848008156,
"step": 350
},
{
"completion_length": 158.875,
"epoch": 4.957746478873239,
"grad_norm": 1220.3043212890625,
"kl": 410.9177169203758,
"learning_rate": 1.9419326723141534e-07,
"loss": 0.4109,
"reward": 20.147993981838226,
"reward_std": 6.549997612833977,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.4576812032610178,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.5496875047683716,
"step": 352
},
{
"completion_length": 189.53125,
"epoch": 4.985915492957746,
"grad_norm": 107952.3671875,
"kl": 50374.548123419285,
"learning_rate": 1.788522593402059e-07,
"loss": 50.3746,
"reward": 19.29557180404663,
"reward_std": 9.341163873672485,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.4647905360907316,
"rewards/soft_format_reward_func": 0.046875,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.47140624839812517,
"step": 354
},
{
"completion_length": 173.28125,
"epoch": 5.014084507042254,
"grad_norm": 20.18059730529785,
"kl": 56.19126904010773,
"learning_rate": 1.6411993221555928e-07,
"loss": 0.0562,
"reward": 18.442645728588104,
"reward_std": 7.794232741463929,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.4293326549232006,
"rewards/soft_format_reward_func": 0.03125,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.5133124887943268,
"step": 356
},
{
"completion_length": 197.28125,
"epoch": 5.042253521126761,
"grad_norm": 98472.9296875,
"kl": 42849.533732414246,
"learning_rate": 1.5000014915493467e-07,
"loss": 42.8495,
"reward": 18.569206684827805,
"reward_std": 6.2984634116292,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.6072378233075142,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.49321874510496855,
"step": 358
},
{
"completion_length": 180.75,
"epoch": 5.070422535211268,
"grad_norm": 17.634912490844727,
"kl": 57.633513152599335,
"learning_rate": 1.3649661282672478e-07,
"loss": 0.0576,
"reward": 16.21313813328743,
"reward_std": 6.453056633472443,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.5248567461967468,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.46953125298023224,
"step": 360
},
{
"completion_length": 181.9375,
"epoch": 5.098591549295775,
"grad_norm": 6.022618293762207,
"kl": 55.502021461725235,
"learning_rate": 1.2361286429929953e-07,
"loss": 0.0555,
"reward": 21.646337032318115,
"reward_std": 4.948778457939625,
"rewards/concensus_correctness_reward_func": 17.5,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.1875,
"rewards/question_recreation_reward_func": 0.500399325042963,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.6146875005215406,
"step": 362
},
{
"completion_length": 218.21875,
"epoch": 5.126760563380282,
"grad_norm": 24.87447166442871,
"kl": 70.74051466584206,
"learning_rate": 1.1135228211241827e-07,
"loss": 0.0707,
"reward": 14.870485126972198,
"reward_std": 8.201804894953966,
"rewards/concensus_correctness_reward_func": 11.875,
"rewards/consensus_reward_func": 1.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.6112355031073093,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.4311249917373061,
"step": 364
},
{
"completion_length": 200.875,
"epoch": 5.154929577464789,
"grad_norm": 17.180267333984375,
"kl": 47.48751229047775,
"learning_rate": 9.97180813912682e-08,
"loss": 0.0475,
"reward": 16.328749358654022,
"reward_std": 8.015842709690332,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.43224936723709106,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5215000063180923,
"step": 366
},
{
"completion_length": 160.96875,
"epoch": 5.183098591549296,
"grad_norm": 22.877119064331055,
"kl": 52.39796483516693,
"learning_rate": 8.871331300335322e-08,
"loss": 0.0524,
"reward": 20.07635807991028,
"reward_std": 5.351797789335251,
"rewards/concensus_correctness_reward_func": 16.875,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.4254519008100033,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0,
"rewards/xmlcount_reward_func": 0.5259062610566616,
"step": 368
},
{
"completion_length": 187.78125,
"epoch": 5.211267605633803,
"grad_norm": 13.897777557373047,
"kl": 37.57750755548477,
"learning_rate": 7.834086275845587e-08,
"loss": 0.0376,
"reward": 18.33556878566742,
"reward_std": 9.281399443745613,
"rewards/concensus_correctness_reward_func": 15.0,
"rewards/consensus_reward_func": 1.5,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.4743501963093877,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.5487187346443534,
"step": 370
},
{
"completion_length": 191.625,
"epoch": 5.23943661971831,
"grad_norm": 50.67697525024414,
"kl": 90.23001140356064,
"learning_rate": 6.860345065188512e-08,
"loss": 0.0902,
"reward": 20.43578866124153,
"reward_std": 3.805197238922119,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.5983824506402016,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.125,
"rewards/xmlcount_reward_func": 0.634281262755394,
"step": 372
},
{
"completion_length": 148.75,
"epoch": 5.267605633802817,
"grad_norm": 24.013933181762695,
"kl": 979.8315967023373,
"learning_rate": 5.9503630151205025e-08,
"loss": 0.9798,
"reward": 14.10513174533844,
"reward_std": 8.168088547885418,
"rewards/concensus_correctness_reward_func": 11.25,
"rewards/consensus_reward_func": 1.25,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.42822520434856415,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.36440624482929707,
"step": 374
},
{
"completion_length": 200.625,
"epoch": 5.295774647887324,
"grad_norm": 10426.9599609375,
"kl": 5901.08397424221,
"learning_rate": 5.104378752663008e-08,
"loss": 5.9011,
"reward": 19.324188113212585,
"reward_std": 8.048752292990685,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.5999692752957344,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5367187485098839,
"step": 376
},
{
"completion_length": 227.375,
"epoch": 5.323943661971831,
"grad_norm": 14.172988891601562,
"kl": 192.83756294846535,
"learning_rate": 4.3226141225268804e-08,
"loss": 0.1928,
"reward": 16.749698162078857,
"reward_std": 10.971801988780499,
"rewards/concensus_correctness_reward_func": 13.75,
"rewards/consensus_reward_func": 1.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.45266704447567463,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.046875,
"rewards/xmlcount_reward_func": 0.4376562498509884,
"step": 378
},
{
"completion_length": 175.5,
"epoch": 5.352112676056338,
"grad_norm": 226.3981170654297,
"kl": 166.27400428056717,
"learning_rate": 3.605274128937464e-08,
"loss": 0.1663,
"reward": 17.65935444831848,
"reward_std": 5.609901927411556,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.43151059560477734,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5403437651693821,
"step": 380
},
{
"completion_length": 177.84375,
"epoch": 5.380281690140845,
"grad_norm": 11.149412155151367,
"kl": 77.37025237083435,
"learning_rate": 2.9525468818755455e-08,
"loss": 0.0774,
"reward": 16.605177223682404,
"reward_std": 7.419661745429039,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.6875,
"rewards/question_recreation_reward_func": 0.6258334219455719,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5418437719345093,
"step": 382
},
{
"completion_length": 209.125,
"epoch": 5.408450704225352,
"grad_norm": 211.39231872558594,
"kl": 465.8537292480469,
"learning_rate": 2.3646035477491726e-08,
"loss": 0.4659,
"reward": 18.196722507476807,
"reward_std": 2.805102661252022,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.5984417237341404,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5670312382280827,
"step": 384
},
{
"completion_length": 202.34375,
"epoch": 5.436619718309859,
"grad_norm": 1496.8240966796875,
"kl": 205.22770684957504,
"learning_rate": 1.841598304507891e-08,
"loss": 0.2052,
"reward": 14.782392874360085,
"reward_std": 6.89620116353035,
"rewards/concensus_correctness_reward_func": 11.875,
"rewards/consensus_reward_func": 1.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.47601788584142923,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.015625,
"rewards/xmlcount_reward_func": 0.3532499959692359,
"step": 386
},
{
"completion_length": 167.96875,
"epoch": 5.464788732394366,
"grad_norm": 20.663803100585938,
"kl": 39.010592728853226,
"learning_rate": 1.383668301212393e-08,
"loss": 0.039,
"reward": 18.894488275051117,
"reward_std": 7.656329156830907,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.48589482717216015,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.03125,
"rewards/xmlcount_reward_func": 0.4242187514901161,
"step": 388
},
{
"completion_length": 165.15625,
"epoch": 5.492957746478873,
"grad_norm": 565.16455078125,
"kl": 197.70873486995697,
"learning_rate": 9.90933622069562e-09,
"loss": 0.1977,
"reward": 17.981685161590576,
"reward_std": 6.837831487879157,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.5238723792135715,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5828124936670065,
"step": 390
},
{
"completion_length": 182.0625,
"epoch": 5.52112676056338,
"grad_norm": 8.814075469970703,
"kl": 79.46024709939957,
"learning_rate": 6.634972549423857e-09,
"loss": 0.0795,
"reward": 16.639443710446358,
"reward_std": 6.897542349994183,
"rewards/concensus_correctness_reward_func": 13.125,
"rewards/consensus_reward_func": 1.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.9375,
"rewards/question_recreation_reward_func": 0.6201934851706028,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5348749901168048,
"step": 392
},
{
"completion_length": 155.0,
"epoch": 5.549295774647887,
"grad_norm": 509.7919006347656,
"kl": 377.71353951096535,
"learning_rate": 4.01445064343281e-09,
"loss": 0.3777,
"reward": 19.832693457603455,
"reward_std": 6.739683650434017,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.48700585681945086,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.6425625011324883,
"step": 394
},
{
"completion_length": 200.125,
"epoch": 5.577464788732394,
"grad_norm": 38.70952224731445,
"kl": 128.81593072414398,
"learning_rate": 2.048457689174943e-09,
"loss": 0.1288,
"reward": 19.608071088790894,
"reward_std": 9.304894164204597,
"rewards/concensus_correctness_reward_func": 15.625,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0625,
"rewards/question_recreation_reward_func": 0.6501020789146423,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.09375,
"rewards/xmlcount_reward_func": 0.5985937379300594,
"step": 396
},
{
"completion_length": 146.4375,
"epoch": 5.605633802816901,
"grad_norm": 10.314447402954102,
"kl": 102.41072046756744,
"learning_rate": 7.375092342298828e-10,
"loss": 0.1024,
"reward": 17.96188724040985,
"reward_std": 8.251108340919018,
"rewards/concensus_correctness_reward_func": 14.375,
"rewards/consensus_reward_func": 1.4375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 1.0,
"rewards/question_recreation_reward_func": 0.5060124294832349,
"rewards/soft_format_reward_func": 0.015625,
"rewards/strict_format_reward_func": 0.0625,
"rewards/xmlcount_reward_func": 0.5652500111609697,
"step": 398
},
{
"completion_length": 160.90625,
"epoch": 5.633802816901408,
"grad_norm": 83.75353240966797,
"kl": 102.39953392744064,
"learning_rate": 8.194905210923143e-11,
"loss": 0.1024,
"reward": 19.95249879360199,
"reward_std": 8.20958011969924,
"rewards/concensus_correctness_reward_func": 16.25,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.8125,
"rewards/question_recreation_reward_func": 0.5820612944662571,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.078125,
"rewards/xmlcount_reward_func": 0.6048125065863132,
"step": 400
},
{
"epoch": 5.633802816901408,
"step": 400,
"total_flos": 0.0,
"train_loss": 0.7875715676811523,
"train_runtime": 3812.444,
"train_samples_per_second": 1.679,
"train_steps_per_second": 0.105
}
],
"logging_steps": 2,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}