{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.714285714285714, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 341.40625, "epoch": 0.05714285714285714, "grad_norm": 596.5550537109375, "kl": 0.0, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "reward": 3.286102021113038, "reward_std": 1.2568062348291278, "rewards/concensus_correctness_reward_func": 0.9363125078380108, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.6091644916159566, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.6468749991327059, "step": 2 }, { "completion_length": 267.03125, "epoch": 0.11428571428571428, "grad_norm": 8.391526222229004, "kl": 0.03340096258034464, "learning_rate": 5e-07, "loss": 0.0, "reward": 6.290437173098326, "reward_std": 0.8421196703563254, "rewards/concensus_correctness_reward_func": 1.8721874952316284, "rewards/consensus_reward_func": 1.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.8401246860812535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 1.09375, "step": 4 }, { "completion_length": 210.40625, "epoch": 0.17142857142857143, "grad_norm": 15.22243881225586, "kl": 2027.1362594434759, "learning_rate": 8.333333333333333e-07, "loss": 2.0271, "reward": 6.841284893453121, "reward_std": 0.4849709497721051, "rewards/concensus_correctness_reward_func": 2.0985624939203262, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9067848596605472, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.1484375, "step": 6 }, { "completion_length": 201.5, "epoch": 0.22857142857142856, "grad_norm": 5023845.0, "kl": 491715.70446118125, "learning_rate": 9.99934441832816e-07, "loss": 491.7157, "reward": 6.59922556579113, "reward_std": 1.0329566281288862, "rewards/concensus_correctness_reward_func": 1.9156874902546406, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.9257256090641022, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.1796875, "step": 8 }, { "completion_length": 241.46875, "epoch": 0.2857142857142857, "grad_norm": 75.62132263183594, "kl": 0.7261713498155586, "learning_rate": 9.994100796397953e-07, "loss": 0.0007, "reward": 6.299689278006554, "reward_std": 1.1271193381398916, "rewards/concensus_correctness_reward_func": 1.8690624944865704, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8681267369538546, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.125, "step": 10 }, { "completion_length": 135.90625, "epoch": 0.34285714285714286, "grad_norm": 13.225509643554688, "kl": 6.158389857970178, "learning_rate": 9.983619052372847e-07, "loss": 0.0062, "reward": 7.896404385566711, "reward_std": 0.2962362109683454, "rewards/concensus_correctness_reward_func": 2.4846875071525574, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.75, "rewards/question_recreation_reward_func": 0.9976543560624123, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2421875, "step": 12 }, { "completion_length": 185.53125, "epoch": 0.4, "grad_norm": 1333.3822021484375, "kl": 140.61930383229628, "learning_rate": 9.967910180154888e-07, "loss": 0.1406, "reward": 6.950483754277229, "reward_std": 0.5596978962421417, "rewards/concensus_correctness_reward_func": 2.1063749976456165, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9378588311374187, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.1875, "step": 14 }, { "completion_length": 160.5, "epoch": 0.45714285714285713, "grad_norm": 1.1600862741470337, "kl": 16.362293783109635, "learning_rate": 9.946990656181779e-07, "loss": 0.0164, "reward": 6.930127799510956, "reward_std": 0.3672862723469734, "rewards/concensus_correctness_reward_func": 2.041000008583069, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.936002803966403, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.21875, "step": 16 }, { "completion_length": 212.8125, "epoch": 0.5142857142857142, "grad_norm": 19658.451171875, "kl": 2282.9115716170054, "learning_rate": 9.92088242214537e-07, "loss": 2.2829, "reward": 6.798925548791885, "reward_std": 1.1088980715867365, "rewards/concensus_correctness_reward_func": 2.1001250073313713, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.8745817970484495, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.13671875, "step": 18 }, { "completion_length": 159.53125, "epoch": 0.5714285714285714, "grad_norm": 31.297624588012695, "kl": 0.9011318488046527, "learning_rate": 9.889612861977853e-07, "loss": 0.0009, "reward": 6.927322618663311, "reward_std": 0.028847315654275008, "rewards/concensus_correctness_reward_func": 2.102874994277954, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9377288408577442, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.18359375, "step": 20 }, { "completion_length": 153.4375, "epoch": 0.6285714285714286, "grad_norm": 60.826229095458984, "kl": 1.329597746487707, "learning_rate": 9.853214773129795e-07, "loss": 0.0013, "reward": 6.199776213616133, "reward_std": 0.005127147152961697, "rewards/concensus_correctness_reward_func": 1.7282500192523003, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9402761983219534, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.1875, "step": 22 }, { "completion_length": 179.5, "epoch": 0.6857142857142857, "grad_norm": 775657472.0, "kl": 50355153.60608631, "learning_rate": 9.81172633217015e-07, "loss": 50355.1562, "reward": 6.3887627720832825, "reward_std": 1.3089241795241833, "rewards/concensus_correctness_reward_func": 1.963062521070242, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.8848253078758717, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 0.7908749878406525, "step": 24 }, { "completion_length": 151.46875, "epoch": 0.7428571428571429, "grad_norm": 14.245716094970703, "kl": 2.350977373425849, "learning_rate": 9.765191054744304e-07, "loss": 0.0024, "reward": 5.939617916941643, "reward_std": 0.934625256806612, "rewards/concensus_correctness_reward_func": 1.749625001102686, "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.8888366278260946, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 1.082406248897314, "step": 26 }, { "completion_length": 128.34375, "epoch": 0.8, "grad_norm": 48335.97265625, "kl": 1940.711479806574, "learning_rate": 9.713657749932171e-07, "loss": 1.9407, "reward": 6.796030431985855, "reward_std": 0.0927647277712822, "rewards/concensus_correctness_reward_func": 1.9866250082850456, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9695616886019707, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.23046875, "step": 28 }, { "completion_length": 122.46875, "epoch": 0.8571428571428571, "grad_norm": 0.2548629343509674, "kl": 22700.092740163207, "learning_rate": 9.657180469054212e-07, "loss": 22.7001, "reward": 7.125140815973282, "reward_std": 0.32153427973389626, "rewards/concensus_correctness_reward_func": 2.1701249927282333, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9667346738278866, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.20703125, "step": 30 }, { "completion_length": 156.9375, "epoch": 0.9142857142857143, "grad_norm": 1398.3612060546875, "kl": 123.12271721323486, "learning_rate": 9.59581844897906e-07, "loss": 0.1231, "reward": 7.055217877030373, "reward_std": 0.24068230390548706, "rewards/concensus_correctness_reward_func": 2.1315624937415123, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9392804062226787, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.21875, "step": 32 }, { "completion_length": 240.0, "epoch": 0.9714285714285714, "grad_norm": 186.86839294433594, "kl": 152.3791024107486, "learning_rate": 9.529636049992233e-07, "loss": 0.1524, "reward": 6.90338921546936, "reward_std": 1.3376336731016636, "rewards/concensus_correctness_reward_func": 2.182187505066395, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.8657329957932234, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 1.12109375, "step": 34 }, { "completion_length": 247.90625, "epoch": 1.0285714285714285, "grad_norm": 326.3687438964844, "kl": 219.12903738673776, "learning_rate": 9.458702688291071e-07, "loss": 0.2191, "reward": 5.31565772742033, "reward_std": 0.9229803088819608, "rewards/concensus_correctness_reward_func": 1.3946250043809414, "rewards/consensus_reward_func": 1.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.8155639320611954, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.359375, "rewards/xmlcount_reward_func": 1.05859375, "step": 36 }, { "completion_length": 211.09375, "epoch": 1.0857142857142856, "grad_norm": 23.403614044189453, "kl": 3.9675100842723623, "learning_rate": 9.383092763176738e-07, "loss": 0.004, "reward": 6.58778091520071, "reward_std": 1.721142528578639, "rewards/concensus_correctness_reward_func": 2.0522499941289425, "rewards/consensus_reward_func": 1.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.8519372157752514, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 1.08984375, "step": 38 }, { "completion_length": 218.0625, "epoch": 1.1428571428571428, "grad_norm": 327.6996154785156, "kl": 0.364447561558336, "learning_rate": 9.302885579019626e-07, "loss": 0.0004, "reward": 6.55630399286747, "reward_std": 1.1096585169434547, "rewards/concensus_correctness_reward_func": 1.981812495738268, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.8752728328108788, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.13671875, "step": 40 }, { "completion_length": 522.21875, "epoch": 1.2, "grad_norm": 47.249237060546875, "kl": 211.27693609474227, "learning_rate": 9.218165262080022e-07, "loss": 0.2113, "reward": 3.5790372733026743, "reward_std": 1.652490053035656, "rewards/concensus_correctness_reward_func": 0.9664374999701977, "rewards/consensus_reward_func": 0.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5032248190109385, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.75, "step": 42 }, { "completion_length": 399.5, "epoch": 1.2571428571428571, "grad_norm": 46957.8125, "kl": 1240.154023682233, "learning_rate": 9.129020672271281e-07, "loss": 1.2402, "reward": 4.165122143924236, "reward_std": 2.677976368338932, "rewards/concensus_correctness_reward_func": 1.1625000014901161, "rewards/consensus_reward_func": 1.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.6002783491458104, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 0.85546875, "step": 44 }, { "completion_length": 348.71875, "epoch": 1.3142857142857143, "grad_norm": 197.34835815429688, "kl": 1697.7357009318657, "learning_rate": 9.035545309958046e-07, "loss": 1.6977, "reward": 5.216266397386789, "reward_std": 2.5298230523912935, "rewards/concensus_correctness_reward_func": 1.415687508881092, "rewards/consensus_reward_func": 1.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.7302663810260128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.9921875, "step": 46 }, { "completion_length": 427.28125, "epoch": 1.3714285714285714, "grad_norm": 107.24671936035156, "kl": 0.5033836024813354, "learning_rate": 8.937837217887272e-07, "loss": 0.0005, "reward": 3.593818176537752, "reward_std": 1.2321268621553827, "rewards/concensus_correctness_reward_func": 0.9172500036656857, "rewards/consensus_reward_func": 1.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.5789119017135818, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.234375, "rewards/xmlcount_reward_func": 0.80078125, "step": 48 }, { "completion_length": 406.6875, "epoch": 1.4285714285714286, "grad_norm": 2652.8984375, "kl": 21193.798785352148, "learning_rate": 8.83599887835493e-07, "loss": 21.1938, "reward": 3.9688764177262783, "reward_std": 3.281522080527793, "rewards/concensus_correctness_reward_func": 1.0411249957978725, "rewards/consensus_reward_func": 1.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.6308764494024217, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.859375, "step": 50 }, { "completion_length": 456.0, "epoch": 1.4857142857142858, "grad_norm": 9026.6044921875, "kl": 1415.1057826047763, "learning_rate": 8.73013710571623e-07, "loss": 1.4151, "reward": 3.0648840237408876, "reward_std": 1.9786420244963665, "rewards/concensus_correctness_reward_func": 0.8255000002682209, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.45813402088242583, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.15625, "rewards/xmlcount_reward_func": 0.6875, "step": 52 }, { "completion_length": 484.0, "epoch": 1.5428571428571427, "grad_norm": 139.81741333007812, "kl": 0.6134039051830769, "learning_rate": 8.620362934352108e-07, "loss": 0.0006, "reward": 4.179860107600689, "reward_std": 1.8562917799558782, "rewards/concensus_correctness_reward_func": 1.2471874989569187, "rewards/consensus_reward_func": 1.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.5654851646249881, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.8046875, "step": 54 }, { "completion_length": 408.9375, "epoch": 1.6, "grad_norm": 102.32249450683594, "kl": 3008801003.4072285, "learning_rate": 8.506791502209496e-07, "loss": 3008801.25, "reward": 3.813065191730857, "reward_std": 1.33644521248425, "rewards/concensus_correctness_reward_func": 1.0011250115931034, "rewards/consensus_reward_func": 0.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.5424089302105131, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.203125, "rewards/xmlcount_reward_func": 0.81640625, "step": 56 }, { "completion_length": 376.25, "epoch": 1.657142857142857, "grad_norm": 24.86591911315918, "kl": 76.24003965221345, "learning_rate": 8.389541930037516e-07, "loss": 0.0762, "reward": 3.775482662022114, "reward_std": 1.995910257101059, "rewards/concensus_correctness_reward_func": 0.9166250079870224, "rewards/consensus_reward_func": 1.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6908889040350914, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.91796875, "step": 58 }, { "completion_length": 184.46875, "epoch": 1.7142857142857144, "grad_norm": 748.4888916015625, "kl": 16.552064943592995, "learning_rate": 8.268737196446263e-07, "loss": 0.0166, "reward": 5.196034669876099, "reward_std": 2.2851073294878006, "rewards/concensus_correctness_reward_func": 1.2321875053457916, "rewards/consensus_reward_func": 1.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.8818159140646458, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.296875, "rewards/xmlcount_reward_func": 1.09765625, "step": 60 }, { "completion_length": 354.375, "epoch": 1.7714285714285714, "grad_norm": 11774.6318359375, "kl": 594.4727419780102, "learning_rate": 8.144504008919222e-07, "loss": 0.5945, "reward": 5.226467318832874, "reward_std": 1.8939718978672317, "rewards/concensus_correctness_reward_func": 1.5290624983608723, "rewards/consensus_reward_func": 1.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.7208423566626152, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.328125, "rewards/xmlcount_reward_func": 0.9609375, "step": 62 }, { "completion_length": 239.03125, "epoch": 1.8285714285714287, "grad_norm": 63723.67578125, "kl": 2681.7918725676136, "learning_rate": 8.016972670914623e-07, "loss": 2.6818, "reward": 5.760847687721252, "reward_std": 1.1207159195910208, "rewards/concensus_correctness_reward_func": 1.6048124991357327, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.8435351252555847, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.125, "step": 64 }, { "completion_length": 166.84375, "epoch": 1.8857142857142857, "grad_norm": 0.016271423548460007, "kl": 0.16761540318839252, "learning_rate": 7.886276945195097e-07, "loss": 0.0002, "reward": 6.907515615224838, "reward_std": 0.27857801198842935, "rewards/concensus_correctness_reward_func": 2.0479375049471855, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9689531102776527, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.21875, "step": 66 }, { "completion_length": 166.0625, "epoch": 1.9428571428571428, "grad_norm": 1.8111671209335327, "kl": 0.17472450132481754, "learning_rate": 7.752553913529018e-07, "loss": 0.0002, "reward": 7.087520241737366, "reward_std": 0.6711924958362943, "rewards/concensus_correctness_reward_func": 2.3121249973773956, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9998952522873878, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.7911249995231628, "step": 68 }, { "completion_length": 155.53125, "epoch": 2.0, "grad_norm": 0.02920331247150898, "kl": 0.18397854757495224, "learning_rate": 7.61594383291065e-07, "loss": 0.0002, "reward": 6.990590900182724, "reward_std": 0.46470576524734497, "rewards/concensus_correctness_reward_func": 2.11124999076128, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.9691846631467342, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.20703125, "step": 70 }, { "completion_length": 136.96875, "epoch": 2.057142857142857, "grad_norm": 0.012207652442157269, "kl": 0.12467939942143857, "learning_rate": 7.476589988449938e-07, "loss": 0.0001, "reward": 7.336249977350235, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.2112499997019768, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 72 }, { "completion_length": 140.25, "epoch": 2.1142857142857143, "grad_norm": 1.8709704875946045, "kl": 0.14828130067326128, "learning_rate": 7.334638543086203e-07, "loss": 0.0001, "reward": 7.094191342592239, "reward_std": 8.292392158182338e-05, "rewards/concensus_correctness_reward_func": 2.0942500084638596, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9999413713812828, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 74 }, { "completion_length": 158.71875, "epoch": 2.1714285714285713, "grad_norm": 0.01471527200192213, "kl": 0.20862194756045938, "learning_rate": 7.190238384283412e-07, "loss": 0.0002, "reward": 6.877131998538971, "reward_std": 0.7933639287948608, "rewards/concensus_correctness_reward_func": 2.067374996840954, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.9687882512807846, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.9815937429666519, "step": 76 }, { "completion_length": 188.125, "epoch": 2.2285714285714286, "grad_norm": 1.460219144821167, "kl": 4.615653241518885, "learning_rate": 7.043540967867781e-07, "loss": 0.0046, "reward": 6.435562700033188, "reward_std": 0.9147683555056574, "rewards/concensus_correctness_reward_func": 1.9740000180900097, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9308439530432224, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.9369687438011169, "step": 78 }, { "completion_length": 167.875, "epoch": 2.2857142857142856, "grad_norm": 5.986639976501465, "kl": 97.31889040162787, "learning_rate": 6.894700159171534e-07, "loss": 0.0973, "reward": 6.660285115242004, "reward_std": 0.38862577243708074, "rewards/concensus_correctness_reward_func": 1.9778125062584877, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9363788738846779, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.21484375, "step": 80 }, { "completion_length": 132.59375, "epoch": 2.342857142857143, "grad_norm": 140.21267700195312, "kl": 11.938221657648683, "learning_rate": 6.743872071649411e-07, "loss": 0.0119, "reward": 7.314000904560089, "reward_std": 0.03995019569993019, "rewards/concensus_correctness_reward_func": 2.2172499895095825, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9912821874022484, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 82 }, { "completion_length": 177.125, "epoch": 2.4, "grad_norm": 0.013902968727052212, "kl": 1.2978905094787478, "learning_rate": 6.59121490313722e-07, "loss": 0.0013, "reward": 6.766894176602364, "reward_std": 0.4406161531805992, "rewards/concensus_correctness_reward_func": 2.1780624948441982, "rewards/consensus_reward_func": 1.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.938206740480382, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 0.947500005364418, "step": 84 }, { "completion_length": 130.09375, "epoch": 2.4571428571428573, "grad_norm": 9.800251960754395, "kl": 0.22617360670119524, "learning_rate": 6.436888769924141e-07, "loss": 0.0002, "reward": 7.254179358482361, "reward_std": 0.013411822263151407, "rewards/concensus_correctness_reward_func": 2.1394999995827675, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9896793477237225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 86 }, { "completion_length": 168.96875, "epoch": 2.5142857142857142, "grad_norm": 1.437393307685852, "kl": 0.18564400169998407, "learning_rate": 6.281055538812861e-07, "loss": 0.0002, "reward": 7.096096932888031, "reward_std": 0.3263694606721401, "rewards/concensus_correctness_reward_func": 2.0173750072717667, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9615344516932964, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2109375, "step": 88 }, { "completion_length": 225.4375, "epoch": 2.571428571428571, "grad_norm": 143.41500854492188, "kl": 8.297026936896145, "learning_rate": 6.123878657343647e-07, "loss": 0.0083, "reward": 6.617781460285187, "reward_std": 1.0305635929107666, "rewards/concensus_correctness_reward_func": 1.9847500026226044, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.9064690098166466, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.1484375, "step": 90 }, { "completion_length": 155.125, "epoch": 2.6285714285714286, "grad_norm": 29.54805564880371, "kl": 1.2011672258377075, "learning_rate": 5.96552298236044e-07, "loss": 0.0012, "reward": 7.490390375256538, "reward_std": 0.510272353887558, "rewards/concensus_correctness_reward_func": 2.4165625013411045, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5625, "rewards/question_recreation_reward_func": 0.9136716090142727, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.20703125, "step": 92 }, { "completion_length": 183.28125, "epoch": 2.685714285714286, "grad_norm": 260070128.0, "kl": 8012920.115434824, "learning_rate": 5.806154607098799e-07, "loss": 8012.9199, "reward": 6.427686184644699, "reward_std": 0.5588814318180084, "rewards/concensus_correctness_reward_func": 1.8334374986588955, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9379986636340618, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.1875, "step": 94 }, { "completion_length": 161.90625, "epoch": 2.742857142857143, "grad_norm": 165.2026824951172, "kl": 1.9255495527759194, "learning_rate": 5.645940686977032e-07, "loss": 0.0019, "reward": 6.643823355436325, "reward_std": 0.2799105942249298, "rewards/concensus_correctness_reward_func": 1.9093125015497208, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9688858352601528, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.21875, "step": 96 }, { "completion_length": 216.78125, "epoch": 2.8, "grad_norm": 15576.0224609375, "kl": 3184225.6172290286, "learning_rate": 5.485049264273241e-07, "loss": 3184.2256, "reward": 6.616672560572624, "reward_std": 1.0126504600048065, "rewards/concensus_correctness_reward_func": 1.9756249897181988, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.9066725894808769, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.15625, "step": 98 }, { "completion_length": 155.625, "epoch": 2.857142857142857, "grad_norm": 43.54243850708008, "kl": 17.29876364581287, "learning_rate": 5.323649091872178e-07, "loss": 0.0173, "reward": 7.1484761238098145, "reward_std": 0.29683050513267517, "rewards/concensus_correctness_reward_func": 2.252500005066395, "rewards/consensus_reward_func": 1.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9389448185684159, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.20703125, "step": 100 }, { "completion_length": 196.84375, "epoch": 2.914285714285714, "grad_norm": 1101.1700439453125, "kl": 313.8222270826809, "learning_rate": 5.16190945626678e-07, "loss": 0.3138, "reward": 6.691585049033165, "reward_std": 0.5840071098791668, "rewards/concensus_correctness_reward_func": 1.989124983549118, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9368351008743048, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.1875, "step": 102 }, { "completion_length": 153.96875, "epoch": 2.9714285714285715, "grad_norm": 1.4117786884307861, "kl": 0.2296100074891001, "learning_rate": 5e-07, "loss": 0.0002, "reward": 7.403991624712944, "reward_std": 0.28745076060295105, "rewards/concensus_correctness_reward_func": 2.2985000126063824, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9687728695571423, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.21484375, "step": 104 }, { "completion_length": 154.875, "epoch": 3.0285714285714285, "grad_norm": 0.05570273473858833, "kl": 295.5748745780438, "learning_rate": 4.838090543733221e-07, "loss": 0.2956, "reward": 7.4088806957006454, "reward_std": 0.2800062551832525, "rewards/concensus_correctness_reward_func": 2.299625001847744, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9686306864023209, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.21875, "step": 106 }, { "completion_length": 164.65625, "epoch": 3.085714285714286, "grad_norm": 0.04709651321172714, "kl": 0.1969663049094379, "learning_rate": 4.676350908127821e-07, "loss": 0.0002, "reward": 7.149936303496361, "reward_std": 0.28781798481941223, "rewards/concensus_correctness_reward_func": 2.1696874983608723, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9685300551354885, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.21484375, "step": 108 }, { "completion_length": 139.78125, "epoch": 3.142857142857143, "grad_norm": 0.015770502388477325, "kl": 0.20569787896238267, "learning_rate": 4.5149507357267597e-07, "loss": 0.0002, "reward": 7.575696915388107, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.3559999987483025, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.969696968793869, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 110 }, { "completion_length": 157.21875, "epoch": 3.2, "grad_norm": 50861.9453125, "kl": 6200.607699844055, "learning_rate": 4.354059313022969e-07, "loss": 6.2006, "reward": 7.015130370855331, "reward_std": 0.45689108967781067, "rewards/concensus_correctness_reward_func": 2.093562498688698, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.9684428572654724, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.21875, "step": 112 }, { "completion_length": 164.0, "epoch": 3.257142857142857, "grad_norm": 0.013901927508413792, "kl": 6807.066189021803, "learning_rate": 4.193845392901201e-07, "loss": 6.8071, "reward": 6.620099663734436, "reward_std": 0.3450772762298584, "rewards/concensus_correctness_reward_func": 2.0902499929070473, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.937474632402882, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.9986249953508377, "step": 114 }, { "completion_length": 140.21875, "epoch": 3.314285714285714, "grad_norm": 0.050351180136203766, "kl": 0.195719227893278, "learning_rate": 4.0344770176395606e-07, "loss": 0.0002, "reward": 7.224750071763992, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.224750004708767, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 116 }, { "completion_length": 163.25, "epoch": 3.3714285714285714, "grad_norm": 21.72372817993164, "kl": 2.1254541873931885, "learning_rate": 3.8761213426563543e-07, "loss": 0.0021, "reward": 7.039299890398979, "reward_std": 0.43868909776210785, "rewards/concensus_correctness_reward_func": 2.16687498614192, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.9388312064111233, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.21484375, "step": 118 }, { "completion_length": 159.15625, "epoch": 3.4285714285714284, "grad_norm": 0.3591388761997223, "kl": 6.059129260480404, "learning_rate": 3.718944461187138e-07, "loss": 0.0061, "reward": 6.463347539305687, "reward_std": 0.3400730788707733, "rewards/concensus_correctness_reward_func": 1.9808750078082085, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9365975013934076, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.9521249979734421, "step": 120 }, { "completion_length": 123.65625, "epoch": 3.4857142857142858, "grad_norm": 194.1432647705078, "kl": 57.89645641669631, "learning_rate": 3.563111230075859e-07, "loss": 0.0579, "reward": 7.029898107051849, "reward_std": 0.3636358277872205, "rewards/concensus_correctness_reward_func": 2.115187507122755, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.914116925559938, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.18809375166893, "step": 122 }, { "completion_length": 148.8125, "epoch": 3.5428571428571427, "grad_norm": 14.622519493103027, "kl": 0.3278482835739851, "learning_rate": 3.408785096862782e-07, "loss": 0.0003, "reward": 6.669890329241753, "reward_std": 0.951744182035327, "rewards/concensus_correctness_reward_func": 2.159937519580126, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9516402631998062, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 0.7770625054836273, "step": 124 }, { "completion_length": 138.34375, "epoch": 3.6, "grad_norm": 3.0443975925445557, "kl": 278.0337795561645, "learning_rate": 3.2561279283505884e-07, "loss": 0.278, "reward": 7.347518771886826, "reward_std": 0.10661890726260026, "rewards/concensus_correctness_reward_func": 2.1738749966025352, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9431750550866127, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 126 }, { "completion_length": 133.65625, "epoch": 3.657142857142857, "grad_norm": 63.743953704833984, "kl": 141.1469784581568, "learning_rate": 3.105299840828466e-07, "loss": 0.1411, "reward": 6.800521522760391, "reward_std": 0.010683320462703705, "rewards/concensus_correctness_reward_func": 1.9629999995231628, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9625215027481318, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 128 }, { "completion_length": 136.46875, "epoch": 3.7142857142857144, "grad_norm": 8.97818660736084, "kl": 0.5335320448502898, "learning_rate": 2.95645903213222e-07, "loss": 0.0005, "reward": 7.278413146734238, "reward_std": 0.07083342224359512, "rewards/concensus_correctness_reward_func": 2.2035000026226044, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9694444462656975, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 130 }, { "completion_length": 129.1875, "epoch": 3.7714285714285714, "grad_norm": 2.273700714111328, "kl": 0.1980545329861343, "learning_rate": 2.8097616157165885e-07, "loss": 0.0002, "reward": 7.1327812522649765, "reward_std": 0.10708247870206833, "rewards/concensus_correctness_reward_func": 2.1523124910891056, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 1.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 132 }, { "completion_length": 136.46875, "epoch": 3.8285714285714287, "grad_norm": 3.4171736240386963, "kl": 0.1759730235207826, "learning_rate": 2.665361456913797e-07, "loss": 0.0002, "reward": 7.524241715669632, "reward_std": 0.012173316441476345, "rewards/concensus_correctness_reward_func": 2.2848750203847885, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.989366702735424, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 134 }, { "completion_length": 132.6875, "epoch": 3.8857142857142857, "grad_norm": 21.402355194091797, "kl": 0.2659228784032166, "learning_rate": 2.523410011550064e-07, "loss": 0.0003, "reward": 6.992657542228699, "reward_std": 0.04131975769996643, "rewards/concensus_correctness_reward_func": 2.021875023841858, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9707825183868408, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 136 }, { "completion_length": 129.03125, "epoch": 3.942857142857143, "grad_norm": 132813709312.0, "kl": 6803573266.517654, "learning_rate": 2.3840561670893495e-07, "loss": 6803573.5, "reward": 6.834182530641556, "reward_std": 0.2304869929794222, "rewards/concensus_correctness_reward_func": 2.089499995112419, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8875887226313353, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.1695937439799309, "step": 138 }, { "completion_length": 140.8125, "epoch": 4.0, "grad_norm": 0.022691868245601654, "kl": 26381.484041058226, "learning_rate": 2.247446086470982e-07, "loss": 26.3815, "reward": 7.248510301113129, "reward_std": 0.0460367277264595, "rewards/concensus_correctness_reward_func": 2.21637499332428, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.969635296612978, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 140 }, { "completion_length": 157.53125, "epoch": 4.057142857142857, "grad_norm": 0.009256873279809952, "kl": 3.527690098620951, "learning_rate": 2.113723054804904e-07, "loss": 0.0035, "reward": 7.401162892580032, "reward_std": 0.6326838135719299, "rewards/concensus_correctness_reward_func": 2.4743749871850014, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.9991629458963871, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 0.8182500004768372, "step": 142 }, { "completion_length": 129.03125, "epoch": 4.114285714285714, "grad_norm": 7.365884304046631, "kl": 0.2767415994312614, "learning_rate": 1.9830273290853766e-07, "loss": 0.0003, "reward": 7.2931163012981415, "reward_std": 0.07107648908277042, "rewards/concensus_correctness_reward_func": 2.218375027179718, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9692725799977779, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 144 }, { "completion_length": 156.8125, "epoch": 4.171428571428572, "grad_norm": 4.549909591674805, "kl": 2098.400880107074, "learning_rate": 1.8554959910807772e-07, "loss": 2.0984, "reward": 6.6754628121852875, "reward_std": 0.5301313251256943, "rewards/concensus_correctness_reward_func": 2.098374992609024, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.9207440502941608, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.0000937432050705, "step": 146 }, { "completion_length": 161.1875, "epoch": 4.228571428571429, "grad_norm": 130.0556640625, "kl": 4.999799037585035, "learning_rate": 1.7312628035537386e-07, "loss": 0.005, "reward": 7.465287238359451, "reward_std": 0.6660420118496404, "rewards/concensus_correctness_reward_func": 2.3166875019669533, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.625, "rewards/question_recreation_reward_func": 0.9381621927022934, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.194812498986721, "step": 148 }, { "completion_length": 134.40625, "epoch": 4.285714285714286, "grad_norm": 8.668111801147461, "kl": 641.8768360905815, "learning_rate": 1.6104580699624837e-07, "loss": 0.6419, "reward": 7.1231569945812225, "reward_std": 0.0297078593284823, "rewards/concensus_correctness_reward_func": 2.342249996960163, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.8770633104722947, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.2163437493145466, "step": 150 }, { "completion_length": 122.125, "epoch": 4.3428571428571425, "grad_norm": 6.391092777252197, "kl": 0.20541435782797635, "learning_rate": 1.493208497790504e-07, "loss": 0.0002, "reward": 6.966011185199022, "reward_std": 0.00430500041693449, "rewards/concensus_correctness_reward_func": 2.106500007212162, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9532612152397633, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.1875, "step": 152 }, { "completion_length": 127.3125, "epoch": 4.4, "grad_norm": 43.81622314453125, "kl": 5.563992372946814, "learning_rate": 1.3796370656478934e-07, "loss": 0.0056, "reward": 6.806715875864029, "reward_std": 0.06279254704713821, "rewards/concensus_correctness_reward_func": 1.9822500199079514, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9689970314502716, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 154 }, { "completion_length": 130.46875, "epoch": 4.457142857142857, "grad_norm": 122.86102294921875, "kl": 20.539323112927377, "learning_rate": 1.2698628942837697e-07, "loss": 0.0205, "reward": 6.735322088003159, "reward_std": 0.23215299472212791, "rewards/concensus_correctness_reward_func": 2.086000010371208, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.896103395964019, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2219687476754189, "step": 156 }, { "completion_length": 132.90625, "epoch": 4.514285714285714, "grad_norm": 69.87224578857422, "kl": 0.6354921485763043, "learning_rate": 1.1640011216450691e-07, "loss": 0.0006, "reward": 7.233689934015274, "reward_std": 0.514858566224575, "rewards/concensus_correctness_reward_func": 2.2895624935626984, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.8818775303661823, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.1872499994933605, "step": 158 }, { "completion_length": 133.59375, "epoch": 4.571428571428571, "grad_norm": 8.249082565307617, "kl": 0.26116269128397107, "learning_rate": 1.0621627821127288e-07, "loss": 0.0003, "reward": 7.30239263176918, "reward_std": 0.0416584275662899, "rewards/concensus_correctness_reward_func": 2.208875000476837, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9685175716876984, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 160 }, { "completion_length": 136.1875, "epoch": 4.628571428571428, "grad_norm": 0.024719232693314552, "kl": 0.7720870058983564, "learning_rate": 9.644546900419531e-08, "loss": 0.0008, "reward": 6.74112144112587, "reward_std": 0.14779043197631836, "rewards/concensus_correctness_reward_func": 1.97062499076128, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9422151371836662, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2189062498509884, "step": 162 }, { "completion_length": 132.28125, "epoch": 4.685714285714286, "grad_norm": 99.90188598632812, "kl": 5.756765312515199, "learning_rate": 8.70979327728718e-08, "loss": 0.0058, "reward": 7.076995253562927, "reward_std": 0.3961862847208977, "rewards/concensus_correctness_reward_func": 2.1743750162422657, "rewards/consensus_reward_func": 1.9375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9299640282988548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.20703125, "step": 164 }, { "completion_length": 133.46875, "epoch": 4.742857142857143, "grad_norm": 1942.1475830078125, "kl": 80.78691061586142, "learning_rate": 7.81834737919978e-08, "loss": 0.0808, "reward": 7.304515153169632, "reward_std": 0.11349444479128579, "rewards/concensus_correctness_reward_func": 2.350000001490116, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.8892338592559099, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2215312495827675, "step": 166 }, { "completion_length": 131.75, "epoch": 4.8, "grad_norm": 36.11498260498047, "kl": 35.66279458301142, "learning_rate": 6.971144209803736e-08, "loss": 0.0357, "reward": 7.376064032316208, "reward_std": 0.1673837215421372, "rewards/concensus_correctness_reward_func": 2.3438749983906746, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.8862515506334603, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.2084374986588955, "step": 168 }, { "completion_length": 159.9375, "epoch": 4.857142857142857, "grad_norm": 1420.3355712890625, "kl": 5809.957044942072, "learning_rate": 6.16907236823262e-08, "loss": 5.81, "reward": 6.121835008263588, "reward_std": 0.37124670308548957, "rewards/concensus_correctness_reward_func": 1.8349375016987324, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.7878349621314555, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.40625, "rewards/xmlcount_reward_func": 1.1553125008940697, "step": 170 }, { "completion_length": 139.1875, "epoch": 4.914285714285715, "grad_norm": 10.688383102416992, "kl": 0.22011687979102135, "learning_rate": 5.412973117089287e-08, "loss": 0.0002, "reward": 7.198920458555222, "reward_std": 0.20292456448078156, "rewards/concensus_correctness_reward_func": 2.0933750048279762, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.9375766552984715, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 172 }, { "completion_length": 136.0, "epoch": 4.9714285714285715, "grad_norm": 11.03411865234375, "kl": 1.326516842469573, "learning_rate": 4.703639500077655e-08, "loss": 0.0013, "reward": 6.875307530164719, "reward_std": 0.13279777020215988, "rewards/concensus_correctness_reward_func": 2.067124992609024, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9099949998781085, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.1950624994933605, "step": 174 }, { "completion_length": 137.59375, "epoch": 5.0285714285714285, "grad_norm": 130.2829132080078, "kl": 10.879363138461486, "learning_rate": 4.041815510209395e-08, "loss": 0.0109, "reward": 6.4715642631053925, "reward_std": 0.063371941447258, "rewards/concensus_correctness_reward_func": 1.7663749903440475, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.9747205302119255, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 176 }, { "completion_length": 133.40625, "epoch": 5.085714285714285, "grad_norm": 34.99993133544922, "kl": 2.79548569873441, "learning_rate": 3.4281953094578875e-08, "loss": 0.0028, "reward": 7.421046018600464, "reward_std": 0.10142664304294158, "rewards/concensus_correctness_reward_func": 2.3401249796152115, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9102647739928216, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.2175312489271164, "step": 178 }, { "completion_length": 167.1875, "epoch": 5.142857142857143, "grad_norm": 3914.157470703125, "kl": 408.69288858864456, "learning_rate": 2.8634225006782864e-08, "loss": 0.4087, "reward": 6.723982572555542, "reward_std": 0.15523457527160645, "rewards/concensus_correctness_reward_func": 1.9587500020861626, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.9449200928211212, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.2265625, "step": 180 }, { "completion_length": 128.625, "epoch": 5.2, "grad_norm": 40.13917922973633, "kl": 16038.165946810506, "learning_rate": 2.348089452556956e-08, "loss": 16.0382, "reward": 6.769445240497589, "reward_std": 0.5420721787959337, "rewards/concensus_correctness_reward_func": 2.2128750011324883, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8193827569484711, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.421875, "rewards/xmlcount_reward_func": 1.190312497317791, "step": 182 }, { "completion_length": 135.5625, "epoch": 5.257142857142857, "grad_norm": 0.020569220185279846, "kl": 0.19448763993568718, "learning_rate": 1.882736678298491e-08, "loss": 0.0002, "reward": 7.055787056684494, "reward_std": 0.03314562886953354, "rewards/concensus_correctness_reward_func": 2.081250011920929, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.9979745373129845, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2421875, "step": 184 }, { "completion_length": 156.5, "epoch": 5.314285714285714, "grad_norm": 19.919050216674805, "kl": 14.24597706948407, "learning_rate": 1.4678522687020412e-08, "loss": 0.0142, "reward": 6.925608813762665, "reward_std": 0.1697275247424841, "rewards/concensus_correctness_reward_func": 2.210874982178211, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.8500463847303763, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.1771874986588955, "step": 186 }, { "completion_length": 128.25, "epoch": 5.371428571428572, "grad_norm": 677.8944702148438, "kl": 176.0574713665992, "learning_rate": 1.1038713802214717e-08, "loss": 0.1761, "reward": 6.819926559925079, "reward_std": 0.1877051831688732, "rewards/concensus_correctness_reward_func": 2.0945000126957893, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.7840202623046935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.453125, "rewards/xmlcount_reward_func": 1.23828125, "step": 188 }, { "completion_length": 128.46875, "epoch": 5.428571428571429, "grad_norm": 0.06580457091331482, "kl": 0.8888655919581652, "learning_rate": 7.91175778546288e-09, "loss": 0.0009, "reward": 7.360000848770142, "reward_std": 0.15114279091358185, "rewards/concensus_correctness_reward_func": 2.34187500923872, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.4375, "rewards/question_recreation_reward_func": 0.975157156586647, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.24609375, "step": 190 }, { "completion_length": 133.9375, "epoch": 5.485714285714286, "grad_norm": 9619.09375, "kl": 433.19152829330415, "learning_rate": 5.3009343818219975e-09, "loss": 0.4332, "reward": 7.0666501224040985, "reward_std": 0.1772715449333191, "rewards/concensus_correctness_reward_func": 2.06700000166893, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.9739313460886478, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.2288437485694885, "step": 192 }, { "completion_length": 163.15625, "epoch": 5.542857142857143, "grad_norm": 17.19892120361328, "kl": 1.2293917203787714, "learning_rate": 3.2089819845111944e-09, "loss": 0.0012, "reward": 7.017405599355698, "reward_std": 0.27922892197966576, "rewards/concensus_correctness_reward_func": 2.2905624993145466, "rewards/consensus_reward_func": 1.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.3125, "rewards/question_recreation_reward_func": 0.9062805884168483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.4375, "rewards/xmlcount_reward_func": 1.1955625005066395, "step": 194 }, { "completion_length": 137.96875, "epoch": 5.6, "grad_norm": 0.01611829362809658, "kl": 0.16643174993805587, "learning_rate": 1.638094762715314e-09, "loss": 0.0002, "reward": 7.399079352617264, "reward_std": 0.0, "rewards/concensus_correctness_reward_func": 2.149375006556511, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.5, "rewards/question_recreation_reward_func": 0.9997043535113335, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 1.25, "step": 196 }, { "completion_length": 158.21875, "epoch": 5.6571428571428575, "grad_norm": 2546.03564453125, "kl": 279.96135277603753, "learning_rate": 5.899203602046654e-10, "loss": 0.28, "reward": 6.826089903712273, "reward_std": 0.3347325325012207, "rewards/concensus_correctness_reward_func": 2.163374997675419, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.25, "rewards/question_recreation_reward_func": 0.938058597035706, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.46875, "rewards/xmlcount_reward_func": 1.0059062540531158, "step": 198 }, { "completion_length": 155.625, "epoch": 5.714285714285714, "grad_norm": 32.50419616699219, "kl": 3.371709798462689, "learning_rate": 6.555816718389895e-11, "loss": 0.0034, "reward": 7.258530080318451, "reward_std": 0.13536836579442024, "rewards/concensus_correctness_reward_func": 2.2292499989271164, "rewards/consensus_reward_func": 2.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.375, "rewards/question_recreation_reward_func": 0.9394363649189472, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.484375, "rewards/xmlcount_reward_func": 1.23046875, "step": 200 }, { "epoch": 5.714285714285714, "step": 200, "total_flos": 0.0, "train_loss": 98745.44226059376, "train_runtime": 1459.4013, "train_samples_per_second": 2.193, "train_steps_per_second": 0.137 } ], "logging_steps": 2, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }