DoomerHope's picture
End of training
d803f09 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.714285714285714,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 341.40625,
"epoch": 0.05714285714285714,
"grad_norm": 596.5550537109375,
"kl": 0.0,
"learning_rate": 1.6666666666666665e-07,
"loss": 0.0,
"reward": 3.286102021113038,
"reward_std": 1.2568062348291278,
"rewards/concensus_correctness_reward_func": 0.9363125078380108,
"rewards/consensus_reward_func": 0.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.6091644916159566,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.6468749991327059,
"step": 2
},
{
"completion_length": 267.03125,
"epoch": 0.11428571428571428,
"grad_norm": 8.391526222229004,
"kl": 0.03340096258034464,
"learning_rate": 5e-07,
"loss": 0.0,
"reward": 6.290437173098326,
"reward_std": 0.8421196703563254,
"rewards/concensus_correctness_reward_func": 1.8721874952316284,
"rewards/consensus_reward_func": 1.6875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.8401246860812535,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.421875,
"rewards/xmlcount_reward_func": 1.09375,
"step": 4
},
{
"completion_length": 210.40625,
"epoch": 0.17142857142857143,
"grad_norm": 15.22243881225586,
"kl": 2027.1362594434759,
"learning_rate": 8.333333333333333e-07,
"loss": 2.0271,
"reward": 6.841284893453121,
"reward_std": 0.4849709497721051,
"rewards/concensus_correctness_reward_func": 2.0985624939203262,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9067848596605472,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.1484375,
"step": 6
},
{
"completion_length": 201.5,
"epoch": 0.22857142857142856,
"grad_norm": 5023845.0,
"kl": 491715.70446118125,
"learning_rate": 9.99934441832816e-07,
"loss": 491.7157,
"reward": 6.59922556579113,
"reward_std": 1.0329566281288862,
"rewards/concensus_correctness_reward_func": 1.9156874902546406,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.9257256090641022,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.1796875,
"step": 8
},
{
"completion_length": 241.46875,
"epoch": 0.2857142857142857,
"grad_norm": 75.62132263183594,
"kl": 0.7261713498155586,
"learning_rate": 9.994100796397953e-07,
"loss": 0.0007,
"reward": 6.299689278006554,
"reward_std": 1.1271193381398916,
"rewards/concensus_correctness_reward_func": 1.8690624944865704,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.8681267369538546,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.125,
"step": 10
},
{
"completion_length": 135.90625,
"epoch": 0.34285714285714286,
"grad_norm": 13.225509643554688,
"kl": 6.158389857970178,
"learning_rate": 9.983619052372847e-07,
"loss": 0.0062,
"reward": 7.896404385566711,
"reward_std": 0.2962362109683454,
"rewards/concensus_correctness_reward_func": 2.4846875071525574,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.75,
"rewards/question_recreation_reward_func": 0.9976543560624123,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.2421875,
"step": 12
},
{
"completion_length": 185.53125,
"epoch": 0.4,
"grad_norm": 1333.3822021484375,
"kl": 140.61930383229628,
"learning_rate": 9.967910180154888e-07,
"loss": 0.1406,
"reward": 6.950483754277229,
"reward_std": 0.5596978962421417,
"rewards/concensus_correctness_reward_func": 2.1063749976456165,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9378588311374187,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.1875,
"step": 14
},
{
"completion_length": 160.5,
"epoch": 0.45714285714285713,
"grad_norm": 1.1600862741470337,
"kl": 16.362293783109635,
"learning_rate": 9.946990656181779e-07,
"loss": 0.0164,
"reward": 6.930127799510956,
"reward_std": 0.3672862723469734,
"rewards/concensus_correctness_reward_func": 2.041000008583069,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.936002803966403,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.21875,
"step": 16
},
{
"completion_length": 212.8125,
"epoch": 0.5142857142857142,
"grad_norm": 19658.451171875,
"kl": 2282.9115716170054,
"learning_rate": 9.92088242214537e-07,
"loss": 2.2829,
"reward": 6.798925548791885,
"reward_std": 1.1088980715867365,
"rewards/concensus_correctness_reward_func": 2.1001250073313713,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.8745817970484495,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.13671875,
"step": 18
},
{
"completion_length": 159.53125,
"epoch": 0.5714285714285714,
"grad_norm": 31.297624588012695,
"kl": 0.9011318488046527,
"learning_rate": 9.889612861977853e-07,
"loss": 0.0009,
"reward": 6.927322618663311,
"reward_std": 0.028847315654275008,
"rewards/concensus_correctness_reward_func": 2.102874994277954,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9377288408577442,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.18359375,
"step": 20
},
{
"completion_length": 153.4375,
"epoch": 0.6285714285714286,
"grad_norm": 60.826229095458984,
"kl": 1.329597746487707,
"learning_rate": 9.853214773129795e-07,
"loss": 0.0013,
"reward": 6.199776213616133,
"reward_std": 0.005127147152961697,
"rewards/concensus_correctness_reward_func": 1.7282500192523003,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9402761983219534,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.1875,
"step": 22
},
{
"completion_length": 179.5,
"epoch": 0.6857142857142857,
"grad_norm": 775657472.0,
"kl": 50355153.60608631,
"learning_rate": 9.81172633217015e-07,
"loss": 50355.1562,
"reward": 6.3887627720832825,
"reward_std": 1.3089241795241833,
"rewards/concensus_correctness_reward_func": 1.963062521070242,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.8848253078758717,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 0.7908749878406525,
"step": 24
},
{
"completion_length": 151.46875,
"epoch": 0.7428571428571429,
"grad_norm": 14.245716094970703,
"kl": 2.350977373425849,
"learning_rate": 9.765191054744304e-07,
"loss": 0.0024,
"reward": 5.939617916941643,
"reward_std": 0.934625256806612,
"rewards/concensus_correctness_reward_func": 1.749625001102686,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.8888366278260946,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.40625,
"rewards/xmlcount_reward_func": 1.082406248897314,
"step": 26
},
{
"completion_length": 128.34375,
"epoch": 0.8,
"grad_norm": 48335.97265625,
"kl": 1940.711479806574,
"learning_rate": 9.713657749932171e-07,
"loss": 1.9407,
"reward": 6.796030431985855,
"reward_std": 0.0927647277712822,
"rewards/concensus_correctness_reward_func": 1.9866250082850456,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9695616886019707,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 28
},
{
"completion_length": 122.46875,
"epoch": 0.8571428571428571,
"grad_norm": 0.2548629343509674,
"kl": 22700.092740163207,
"learning_rate": 9.657180469054212e-07,
"loss": 22.7001,
"reward": 7.125140815973282,
"reward_std": 0.32153427973389626,
"rewards/concensus_correctness_reward_func": 2.1701249927282333,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9667346738278866,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.20703125,
"step": 30
},
{
"completion_length": 156.9375,
"epoch": 0.9142857142857143,
"grad_norm": 1398.3612060546875,
"kl": 123.12271721323486,
"learning_rate": 9.59581844897906e-07,
"loss": 0.1231,
"reward": 7.055217877030373,
"reward_std": 0.24068230390548706,
"rewards/concensus_correctness_reward_func": 2.1315624937415123,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9392804062226787,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.21875,
"step": 32
},
{
"completion_length": 240.0,
"epoch": 0.9714285714285714,
"grad_norm": 186.86839294433594,
"kl": 152.3791024107486,
"learning_rate": 9.529636049992233e-07,
"loss": 0.1524,
"reward": 6.90338921546936,
"reward_std": 1.3376336731016636,
"rewards/concensus_correctness_reward_func": 2.182187505066395,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.8657329957932234,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.421875,
"rewards/xmlcount_reward_func": 1.12109375,
"step": 34
},
{
"completion_length": 247.90625,
"epoch": 1.0285714285714285,
"grad_norm": 326.3687438964844,
"kl": 219.12903738673776,
"learning_rate": 9.458702688291071e-07,
"loss": 0.2191,
"reward": 5.31565772742033,
"reward_std": 0.9229803088819608,
"rewards/concensus_correctness_reward_func": 1.3946250043809414,
"rewards/consensus_reward_func": 1.5625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.8155639320611954,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.359375,
"rewards/xmlcount_reward_func": 1.05859375,
"step": 36
},
{
"completion_length": 211.09375,
"epoch": 1.0857142857142856,
"grad_norm": 23.403614044189453,
"kl": 3.9675100842723623,
"learning_rate": 9.383092763176738e-07,
"loss": 0.004,
"reward": 6.58778091520071,
"reward_std": 1.721142528578639,
"rewards/concensus_correctness_reward_func": 2.0522499941289425,
"rewards/consensus_reward_func": 1.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.8519372157752514,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.40625,
"rewards/xmlcount_reward_func": 1.08984375,
"step": 38
},
{
"completion_length": 218.0625,
"epoch": 1.1428571428571428,
"grad_norm": 327.6996154785156,
"kl": 0.364447561558336,
"learning_rate": 9.302885579019626e-07,
"loss": 0.0004,
"reward": 6.55630399286747,
"reward_std": 1.1096585169434547,
"rewards/concensus_correctness_reward_func": 1.981812495738268,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.8752728328108788,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.13671875,
"step": 40
},
{
"completion_length": 522.21875,
"epoch": 1.2,
"grad_norm": 47.249237060546875,
"kl": 211.27693609474227,
"learning_rate": 9.218165262080022e-07,
"loss": 0.2113,
"reward": 3.5790372733026743,
"reward_std": 1.652490053035656,
"rewards/concensus_correctness_reward_func": 0.9664374999701977,
"rewards/consensus_reward_func": 0.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.5032248190109385,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.234375,
"rewards/xmlcount_reward_func": 0.75,
"step": 42
},
{
"completion_length": 399.5,
"epoch": 1.2571428571428571,
"grad_norm": 46957.8125,
"kl": 1240.154023682233,
"learning_rate": 9.129020672271281e-07,
"loss": 1.2402,
"reward": 4.165122143924236,
"reward_std": 2.677976368338932,
"rewards/concensus_correctness_reward_func": 1.1625000014901161,
"rewards/consensus_reward_func": 1.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.6002783491458104,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.296875,
"rewards/xmlcount_reward_func": 0.85546875,
"step": 44
},
{
"completion_length": 348.71875,
"epoch": 1.3142857142857143,
"grad_norm": 197.34835815429688,
"kl": 1697.7357009318657,
"learning_rate": 9.035545309958046e-07,
"loss": 1.6977,
"reward": 5.216266397386789,
"reward_std": 2.5298230523912935,
"rewards/concensus_correctness_reward_func": 1.415687508881092,
"rewards/consensus_reward_func": 1.375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.7302663810260128,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.328125,
"rewards/xmlcount_reward_func": 0.9921875,
"step": 46
},
{
"completion_length": 427.28125,
"epoch": 1.3714285714285714,
"grad_norm": 107.24671936035156,
"kl": 0.5033836024813354,
"learning_rate": 8.937837217887272e-07,
"loss": 0.0005,
"reward": 3.593818176537752,
"reward_std": 1.2321268621553827,
"rewards/concensus_correctness_reward_func": 0.9172500036656857,
"rewards/consensus_reward_func": 1.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.5789119017135818,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.234375,
"rewards/xmlcount_reward_func": 0.80078125,
"step": 48
},
{
"completion_length": 406.6875,
"epoch": 1.4285714285714286,
"grad_norm": 2652.8984375,
"kl": 21193.798785352148,
"learning_rate": 8.83599887835493e-07,
"loss": 21.1938,
"reward": 3.9688764177262783,
"reward_std": 3.281522080527793,
"rewards/concensus_correctness_reward_func": 1.0411249957978725,
"rewards/consensus_reward_func": 1.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.6308764494024217,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.859375,
"step": 50
},
{
"completion_length": 456.0,
"epoch": 1.4857142857142858,
"grad_norm": 9026.6044921875,
"kl": 1415.1057826047763,
"learning_rate": 8.73013710571623e-07,
"loss": 1.4151,
"reward": 3.0648840237408876,
"reward_std": 1.9786420244963665,
"rewards/concensus_correctness_reward_func": 0.8255000002682209,
"rewards/consensus_reward_func": 0.625,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.45813402088242583,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.15625,
"rewards/xmlcount_reward_func": 0.6875,
"step": 52
},
{
"completion_length": 484.0,
"epoch": 1.5428571428571427,
"grad_norm": 139.81741333007812,
"kl": 0.6134039051830769,
"learning_rate": 8.620362934352108e-07,
"loss": 0.0006,
"reward": 4.179860107600689,
"reward_std": 1.8562917799558782,
"rewards/concensus_correctness_reward_func": 1.2471874989569187,
"rewards/consensus_reward_func": 1.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.5654851646249881,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.8046875,
"step": 54
},
{
"completion_length": 408.9375,
"epoch": 1.6,
"grad_norm": 102.32249450683594,
"kl": 3008801003.4072285,
"learning_rate": 8.506791502209496e-07,
"loss": 3008801.25,
"reward": 3.813065191730857,
"reward_std": 1.33644521248425,
"rewards/concensus_correctness_reward_func": 1.0011250115931034,
"rewards/consensus_reward_func": 0.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.5424089302105131,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.203125,
"rewards/xmlcount_reward_func": 0.81640625,
"step": 56
},
{
"completion_length": 376.25,
"epoch": 1.657142857142857,
"grad_norm": 24.86591911315918,
"kl": 76.24003965221345,
"learning_rate": 8.389541930037516e-07,
"loss": 0.0762,
"reward": 3.775482662022114,
"reward_std": 1.995910257101059,
"rewards/concensus_correctness_reward_func": 0.9166250079870224,
"rewards/consensus_reward_func": 1.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.6908889040350914,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.25,
"rewards/xmlcount_reward_func": 0.91796875,
"step": 58
},
{
"completion_length": 184.46875,
"epoch": 1.7142857142857144,
"grad_norm": 748.4888916015625,
"kl": 16.552064943592995,
"learning_rate": 8.268737196446263e-07,
"loss": 0.0166,
"reward": 5.196034669876099,
"reward_std": 2.2851073294878006,
"rewards/concensus_correctness_reward_func": 1.2321875053457916,
"rewards/consensus_reward_func": 1.1875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.8818159140646458,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.296875,
"rewards/xmlcount_reward_func": 1.09765625,
"step": 60
},
{
"completion_length": 354.375,
"epoch": 1.7714285714285714,
"grad_norm": 11774.6318359375,
"kl": 594.4727419780102,
"learning_rate": 8.144504008919222e-07,
"loss": 0.5945,
"reward": 5.226467318832874,
"reward_std": 1.8939718978672317,
"rewards/concensus_correctness_reward_func": 1.5290624983608723,
"rewards/consensus_reward_func": 1.3125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.7208423566626152,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.328125,
"rewards/xmlcount_reward_func": 0.9609375,
"step": 62
},
{
"completion_length": 239.03125,
"epoch": 1.8285714285714287,
"grad_norm": 63723.67578125,
"kl": 2681.7918725676136,
"learning_rate": 8.016972670914623e-07,
"loss": 2.6818,
"reward": 5.760847687721252,
"reward_std": 1.1207159195910208,
"rewards/concensus_correctness_reward_func": 1.6048124991357327,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.8435351252555847,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.125,
"step": 64
},
{
"completion_length": 166.84375,
"epoch": 1.8857142857142857,
"grad_norm": 0.016271423548460007,
"kl": 0.16761540318839252,
"learning_rate": 7.886276945195097e-07,
"loss": 0.0002,
"reward": 6.907515615224838,
"reward_std": 0.27857801198842935,
"rewards/concensus_correctness_reward_func": 2.0479375049471855,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9689531102776527,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.21875,
"step": 66
},
{
"completion_length": 166.0625,
"epoch": 1.9428571428571428,
"grad_norm": 1.8111671209335327,
"kl": 0.17472450132481754,
"learning_rate": 7.752553913529018e-07,
"loss": 0.0002,
"reward": 7.087520241737366,
"reward_std": 0.6711924958362943,
"rewards/concensus_correctness_reward_func": 2.3121249973773956,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9998952522873878,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 0.7911249995231628,
"step": 68
},
{
"completion_length": 155.53125,
"epoch": 2.0,
"grad_norm": 0.02920331247150898,
"kl": 0.18397854757495224,
"learning_rate": 7.61594383291065e-07,
"loss": 0.0002,
"reward": 6.990590900182724,
"reward_std": 0.46470576524734497,
"rewards/concensus_correctness_reward_func": 2.11124999076128,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.9691846631467342,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.20703125,
"step": 70
},
{
"completion_length": 136.96875,
"epoch": 2.057142857142857,
"grad_norm": 0.012207652442157269,
"kl": 0.12467939942143857,
"learning_rate": 7.476589988449938e-07,
"loss": 0.0001,
"reward": 7.336249977350235,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.2112499997019768,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 72
},
{
"completion_length": 140.25,
"epoch": 2.1142857142857143,
"grad_norm": 1.8709704875946045,
"kl": 0.14828130067326128,
"learning_rate": 7.334638543086203e-07,
"loss": 0.0001,
"reward": 7.094191342592239,
"reward_std": 8.292392158182338e-05,
"rewards/concensus_correctness_reward_func": 2.0942500084638596,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9999413713812828,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 74
},
{
"completion_length": 158.71875,
"epoch": 2.1714285714285713,
"grad_norm": 0.01471527200192213,
"kl": 0.20862194756045938,
"learning_rate": 7.190238384283412e-07,
"loss": 0.0002,
"reward": 6.877131998538971,
"reward_std": 0.7933639287948608,
"rewards/concensus_correctness_reward_func": 2.067374996840954,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.9687882512807846,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 0.9815937429666519,
"step": 76
},
{
"completion_length": 188.125,
"epoch": 2.2285714285714286,
"grad_norm": 1.460219144821167,
"kl": 4.615653241518885,
"learning_rate": 7.043540967867781e-07,
"loss": 0.0046,
"reward": 6.435562700033188,
"reward_std": 0.9147683555056574,
"rewards/concensus_correctness_reward_func": 1.9740000180900097,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9308439530432224,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 0.9369687438011169,
"step": 78
},
{
"completion_length": 167.875,
"epoch": 2.2857142857142856,
"grad_norm": 5.986639976501465,
"kl": 97.31889040162787,
"learning_rate": 6.894700159171534e-07,
"loss": 0.0973,
"reward": 6.660285115242004,
"reward_std": 0.38862577243708074,
"rewards/concensus_correctness_reward_func": 1.9778125062584877,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9363788738846779,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.21484375,
"step": 80
},
{
"completion_length": 132.59375,
"epoch": 2.342857142857143,
"grad_norm": 140.21267700195312,
"kl": 11.938221657648683,
"learning_rate": 6.743872071649411e-07,
"loss": 0.0119,
"reward": 7.314000904560089,
"reward_std": 0.03995019569993019,
"rewards/concensus_correctness_reward_func": 2.2172499895095825,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9912821874022484,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 82
},
{
"completion_length": 177.125,
"epoch": 2.4,
"grad_norm": 0.013902968727052212,
"kl": 1.2978905094787478,
"learning_rate": 6.59121490313722e-07,
"loss": 0.0013,
"reward": 6.766894176602364,
"reward_std": 0.4406161531805992,
"rewards/concensus_correctness_reward_func": 2.1780624948441982,
"rewards/consensus_reward_func": 1.75,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.938206740480382,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 0.947500005364418,
"step": 84
},
{
"completion_length": 130.09375,
"epoch": 2.4571428571428573,
"grad_norm": 9.800251960754395,
"kl": 0.22617360670119524,
"learning_rate": 6.436888769924141e-07,
"loss": 0.0002,
"reward": 7.254179358482361,
"reward_std": 0.013411822263151407,
"rewards/concensus_correctness_reward_func": 2.1394999995827675,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9896793477237225,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 86
},
{
"completion_length": 168.96875,
"epoch": 2.5142857142857142,
"grad_norm": 1.437393307685852,
"kl": 0.18564400169998407,
"learning_rate": 6.281055538812861e-07,
"loss": 0.0002,
"reward": 7.096096932888031,
"reward_std": 0.3263694606721401,
"rewards/concensus_correctness_reward_func": 2.0173750072717667,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9615344516932964,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2109375,
"step": 88
},
{
"completion_length": 225.4375,
"epoch": 2.571428571428571,
"grad_norm": 143.41500854492188,
"kl": 8.297026936896145,
"learning_rate": 6.123878657343647e-07,
"loss": 0.0083,
"reward": 6.617781460285187,
"reward_std": 1.0305635929107666,
"rewards/concensus_correctness_reward_func": 1.9847500026226044,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.9064690098166466,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.1484375,
"step": 90
},
{
"completion_length": 155.125,
"epoch": 2.6285714285714286,
"grad_norm": 29.54805564880371,
"kl": 1.2011672258377075,
"learning_rate": 5.96552298236044e-07,
"loss": 0.0012,
"reward": 7.490390375256538,
"reward_std": 0.510272353887558,
"rewards/concensus_correctness_reward_func": 2.4165625013411045,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5625,
"rewards/question_recreation_reward_func": 0.9136716090142727,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.20703125,
"step": 92
},
{
"completion_length": 183.28125,
"epoch": 2.685714285714286,
"grad_norm": 260070128.0,
"kl": 8012920.115434824,
"learning_rate": 5.806154607098799e-07,
"loss": 8012.9199,
"reward": 6.427686184644699,
"reward_std": 0.5588814318180084,
"rewards/concensus_correctness_reward_func": 1.8334374986588955,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9379986636340618,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.1875,
"step": 94
},
{
"completion_length": 161.90625,
"epoch": 2.742857142857143,
"grad_norm": 165.2026824951172,
"kl": 1.9255495527759194,
"learning_rate": 5.645940686977032e-07,
"loss": 0.0019,
"reward": 6.643823355436325,
"reward_std": 0.2799105942249298,
"rewards/concensus_correctness_reward_func": 1.9093125015497208,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9688858352601528,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.21875,
"step": 96
},
{
"completion_length": 216.78125,
"epoch": 2.8,
"grad_norm": 15576.0224609375,
"kl": 3184225.6172290286,
"learning_rate": 5.485049264273241e-07,
"loss": 3184.2256,
"reward": 6.616672560572624,
"reward_std": 1.0126504600048065,
"rewards/concensus_correctness_reward_func": 1.9756249897181988,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.9066725894808769,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.15625,
"step": 98
},
{
"completion_length": 155.625,
"epoch": 2.857142857142857,
"grad_norm": 43.54243850708008,
"kl": 17.29876364581287,
"learning_rate": 5.323649091872178e-07,
"loss": 0.0173,
"reward": 7.1484761238098145,
"reward_std": 0.29683050513267517,
"rewards/concensus_correctness_reward_func": 2.252500005066395,
"rewards/consensus_reward_func": 1.8125,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9389448185684159,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.20703125,
"step": 100
},
{
"completion_length": 196.84375,
"epoch": 2.914285714285714,
"grad_norm": 1101.1700439453125,
"kl": 313.8222270826809,
"learning_rate": 5.16190945626678e-07,
"loss": 0.3138,
"reward": 6.691585049033165,
"reward_std": 0.5840071098791668,
"rewards/concensus_correctness_reward_func": 1.989124983549118,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9368351008743048,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.1875,
"step": 102
},
{
"completion_length": 153.96875,
"epoch": 2.9714285714285715,
"grad_norm": 1.4117786884307861,
"kl": 0.2296100074891001,
"learning_rate": 5e-07,
"loss": 0.0002,
"reward": 7.403991624712944,
"reward_std": 0.28745076060295105,
"rewards/concensus_correctness_reward_func": 2.2985000126063824,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9687728695571423,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.21484375,
"step": 104
},
{
"completion_length": 154.875,
"epoch": 3.0285714285714285,
"grad_norm": 0.05570273473858833,
"kl": 295.5748745780438,
"learning_rate": 4.838090543733221e-07,
"loss": 0.2956,
"reward": 7.4088806957006454,
"reward_std": 0.2800062551832525,
"rewards/concensus_correctness_reward_func": 2.299625001847744,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9686306864023209,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.21875,
"step": 106
},
{
"completion_length": 164.65625,
"epoch": 3.085714285714286,
"grad_norm": 0.04709651321172714,
"kl": 0.1969663049094379,
"learning_rate": 4.676350908127821e-07,
"loss": 0.0002,
"reward": 7.149936303496361,
"reward_std": 0.28781798481941223,
"rewards/concensus_correctness_reward_func": 2.1696874983608723,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9685300551354885,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.21484375,
"step": 108
},
{
"completion_length": 139.78125,
"epoch": 3.142857142857143,
"grad_norm": 0.015770502388477325,
"kl": 0.20569787896238267,
"learning_rate": 4.5149507357267597e-07,
"loss": 0.0002,
"reward": 7.575696915388107,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.3559999987483025,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.969696968793869,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 110
},
{
"completion_length": 157.21875,
"epoch": 3.2,
"grad_norm": 50861.9453125,
"kl": 6200.607699844055,
"learning_rate": 4.354059313022969e-07,
"loss": 6.2006,
"reward": 7.015130370855331,
"reward_std": 0.45689108967781067,
"rewards/concensus_correctness_reward_func": 2.093562498688698,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.9684428572654724,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.21875,
"step": 112
},
{
"completion_length": 164.0,
"epoch": 3.257142857142857,
"grad_norm": 0.013901927508413792,
"kl": 6807.066189021803,
"learning_rate": 4.193845392901201e-07,
"loss": 6.8071,
"reward": 6.620099663734436,
"reward_std": 0.3450772762298584,
"rewards/concensus_correctness_reward_func": 2.0902499929070473,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.937474632402882,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 0.9986249953508377,
"step": 114
},
{
"completion_length": 140.21875,
"epoch": 3.314285714285714,
"grad_norm": 0.050351180136203766,
"kl": 0.195719227893278,
"learning_rate": 4.0344770176395606e-07,
"loss": 0.0002,
"reward": 7.224750071763992,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.224750004708767,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 116
},
{
"completion_length": 163.25,
"epoch": 3.3714285714285714,
"grad_norm": 21.72372817993164,
"kl": 2.1254541873931885,
"learning_rate": 3.8761213426563543e-07,
"loss": 0.0021,
"reward": 7.039299890398979,
"reward_std": 0.43868909776210785,
"rewards/concensus_correctness_reward_func": 2.16687498614192,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.9388312064111233,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.21484375,
"step": 118
},
{
"completion_length": 159.15625,
"epoch": 3.4285714285714284,
"grad_norm": 0.3591388761997223,
"kl": 6.059129260480404,
"learning_rate": 3.718944461187138e-07,
"loss": 0.0061,
"reward": 6.463347539305687,
"reward_std": 0.3400730788707733,
"rewards/concensus_correctness_reward_func": 1.9808750078082085,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9365975013934076,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 0.9521249979734421,
"step": 120
},
{
"completion_length": 123.65625,
"epoch": 3.4857142857142858,
"grad_norm": 194.1432647705078,
"kl": 57.89645641669631,
"learning_rate": 3.563111230075859e-07,
"loss": 0.0579,
"reward": 7.029898107051849,
"reward_std": 0.3636358277872205,
"rewards/concensus_correctness_reward_func": 2.115187507122755,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.914116925559938,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.18809375166893,
"step": 122
},
{
"completion_length": 148.8125,
"epoch": 3.5428571428571427,
"grad_norm": 14.622519493103027,
"kl": 0.3278482835739851,
"learning_rate": 3.408785096862782e-07,
"loss": 0.0003,
"reward": 6.669890329241753,
"reward_std": 0.951744182035327,
"rewards/concensus_correctness_reward_func": 2.159937519580126,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9516402631998062,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 0.7770625054836273,
"step": 124
},
{
"completion_length": 138.34375,
"epoch": 3.6,
"grad_norm": 3.0443975925445557,
"kl": 278.0337795561645,
"learning_rate": 3.2561279283505884e-07,
"loss": 0.278,
"reward": 7.347518771886826,
"reward_std": 0.10661890726260026,
"rewards/concensus_correctness_reward_func": 2.1738749966025352,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9431750550866127,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 126
},
{
"completion_length": 133.65625,
"epoch": 3.657142857142857,
"grad_norm": 63.743953704833984,
"kl": 141.1469784581568,
"learning_rate": 3.105299840828466e-07,
"loss": 0.1411,
"reward": 6.800521522760391,
"reward_std": 0.010683320462703705,
"rewards/concensus_correctness_reward_func": 1.9629999995231628,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9625215027481318,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 128
},
{
"completion_length": 136.46875,
"epoch": 3.7142857142857144,
"grad_norm": 8.97818660736084,
"kl": 0.5335320448502898,
"learning_rate": 2.95645903213222e-07,
"loss": 0.0005,
"reward": 7.278413146734238,
"reward_std": 0.07083342224359512,
"rewards/concensus_correctness_reward_func": 2.2035000026226044,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9694444462656975,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 130
},
{
"completion_length": 129.1875,
"epoch": 3.7714285714285714,
"grad_norm": 2.273700714111328,
"kl": 0.1980545329861343,
"learning_rate": 2.8097616157165885e-07,
"loss": 0.0002,
"reward": 7.1327812522649765,
"reward_std": 0.10708247870206833,
"rewards/concensus_correctness_reward_func": 2.1523124910891056,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 1.0,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 132
},
{
"completion_length": 136.46875,
"epoch": 3.8285714285714287,
"grad_norm": 3.4171736240386963,
"kl": 0.1759730235207826,
"learning_rate": 2.665361456913797e-07,
"loss": 0.0002,
"reward": 7.524241715669632,
"reward_std": 0.012173316441476345,
"rewards/concensus_correctness_reward_func": 2.2848750203847885,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.989366702735424,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 134
},
{
"completion_length": 132.6875,
"epoch": 3.8857142857142857,
"grad_norm": 21.402355194091797,
"kl": 0.2659228784032166,
"learning_rate": 2.523410011550064e-07,
"loss": 0.0003,
"reward": 6.992657542228699,
"reward_std": 0.04131975769996643,
"rewards/concensus_correctness_reward_func": 2.021875023841858,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9707825183868408,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 136
},
{
"completion_length": 129.03125,
"epoch": 3.942857142857143,
"grad_norm": 132813709312.0,
"kl": 6803573266.517654,
"learning_rate": 2.3840561670893495e-07,
"loss": 6803573.5,
"reward": 6.834182530641556,
"reward_std": 0.2304869929794222,
"rewards/concensus_correctness_reward_func": 2.089499995112419,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.8875887226313353,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.1695937439799309,
"step": 138
},
{
"completion_length": 140.8125,
"epoch": 4.0,
"grad_norm": 0.022691868245601654,
"kl": 26381.484041058226,
"learning_rate": 2.247446086470982e-07,
"loss": 26.3815,
"reward": 7.248510301113129,
"reward_std": 0.0460367277264595,
"rewards/concensus_correctness_reward_func": 2.21637499332428,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.969635296612978,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 140
},
{
"completion_length": 157.53125,
"epoch": 4.057142857142857,
"grad_norm": 0.009256873279809952,
"kl": 3.527690098620951,
"learning_rate": 2.113723054804904e-07,
"loss": 0.0035,
"reward": 7.401162892580032,
"reward_std": 0.6326838135719299,
"rewards/concensus_correctness_reward_func": 2.4743749871850014,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.9991629458963871,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 0.8182500004768372,
"step": 142
},
{
"completion_length": 129.03125,
"epoch": 4.114285714285714,
"grad_norm": 7.365884304046631,
"kl": 0.2767415994312614,
"learning_rate": 1.9830273290853766e-07,
"loss": 0.0003,
"reward": 7.2931163012981415,
"reward_std": 0.07107648908277042,
"rewards/concensus_correctness_reward_func": 2.218375027179718,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9692725799977779,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 144
},
{
"completion_length": 156.8125,
"epoch": 4.171428571428572,
"grad_norm": 4.549909591674805,
"kl": 2098.400880107074,
"learning_rate": 1.8554959910807772e-07,
"loss": 2.0984,
"reward": 6.6754628121852875,
"reward_std": 0.5301313251256943,
"rewards/concensus_correctness_reward_func": 2.098374992609024,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.9207440502941608,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.0000937432050705,
"step": 146
},
{
"completion_length": 161.1875,
"epoch": 4.228571428571429,
"grad_norm": 130.0556640625,
"kl": 4.999799037585035,
"learning_rate": 1.7312628035537386e-07,
"loss": 0.005,
"reward": 7.465287238359451,
"reward_std": 0.6660420118496404,
"rewards/concensus_correctness_reward_func": 2.3166875019669533,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.625,
"rewards/question_recreation_reward_func": 0.9381621927022934,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.194812498986721,
"step": 148
},
{
"completion_length": 134.40625,
"epoch": 4.285714285714286,
"grad_norm": 8.668111801147461,
"kl": 641.8768360905815,
"learning_rate": 1.6104580699624837e-07,
"loss": 0.6419,
"reward": 7.1231569945812225,
"reward_std": 0.0297078593284823,
"rewards/concensus_correctness_reward_func": 2.342249996960163,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.8770633104722947,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.2163437493145466,
"step": 150
},
{
"completion_length": 122.125,
"epoch": 4.3428571428571425,
"grad_norm": 6.391092777252197,
"kl": 0.20541435782797635,
"learning_rate": 1.493208497790504e-07,
"loss": 0.0002,
"reward": 6.966011185199022,
"reward_std": 0.00430500041693449,
"rewards/concensus_correctness_reward_func": 2.106500007212162,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9532612152397633,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.1875,
"step": 152
},
{
"completion_length": 127.3125,
"epoch": 4.4,
"grad_norm": 43.81622314453125,
"kl": 5.563992372946814,
"learning_rate": 1.3796370656478934e-07,
"loss": 0.0056,
"reward": 6.806715875864029,
"reward_std": 0.06279254704713821,
"rewards/concensus_correctness_reward_func": 1.9822500199079514,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9689970314502716,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 154
},
{
"completion_length": 130.46875,
"epoch": 4.457142857142857,
"grad_norm": 122.86102294921875,
"kl": 20.539323112927377,
"learning_rate": 1.2698628942837697e-07,
"loss": 0.0205,
"reward": 6.735322088003159,
"reward_std": 0.23215299472212791,
"rewards/concensus_correctness_reward_func": 2.086000010371208,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.1875,
"rewards/question_recreation_reward_func": 0.896103395964019,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2219687476754189,
"step": 156
},
{
"completion_length": 132.90625,
"epoch": 4.514285714285714,
"grad_norm": 69.87224578857422,
"kl": 0.6354921485763043,
"learning_rate": 1.1640011216450691e-07,
"loss": 0.0006,
"reward": 7.233689934015274,
"reward_std": 0.514858566224575,
"rewards/concensus_correctness_reward_func": 2.2895624935626984,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.8818775303661823,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.1872499994933605,
"step": 158
},
{
"completion_length": 133.59375,
"epoch": 4.571428571428571,
"grad_norm": 8.249082565307617,
"kl": 0.26116269128397107,
"learning_rate": 1.0621627821127288e-07,
"loss": 0.0003,
"reward": 7.30239263176918,
"reward_std": 0.0416584275662899,
"rewards/concensus_correctness_reward_func": 2.208875000476837,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9685175716876984,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 160
},
{
"completion_length": 136.1875,
"epoch": 4.628571428571428,
"grad_norm": 0.024719232693314552,
"kl": 0.7720870058983564,
"learning_rate": 9.644546900419531e-08,
"loss": 0.0008,
"reward": 6.74112144112587,
"reward_std": 0.14779043197631836,
"rewards/concensus_correctness_reward_func": 1.97062499076128,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9422151371836662,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.2189062498509884,
"step": 162
},
{
"completion_length": 132.28125,
"epoch": 4.685714285714286,
"grad_norm": 99.90188598632812,
"kl": 5.756765312515199,
"learning_rate": 8.70979327728718e-08,
"loss": 0.0058,
"reward": 7.076995253562927,
"reward_std": 0.3961862847208977,
"rewards/concensus_correctness_reward_func": 2.1743750162422657,
"rewards/consensus_reward_func": 1.9375,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9299640282988548,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.20703125,
"step": 164
},
{
"completion_length": 133.46875,
"epoch": 4.742857142857143,
"grad_norm": 1942.1475830078125,
"kl": 80.78691061586142,
"learning_rate": 7.81834737919978e-08,
"loss": 0.0808,
"reward": 7.304515153169632,
"reward_std": 0.11349444479128579,
"rewards/concensus_correctness_reward_func": 2.350000001490116,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.8892338592559099,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2215312495827675,
"step": 166
},
{
"completion_length": 131.75,
"epoch": 4.8,
"grad_norm": 36.11498260498047,
"kl": 35.66279458301142,
"learning_rate": 6.971144209803736e-08,
"loss": 0.0357,
"reward": 7.376064032316208,
"reward_std": 0.1673837215421372,
"rewards/concensus_correctness_reward_func": 2.3438749983906746,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.8862515506334603,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.2084374986588955,
"step": 168
},
{
"completion_length": 159.9375,
"epoch": 4.857142857142857,
"grad_norm": 1420.3355712890625,
"kl": 5809.957044942072,
"learning_rate": 6.16907236823262e-08,
"loss": 5.81,
"reward": 6.121835008263588,
"reward_std": 0.37124670308548957,
"rewards/concensus_correctness_reward_func": 1.8349375016987324,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0625,
"rewards/question_recreation_reward_func": 0.7878349621314555,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.40625,
"rewards/xmlcount_reward_func": 1.1553125008940697,
"step": 170
},
{
"completion_length": 139.1875,
"epoch": 4.914285714285715,
"grad_norm": 10.688383102416992,
"kl": 0.22011687979102135,
"learning_rate": 5.412973117089287e-08,
"loss": 0.0002,
"reward": 7.198920458555222,
"reward_std": 0.20292456448078156,
"rewards/concensus_correctness_reward_func": 2.0933750048279762,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.9375766552984715,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 172
},
{
"completion_length": 136.0,
"epoch": 4.9714285714285715,
"grad_norm": 11.03411865234375,
"kl": 1.326516842469573,
"learning_rate": 4.703639500077655e-08,
"loss": 0.0013,
"reward": 6.875307530164719,
"reward_std": 0.13279777020215988,
"rewards/concensus_correctness_reward_func": 2.067124992609024,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9099949998781085,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.1950624994933605,
"step": 174
},
{
"completion_length": 137.59375,
"epoch": 5.0285714285714285,
"grad_norm": 130.2829132080078,
"kl": 10.879363138461486,
"learning_rate": 4.041815510209395e-08,
"loss": 0.0109,
"reward": 6.4715642631053925,
"reward_std": 0.063371941447258,
"rewards/concensus_correctness_reward_func": 1.7663749903440475,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.0,
"rewards/question_recreation_reward_func": 0.9747205302119255,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 176
},
{
"completion_length": 133.40625,
"epoch": 5.085714285714285,
"grad_norm": 34.99993133544922,
"kl": 2.79548569873441,
"learning_rate": 3.4281953094578875e-08,
"loss": 0.0028,
"reward": 7.421046018600464,
"reward_std": 0.10142664304294158,
"rewards/concensus_correctness_reward_func": 2.3401249796152115,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9102647739928216,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.2175312489271164,
"step": 178
},
{
"completion_length": 167.1875,
"epoch": 5.142857142857143,
"grad_norm": 3914.157470703125,
"kl": 408.69288858864456,
"learning_rate": 2.8634225006782864e-08,
"loss": 0.4087,
"reward": 6.723982572555542,
"reward_std": 0.15523457527160645,
"rewards/concensus_correctness_reward_func": 1.9587500020861626,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.125,
"rewards/question_recreation_reward_func": 0.9449200928211212,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.2265625,
"step": 180
},
{
"completion_length": 128.625,
"epoch": 5.2,
"grad_norm": 40.13917922973633,
"kl": 16038.165946810506,
"learning_rate": 2.348089452556956e-08,
"loss": 16.0382,
"reward": 6.769445240497589,
"reward_std": 0.5420721787959337,
"rewards/concensus_correctness_reward_func": 2.2128750011324883,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.8193827569484711,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.421875,
"rewards/xmlcount_reward_func": 1.190312497317791,
"step": 182
},
{
"completion_length": 135.5625,
"epoch": 5.257142857142857,
"grad_norm": 0.020569220185279846,
"kl": 0.19448763993568718,
"learning_rate": 1.882736678298491e-08,
"loss": 0.0002,
"reward": 7.055787056684494,
"reward_std": 0.03314562886953354,
"rewards/concensus_correctness_reward_func": 2.081250011920929,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.9979745373129845,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.2421875,
"step": 184
},
{
"completion_length": 156.5,
"epoch": 5.314285714285714,
"grad_norm": 19.919050216674805,
"kl": 14.24597706948407,
"learning_rate": 1.4678522687020412e-08,
"loss": 0.0142,
"reward": 6.925608813762665,
"reward_std": 0.1697275247424841,
"rewards/concensus_correctness_reward_func": 2.210874982178211,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.8500463847303763,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.1771874986588955,
"step": 186
},
{
"completion_length": 128.25,
"epoch": 5.371428571428572,
"grad_norm": 677.8944702148438,
"kl": 176.0574713665992,
"learning_rate": 1.1038713802214717e-08,
"loss": 0.1761,
"reward": 6.819926559925079,
"reward_std": 0.1877051831688732,
"rewards/concensus_correctness_reward_func": 2.0945000126957893,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.7840202623046935,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.453125,
"rewards/xmlcount_reward_func": 1.23828125,
"step": 188
},
{
"completion_length": 128.46875,
"epoch": 5.428571428571429,
"grad_norm": 0.06580457091331482,
"kl": 0.8888655919581652,
"learning_rate": 7.91175778546288e-09,
"loss": 0.0009,
"reward": 7.360000848770142,
"reward_std": 0.15114279091358185,
"rewards/concensus_correctness_reward_func": 2.34187500923872,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.4375,
"rewards/question_recreation_reward_func": 0.975157156586647,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.24609375,
"step": 190
},
{
"completion_length": 133.9375,
"epoch": 5.485714285714286,
"grad_norm": 9619.09375,
"kl": 433.19152829330415,
"learning_rate": 5.3009343818219975e-09,
"loss": 0.4332,
"reward": 7.0666501224040985,
"reward_std": 0.1772715449333191,
"rewards/concensus_correctness_reward_func": 2.06700000166893,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.9739313460886478,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.2288437485694885,
"step": 192
},
{
"completion_length": 163.15625,
"epoch": 5.542857142857143,
"grad_norm": 17.19892120361328,
"kl": 1.2293917203787714,
"learning_rate": 3.2089819845111944e-09,
"loss": 0.0012,
"reward": 7.017405599355698,
"reward_std": 0.27922892197966576,
"rewards/concensus_correctness_reward_func": 2.2905624993145466,
"rewards/consensus_reward_func": 1.875,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.3125,
"rewards/question_recreation_reward_func": 0.9062805884168483,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.4375,
"rewards/xmlcount_reward_func": 1.1955625005066395,
"step": 194
},
{
"completion_length": 137.96875,
"epoch": 5.6,
"grad_norm": 0.01611829362809658,
"kl": 0.16643174993805587,
"learning_rate": 1.638094762715314e-09,
"loss": 0.0002,
"reward": 7.399079352617264,
"reward_std": 0.0,
"rewards/concensus_correctness_reward_func": 2.149375006556511,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.5,
"rewards/question_recreation_reward_func": 0.9997043535113335,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.5,
"rewards/xmlcount_reward_func": 1.25,
"step": 196
},
{
"completion_length": 158.21875,
"epoch": 5.6571428571428575,
"grad_norm": 2546.03564453125,
"kl": 279.96135277603753,
"learning_rate": 5.899203602046654e-10,
"loss": 0.28,
"reward": 6.826089903712273,
"reward_std": 0.3347325325012207,
"rewards/concensus_correctness_reward_func": 2.163374997675419,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.25,
"rewards/question_recreation_reward_func": 0.938058597035706,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.46875,
"rewards/xmlcount_reward_func": 1.0059062540531158,
"step": 198
},
{
"completion_length": 155.625,
"epoch": 5.714285714285714,
"grad_norm": 32.50419616699219,
"kl": 3.371709798462689,
"learning_rate": 6.555816718389895e-11,
"loss": 0.0034,
"reward": 7.258530080318451,
"reward_std": 0.13536836579442024,
"rewards/concensus_correctness_reward_func": 2.2292499989271164,
"rewards/consensus_reward_func": 2.0,
"rewards/cumulative_reward_2": 0.0,
"rewards/final_correctness_reward_func": 0.375,
"rewards/question_recreation_reward_func": 0.9394363649189472,
"rewards/soft_format_reward_func": 0.0,
"rewards/strict_format_reward_func": 0.484375,
"rewards/xmlcount_reward_func": 1.23046875,
"step": 200
},
{
"epoch": 5.714285714285714,
"step": 200,
"total_flos": 0.0,
"train_loss": 98745.44226059376,
"train_runtime": 1459.4013,
"train_samples_per_second": 2.193,
"train_steps_per_second": 0.137
}
],
"logging_steps": 2,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}