nbd22's picture
Training in progress, step 233, checkpoint
9d3d93e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9967914438502674,
"eval_steps": 500,
"global_step": 233,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 228.734375,
"epoch": 0.017112299465240642,
"grad_norm": 0.25726935267448425,
"kl": 0.0005143880844116211,
"learning_rate": 9.82832618025751e-07,
"loss": 0.0,
"reward": 0.08203125,
"reward_std": 0.12251314427703619,
"rewards/validate_answer_with_correct_format": 0.060546875,
"rewards/validate_format": 0.021484375,
"step": 4
},
{
"completion_length": 214.3046875,
"epoch": 0.034224598930481284,
"grad_norm": 0.2438514530658722,
"kl": 0.004357337951660156,
"learning_rate": 9.656652360515022e-07,
"loss": 0.0002,
"reward": 0.166015625,
"reward_std": 0.22593521419912577,
"rewards/validate_answer_with_correct_format": 0.134765625,
"rewards/validate_format": 0.03125,
"step": 8
},
{
"completion_length": 201.08984375,
"epoch": 0.051336898395721926,
"grad_norm": 0.3771411180496216,
"kl": 0.016904830932617188,
"learning_rate": 9.484978540772532e-07,
"loss": 0.0007,
"reward": 0.44921875,
"reward_std": 0.4575687777251005,
"rewards/validate_answer_with_correct_format": 0.2578125,
"rewards/validate_format": 0.19140625,
"step": 12
},
{
"completion_length": 193.501953125,
"epoch": 0.06844919786096257,
"grad_norm": 0.33645591139793396,
"kl": 0.023540496826171875,
"learning_rate": 9.313304721030042e-07,
"loss": 0.0009,
"reward": 0.96484375,
"reward_std": 0.6036693137139082,
"rewards/validate_answer_with_correct_format": 0.37109375,
"rewards/validate_format": 0.59375,
"step": 16
},
{
"completion_length": 175.50390625,
"epoch": 0.0855614973262032,
"grad_norm": 0.3233410716056824,
"kl": 0.03643798828125,
"learning_rate": 9.141630901287554e-07,
"loss": 0.0015,
"reward": 1.302734375,
"reward_std": 0.4221517601981759,
"rewards/validate_answer_with_correct_format": 0.48828125,
"rewards/validate_format": 0.814453125,
"step": 20
},
{
"completion_length": 176.00390625,
"epoch": 0.10267379679144385,
"grad_norm": 0.2788090109825134,
"kl": 0.029022216796875,
"learning_rate": 8.969957081545064e-07,
"loss": 0.0012,
"reward": 1.359375,
"reward_std": 0.3464417774230242,
"rewards/validate_answer_with_correct_format": 0.521484375,
"rewards/validate_format": 0.837890625,
"step": 24
},
{
"completion_length": 168.138671875,
"epoch": 0.11978609625668449,
"grad_norm": 0.2867412269115448,
"kl": 0.03003692626953125,
"learning_rate": 8.798283261802575e-07,
"loss": 0.0012,
"reward": 1.5,
"reward_std": 0.35856608115136623,
"rewards/validate_answer_with_correct_format": 0.62109375,
"rewards/validate_format": 0.87890625,
"step": 28
},
{
"completion_length": 167.71484375,
"epoch": 0.13689839572192514,
"grad_norm": 0.2035263180732727,
"kl": 0.0883941650390625,
"learning_rate": 8.626609442060086e-07,
"loss": 0.0035,
"reward": 1.466796875,
"reward_std": 0.2889786111190915,
"rewards/validate_answer_with_correct_format": 0.576171875,
"rewards/validate_format": 0.890625,
"step": 32
},
{
"completion_length": 159.65234375,
"epoch": 0.15401069518716579,
"grad_norm": 0.22620172798633575,
"kl": 0.0332489013671875,
"learning_rate": 8.454935622317596e-07,
"loss": 0.0013,
"reward": 1.51171875,
"reward_std": 0.22670799400657415,
"rewards/validate_answer_with_correct_format": 0.58203125,
"rewards/validate_format": 0.9296875,
"step": 36
},
{
"completion_length": 165.24609375,
"epoch": 0.1711229946524064,
"grad_norm": 0.270819753408432,
"kl": 0.0323638916015625,
"learning_rate": 8.283261802575107e-07,
"loss": 0.0013,
"reward": 1.49609375,
"reward_std": 0.2671235203742981,
"rewards/validate_answer_with_correct_format": 0.60546875,
"rewards/validate_format": 0.890625,
"step": 40
},
{
"completion_length": 155.900390625,
"epoch": 0.18823529411764706,
"grad_norm": 0.249853253364563,
"kl": 0.0360260009765625,
"learning_rate": 8.111587982832617e-07,
"loss": 0.0014,
"reward": 1.607421875,
"reward_std": 0.2183889476582408,
"rewards/validate_answer_with_correct_format": 0.6796875,
"rewards/validate_format": 0.927734375,
"step": 44
},
{
"completion_length": 162.89453125,
"epoch": 0.2053475935828877,
"grad_norm": 0.3030960261821747,
"kl": 0.04419708251953125,
"learning_rate": 7.939914163090128e-07,
"loss": 0.0018,
"reward": 1.6171875,
"reward_std": 0.28787759225815535,
"rewards/validate_answer_with_correct_format": 0.6875,
"rewards/validate_format": 0.9296875,
"step": 48
},
{
"completion_length": 164.138671875,
"epoch": 0.22245989304812835,
"grad_norm": 0.2891474962234497,
"kl": 0.03287506103515625,
"learning_rate": 7.76824034334764e-07,
"loss": 0.0013,
"reward": 1.56640625,
"reward_std": 0.3518991004675627,
"rewards/validate_answer_with_correct_format": 0.689453125,
"rewards/validate_format": 0.876953125,
"step": 52
},
{
"completion_length": 164.32421875,
"epoch": 0.23957219251336898,
"grad_norm": 0.26431283354759216,
"kl": 0.04036712646484375,
"learning_rate": 7.59656652360515e-07,
"loss": 0.0016,
"reward": 1.546875,
"reward_std": 0.4236298883333802,
"rewards/validate_answer_with_correct_format": 0.708984375,
"rewards/validate_format": 0.837890625,
"step": 56
},
{
"completion_length": 169.640625,
"epoch": 0.25668449197860965,
"grad_norm": 0.2717176675796509,
"kl": 0.03812408447265625,
"learning_rate": 7.424892703862661e-07,
"loss": 0.0015,
"reward": 1.646484375,
"reward_std": 0.35125905089080334,
"rewards/validate_answer_with_correct_format": 0.783203125,
"rewards/validate_format": 0.86328125,
"step": 60
},
{
"completion_length": 169.064453125,
"epoch": 0.2737967914438503,
"grad_norm": 0.2818020284175873,
"kl": 0.04370880126953125,
"learning_rate": 7.253218884120171e-07,
"loss": 0.0017,
"reward": 1.6171875,
"reward_std": 0.3997631352394819,
"rewards/validate_answer_with_correct_format": 0.744140625,
"rewards/validate_format": 0.873046875,
"step": 64
},
{
"completion_length": 172.17578125,
"epoch": 0.2909090909090909,
"grad_norm": 0.35611966252326965,
"kl": 0.0448455810546875,
"learning_rate": 7.081545064377682e-07,
"loss": 0.0018,
"reward": 1.611328125,
"reward_std": 0.3241206342354417,
"rewards/validate_answer_with_correct_format": 0.75,
"rewards/validate_format": 0.861328125,
"step": 68
},
{
"completion_length": 163.498046875,
"epoch": 0.30802139037433157,
"grad_norm": 0.2753399908542633,
"kl": 0.04538726806640625,
"learning_rate": 6.909871244635192e-07,
"loss": 0.0018,
"reward": 1.701171875,
"reward_std": 0.3228708282113075,
"rewards/validate_answer_with_correct_format": 0.802734375,
"rewards/validate_format": 0.8984375,
"step": 72
},
{
"completion_length": 166.0234375,
"epoch": 0.3251336898395722,
"grad_norm": 0.401327520608902,
"kl": 0.053253173828125,
"learning_rate": 6.738197424892703e-07,
"loss": 0.0021,
"reward": 1.728515625,
"reward_std": 0.3109824899584055,
"rewards/validate_answer_with_correct_format": 0.826171875,
"rewards/validate_format": 0.90234375,
"step": 76
},
{
"completion_length": 168.322265625,
"epoch": 0.3422459893048128,
"grad_norm": 0.4816475510597229,
"kl": 0.0675201416015625,
"learning_rate": 6.566523605150214e-07,
"loss": 0.0027,
"reward": 1.568359375,
"reward_std": 0.36935339495539665,
"rewards/validate_answer_with_correct_format": 0.748046875,
"rewards/validate_format": 0.8203125,
"step": 80
},
{
"completion_length": 173.77734375,
"epoch": 0.3593582887700535,
"grad_norm": 1.3737239837646484,
"kl": 0.1086883544921875,
"learning_rate": 6.394849785407725e-07,
"loss": 0.0043,
"reward": 1.595703125,
"reward_std": 0.3840404311195016,
"rewards/validate_answer_with_correct_format": 0.771484375,
"rewards/validate_format": 0.82421875,
"step": 84
},
{
"completion_length": 174.169921875,
"epoch": 0.3764705882352941,
"grad_norm": 1.1611127853393555,
"kl": 0.1685333251953125,
"learning_rate": 6.223175965665236e-07,
"loss": 0.0067,
"reward": 1.494140625,
"reward_std": 0.4635760700330138,
"rewards/validate_answer_with_correct_format": 0.724609375,
"rewards/validate_format": 0.76953125,
"step": 88
},
{
"completion_length": 188.8828125,
"epoch": 0.39358288770053473,
"grad_norm": 1.5201658010482788,
"kl": 0.60052490234375,
"learning_rate": 6.051502145922746e-07,
"loss": 0.024,
"reward": 1.169921875,
"reward_std": 0.6621273942291737,
"rewards/validate_answer_with_correct_format": 0.587890625,
"rewards/validate_format": 0.58203125,
"step": 92
},
{
"completion_length": 193.64453125,
"epoch": 0.4106951871657754,
"grad_norm": 4.456446170806885,
"kl": 1.027099609375,
"learning_rate": 5.879828326180257e-07,
"loss": 0.041,
"reward": 1.052734375,
"reward_std": 0.6838541068136692,
"rewards/validate_answer_with_correct_format": 0.51953125,
"rewards/validate_format": 0.533203125,
"step": 96
},
{
"completion_length": 179.8125,
"epoch": 0.42780748663101603,
"grad_norm": 0.6613827347755432,
"kl": 0.86444091796875,
"learning_rate": 5.708154506437767e-07,
"loss": 0.0346,
"reward": 1.2578125,
"reward_std": 0.540657652541995,
"rewards/validate_answer_with_correct_format": 0.609375,
"rewards/validate_format": 0.6484375,
"step": 100
},
{
"completion_length": 169.509765625,
"epoch": 0.4449197860962567,
"grad_norm": 5.908604621887207,
"kl": 0.630767822265625,
"learning_rate": 5.536480686695278e-07,
"loss": 0.0252,
"reward": 1.537109375,
"reward_std": 0.4994529504328966,
"rewards/validate_answer_with_correct_format": 0.736328125,
"rewards/validate_format": 0.80078125,
"step": 104
},
{
"completion_length": 160.07421875,
"epoch": 0.46203208556149733,
"grad_norm": 1.3123823404312134,
"kl": 0.44525146484375,
"learning_rate": 5.364806866952789e-07,
"loss": 0.0178,
"reward": 1.591796875,
"reward_std": 0.42215616535395384,
"rewards/validate_answer_with_correct_format": 0.744140625,
"rewards/validate_format": 0.84765625,
"step": 108
},
{
"completion_length": 162.609375,
"epoch": 0.47914438502673795,
"grad_norm": 3.7777585983276367,
"kl": 0.751312255859375,
"learning_rate": 5.193133047210299e-07,
"loss": 0.03,
"reward": 1.533203125,
"reward_std": 0.5010853223502636,
"rewards/validate_answer_with_correct_format": 0.72265625,
"rewards/validate_format": 0.810546875,
"step": 112
},
{
"completion_length": 176.544921875,
"epoch": 0.49625668449197863,
"grad_norm": 1.9199061393737793,
"kl": 1.5675048828125,
"learning_rate": 5.021459227467812e-07,
"loss": 0.0628,
"reward": 1.33984375,
"reward_std": 0.5347601640969515,
"rewards/validate_answer_with_correct_format": 0.619140625,
"rewards/validate_format": 0.720703125,
"step": 116
},
{
"completion_length": 174.685546875,
"epoch": 0.5133689839572193,
"grad_norm": 1.4069411754608154,
"kl": 1.360107421875,
"learning_rate": 4.849785407725322e-07,
"loss": 0.0544,
"reward": 1.25,
"reward_std": 0.5469169113785028,
"rewards/validate_answer_with_correct_format": 0.537109375,
"rewards/validate_format": 0.712890625,
"step": 120
},
{
"completion_length": 168.380859375,
"epoch": 0.5304812834224599,
"grad_norm": 1.0460318326950073,
"kl": 0.49078369140625,
"learning_rate": 4.6781115879828326e-07,
"loss": 0.0196,
"reward": 1.337890625,
"reward_std": 0.4842473194003105,
"rewards/validate_answer_with_correct_format": 0.564453125,
"rewards/validate_format": 0.7734375,
"step": 124
},
{
"completion_length": 167.39453125,
"epoch": 0.5475935828877005,
"grad_norm": 0.717617392539978,
"kl": 0.53839111328125,
"learning_rate": 4.506437768240343e-07,
"loss": 0.0216,
"reward": 1.43359375,
"reward_std": 0.47893994580954313,
"rewards/validate_answer_with_correct_format": 0.623046875,
"rewards/validate_format": 0.810546875,
"step": 128
},
{
"completion_length": 179.7890625,
"epoch": 0.5647058823529412,
"grad_norm": 1.0693079233169556,
"kl": 1.01556396484375,
"learning_rate": 4.3347639484978536e-07,
"loss": 0.0407,
"reward": 1.298828125,
"reward_std": 0.5207763016223907,
"rewards/validate_answer_with_correct_format": 0.58203125,
"rewards/validate_format": 0.716796875,
"step": 132
},
{
"completion_length": 176.11328125,
"epoch": 0.5818181818181818,
"grad_norm": 1.5559654235839844,
"kl": 1.3896484375,
"learning_rate": 4.163090128755364e-07,
"loss": 0.0556,
"reward": 1.31640625,
"reward_std": 0.6244864724576473,
"rewards/validate_answer_with_correct_format": 0.580078125,
"rewards/validate_format": 0.736328125,
"step": 136
},
{
"completion_length": 171.017578125,
"epoch": 0.5989304812834224,
"grad_norm": 1.3196905851364136,
"kl": 1.96533203125,
"learning_rate": 3.991416309012876e-07,
"loss": 0.0786,
"reward": 1.373046875,
"reward_std": 0.5217751991003752,
"rewards/validate_answer_with_correct_format": 0.63671875,
"rewards/validate_format": 0.736328125,
"step": 140
},
{
"completion_length": 181.798828125,
"epoch": 0.6160427807486631,
"grad_norm": 0.8346861004829407,
"kl": 2.1689453125,
"learning_rate": 3.819742489270386e-07,
"loss": 0.0867,
"reward": 1.2734375,
"reward_std": 0.5743660591542721,
"rewards/validate_answer_with_correct_format": 0.58984375,
"rewards/validate_format": 0.68359375,
"step": 144
},
{
"completion_length": 175.63671875,
"epoch": 0.6331550802139038,
"grad_norm": 1.6434240341186523,
"kl": 1.395263671875,
"learning_rate": 3.648068669527897e-07,
"loss": 0.0559,
"reward": 1.3671875,
"reward_std": 0.4787444490939379,
"rewards/validate_answer_with_correct_format": 0.6015625,
"rewards/validate_format": 0.765625,
"step": 148
},
{
"completion_length": 166.970703125,
"epoch": 0.6502673796791444,
"grad_norm": 2.3217926025390625,
"kl": 0.79571533203125,
"learning_rate": 3.4763948497854073e-07,
"loss": 0.0318,
"reward": 1.486328125,
"reward_std": 0.4878078643232584,
"rewards/validate_answer_with_correct_format": 0.6640625,
"rewards/validate_format": 0.822265625,
"step": 152
},
{
"completion_length": 172.341796875,
"epoch": 0.667379679144385,
"grad_norm": 2.0409505367279053,
"kl": 1.2677001953125,
"learning_rate": 3.3047210300429184e-07,
"loss": 0.0507,
"reward": 1.443359375,
"reward_std": 0.5103737181052566,
"rewards/validate_answer_with_correct_format": 0.65234375,
"rewards/validate_format": 0.791015625,
"step": 156
},
{
"completion_length": 173.0703125,
"epoch": 0.6844919786096256,
"grad_norm": 0.673163115978241,
"kl": 1.423583984375,
"learning_rate": 3.133047210300429e-07,
"loss": 0.057,
"reward": 1.40625,
"reward_std": 0.4554907586425543,
"rewards/validate_answer_with_correct_format": 0.638671875,
"rewards/validate_format": 0.767578125,
"step": 160
},
{
"completion_length": 172.994140625,
"epoch": 0.7016042780748664,
"grad_norm": 0.7499634623527527,
"kl": 1.368408203125,
"learning_rate": 2.96137339055794e-07,
"loss": 0.0548,
"reward": 1.427734375,
"reward_std": 0.48304858803749084,
"rewards/validate_answer_with_correct_format": 0.671875,
"rewards/validate_format": 0.755859375,
"step": 164
},
{
"completion_length": 174.158203125,
"epoch": 0.718716577540107,
"grad_norm": 0.622071385383606,
"kl": 1.55810546875,
"learning_rate": 2.7896995708154505e-07,
"loss": 0.0623,
"reward": 1.375,
"reward_std": 0.46824304293841124,
"rewards/validate_answer_with_correct_format": 0.646484375,
"rewards/validate_format": 0.728515625,
"step": 168
},
{
"completion_length": 172.513671875,
"epoch": 0.7358288770053476,
"grad_norm": 1.0885876417160034,
"kl": 1.2890625,
"learning_rate": 2.6180257510729615e-07,
"loss": 0.0516,
"reward": 1.34375,
"reward_std": 0.5326037332415581,
"rewards/validate_answer_with_correct_format": 0.609375,
"rewards/validate_format": 0.734375,
"step": 172
},
{
"completion_length": 177.36328125,
"epoch": 0.7529411764705882,
"grad_norm": 1.3465913534164429,
"kl": 1.707763671875,
"learning_rate": 2.446351931330472e-07,
"loss": 0.0683,
"reward": 1.29296875,
"reward_std": 0.5761713199317455,
"rewards/validate_answer_with_correct_format": 0.595703125,
"rewards/validate_format": 0.697265625,
"step": 176
},
{
"completion_length": 175.986328125,
"epoch": 0.7700534759358288,
"grad_norm": 1.9746425151824951,
"kl": 1.796875,
"learning_rate": 2.2746781115879825e-07,
"loss": 0.0718,
"reward": 1.291015625,
"reward_std": 0.5592877455055714,
"rewards/validate_answer_with_correct_format": 0.609375,
"rewards/validate_format": 0.681640625,
"step": 180
},
{
"completion_length": 181.4921875,
"epoch": 0.7871657754010695,
"grad_norm": 1.2594670057296753,
"kl": 2.2677001953125,
"learning_rate": 2.1030042918454936e-07,
"loss": 0.0906,
"reward": 1.29296875,
"reward_std": 0.5261205593124032,
"rewards/validate_answer_with_correct_format": 0.59765625,
"rewards/validate_format": 0.6953125,
"step": 184
},
{
"completion_length": 182.068359375,
"epoch": 0.8042780748663102,
"grad_norm": 0.9596216678619385,
"kl": 1.4912109375,
"learning_rate": 1.931330472103004e-07,
"loss": 0.0597,
"reward": 1.2734375,
"reward_std": 0.5831566601991653,
"rewards/validate_answer_with_correct_format": 0.580078125,
"rewards/validate_format": 0.693359375,
"step": 188
},
{
"completion_length": 176.224609375,
"epoch": 0.8213903743315508,
"grad_norm": 1.0331825017929077,
"kl": 1.3037109375,
"learning_rate": 1.759656652360515e-07,
"loss": 0.0522,
"reward": 1.384765625,
"reward_std": 0.48507228679955006,
"rewards/validate_answer_with_correct_format": 0.640625,
"rewards/validate_format": 0.744140625,
"step": 192
},
{
"completion_length": 170.205078125,
"epoch": 0.8385026737967914,
"grad_norm": 1.1908960342407227,
"kl": 1.3134765625,
"learning_rate": 1.5879828326180257e-07,
"loss": 0.0526,
"reward": 1.44140625,
"reward_std": 0.5417510252445936,
"rewards/validate_answer_with_correct_format": 0.658203125,
"rewards/validate_format": 0.783203125,
"step": 196
},
{
"completion_length": 175.8125,
"epoch": 0.8556149732620321,
"grad_norm": 2.4777722358703613,
"kl": 1.801025390625,
"learning_rate": 1.4163090128755365e-07,
"loss": 0.072,
"reward": 1.306640625,
"reward_std": 0.5542392712086439,
"rewards/validate_answer_with_correct_format": 0.595703125,
"rewards/validate_format": 0.7109375,
"step": 200
},
{
"completion_length": 177.724609375,
"epoch": 0.8727272727272727,
"grad_norm": 0.8101657032966614,
"kl": 2.00537109375,
"learning_rate": 1.2446351931330473e-07,
"loss": 0.0802,
"reward": 1.28125,
"reward_std": 0.5756953954696655,
"rewards/validate_answer_with_correct_format": 0.595703125,
"rewards/validate_format": 0.685546875,
"step": 204
},
{
"completion_length": 178.109375,
"epoch": 0.8898395721925134,
"grad_norm": 4.290409564971924,
"kl": 2.436279296875,
"learning_rate": 1.0729613733905579e-07,
"loss": 0.0974,
"reward": 1.2265625,
"reward_std": 0.5841891095042229,
"rewards/validate_answer_with_correct_format": 0.54296875,
"rewards/validate_format": 0.68359375,
"step": 208
},
{
"completion_length": 182.623046875,
"epoch": 0.906951871657754,
"grad_norm": 1.0480223894119263,
"kl": 2.49560546875,
"learning_rate": 9.012875536480687e-08,
"loss": 0.0999,
"reward": 1.173828125,
"reward_std": 0.6358627937734127,
"rewards/validate_answer_with_correct_format": 0.54296875,
"rewards/validate_format": 0.630859375,
"step": 212
},
{
"completion_length": 180.333984375,
"epoch": 0.9240641711229947,
"grad_norm": 1.4673086404800415,
"kl": 2.2333984375,
"learning_rate": 7.296137339055794e-08,
"loss": 0.0895,
"reward": 1.158203125,
"reward_std": 0.6267144195735455,
"rewards/validate_answer_with_correct_format": 0.50390625,
"rewards/validate_format": 0.654296875,
"step": 216
},
{
"completion_length": 177.82421875,
"epoch": 0.9411764705882353,
"grad_norm": 1.3507096767425537,
"kl": 2.12353515625,
"learning_rate": 5.5793991416309014e-08,
"loss": 0.0849,
"reward": 1.212890625,
"reward_std": 0.6212888453155756,
"rewards/validate_answer_with_correct_format": 0.5546875,
"rewards/validate_format": 0.658203125,
"step": 220
},
{
"completion_length": 176.25390625,
"epoch": 0.9582887700534759,
"grad_norm": 1.1437978744506836,
"kl": 1.736328125,
"learning_rate": 3.8626609442060086e-08,
"loss": 0.0695,
"reward": 1.244140625,
"reward_std": 0.5541076026856899,
"rewards/validate_answer_with_correct_format": 0.568359375,
"rewards/validate_format": 0.67578125,
"step": 224
},
{
"completion_length": 172.615234375,
"epoch": 0.9754010695187165,
"grad_norm": 2.013885974884033,
"kl": 1.874267578125,
"learning_rate": 2.1459227467811158e-08,
"loss": 0.075,
"reward": 1.31640625,
"reward_std": 0.5559380035847425,
"rewards/validate_answer_with_correct_format": 0.6015625,
"rewards/validate_format": 0.71484375,
"step": 228
},
{
"completion_length": 173.4765625,
"epoch": 0.9925133689839573,
"grad_norm": 0.9704756140708923,
"kl": 1.89306640625,
"learning_rate": 4.291845493562231e-09,
"loss": 0.0757,
"reward": 1.271484375,
"reward_std": 0.5627450533211231,
"rewards/validate_answer_with_correct_format": 0.59375,
"rewards/validate_format": 0.677734375,
"step": 232
}
],
"logging_steps": 4,
"max_steps": 233,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}