|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9967914438502674, |
|
"eval_steps": 500, |
|
"global_step": 233, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 228.734375, |
|
"epoch": 0.017112299465240642, |
|
"grad_norm": 0.25726935267448425, |
|
"kl": 0.0005143880844116211, |
|
"learning_rate": 9.82832618025751e-07, |
|
"loss": 0.0, |
|
"reward": 0.08203125, |
|
"reward_std": 0.12251314427703619, |
|
"rewards/validate_answer_with_correct_format": 0.060546875, |
|
"rewards/validate_format": 0.021484375, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 214.3046875, |
|
"epoch": 0.034224598930481284, |
|
"grad_norm": 0.2438514530658722, |
|
"kl": 0.004357337951660156, |
|
"learning_rate": 9.656652360515022e-07, |
|
"loss": 0.0002, |
|
"reward": 0.166015625, |
|
"reward_std": 0.22593521419912577, |
|
"rewards/validate_answer_with_correct_format": 0.134765625, |
|
"rewards/validate_format": 0.03125, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 201.08984375, |
|
"epoch": 0.051336898395721926, |
|
"grad_norm": 0.3771411180496216, |
|
"kl": 0.016904830932617188, |
|
"learning_rate": 9.484978540772532e-07, |
|
"loss": 0.0007, |
|
"reward": 0.44921875, |
|
"reward_std": 0.4575687777251005, |
|
"rewards/validate_answer_with_correct_format": 0.2578125, |
|
"rewards/validate_format": 0.19140625, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 193.501953125, |
|
"epoch": 0.06844919786096257, |
|
"grad_norm": 0.33645591139793396, |
|
"kl": 0.023540496826171875, |
|
"learning_rate": 9.313304721030042e-07, |
|
"loss": 0.0009, |
|
"reward": 0.96484375, |
|
"reward_std": 0.6036693137139082, |
|
"rewards/validate_answer_with_correct_format": 0.37109375, |
|
"rewards/validate_format": 0.59375, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 175.50390625, |
|
"epoch": 0.0855614973262032, |
|
"grad_norm": 0.3233410716056824, |
|
"kl": 0.03643798828125, |
|
"learning_rate": 9.141630901287554e-07, |
|
"loss": 0.0015, |
|
"reward": 1.302734375, |
|
"reward_std": 0.4221517601981759, |
|
"rewards/validate_answer_with_correct_format": 0.48828125, |
|
"rewards/validate_format": 0.814453125, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 176.00390625, |
|
"epoch": 0.10267379679144385, |
|
"grad_norm": 0.2788090109825134, |
|
"kl": 0.029022216796875, |
|
"learning_rate": 8.969957081545064e-07, |
|
"loss": 0.0012, |
|
"reward": 1.359375, |
|
"reward_std": 0.3464417774230242, |
|
"rewards/validate_answer_with_correct_format": 0.521484375, |
|
"rewards/validate_format": 0.837890625, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 168.138671875, |
|
"epoch": 0.11978609625668449, |
|
"grad_norm": 0.2867412269115448, |
|
"kl": 0.03003692626953125, |
|
"learning_rate": 8.798283261802575e-07, |
|
"loss": 0.0012, |
|
"reward": 1.5, |
|
"reward_std": 0.35856608115136623, |
|
"rewards/validate_answer_with_correct_format": 0.62109375, |
|
"rewards/validate_format": 0.87890625, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 167.71484375, |
|
"epoch": 0.13689839572192514, |
|
"grad_norm": 0.2035263180732727, |
|
"kl": 0.0883941650390625, |
|
"learning_rate": 8.626609442060086e-07, |
|
"loss": 0.0035, |
|
"reward": 1.466796875, |
|
"reward_std": 0.2889786111190915, |
|
"rewards/validate_answer_with_correct_format": 0.576171875, |
|
"rewards/validate_format": 0.890625, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 159.65234375, |
|
"epoch": 0.15401069518716579, |
|
"grad_norm": 0.22620172798633575, |
|
"kl": 0.0332489013671875, |
|
"learning_rate": 8.454935622317596e-07, |
|
"loss": 0.0013, |
|
"reward": 1.51171875, |
|
"reward_std": 0.22670799400657415, |
|
"rewards/validate_answer_with_correct_format": 0.58203125, |
|
"rewards/validate_format": 0.9296875, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 165.24609375, |
|
"epoch": 0.1711229946524064, |
|
"grad_norm": 0.270819753408432, |
|
"kl": 0.0323638916015625, |
|
"learning_rate": 8.283261802575107e-07, |
|
"loss": 0.0013, |
|
"reward": 1.49609375, |
|
"reward_std": 0.2671235203742981, |
|
"rewards/validate_answer_with_correct_format": 0.60546875, |
|
"rewards/validate_format": 0.890625, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 155.900390625, |
|
"epoch": 0.18823529411764706, |
|
"grad_norm": 0.249853253364563, |
|
"kl": 0.0360260009765625, |
|
"learning_rate": 8.111587982832617e-07, |
|
"loss": 0.0014, |
|
"reward": 1.607421875, |
|
"reward_std": 0.2183889476582408, |
|
"rewards/validate_answer_with_correct_format": 0.6796875, |
|
"rewards/validate_format": 0.927734375, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 162.89453125, |
|
"epoch": 0.2053475935828877, |
|
"grad_norm": 0.3030960261821747, |
|
"kl": 0.04419708251953125, |
|
"learning_rate": 7.939914163090128e-07, |
|
"loss": 0.0018, |
|
"reward": 1.6171875, |
|
"reward_std": 0.28787759225815535, |
|
"rewards/validate_answer_with_correct_format": 0.6875, |
|
"rewards/validate_format": 0.9296875, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 164.138671875, |
|
"epoch": 0.22245989304812835, |
|
"grad_norm": 0.2891474962234497, |
|
"kl": 0.03287506103515625, |
|
"learning_rate": 7.76824034334764e-07, |
|
"loss": 0.0013, |
|
"reward": 1.56640625, |
|
"reward_std": 0.3518991004675627, |
|
"rewards/validate_answer_with_correct_format": 0.689453125, |
|
"rewards/validate_format": 0.876953125, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 164.32421875, |
|
"epoch": 0.23957219251336898, |
|
"grad_norm": 0.26431283354759216, |
|
"kl": 0.04036712646484375, |
|
"learning_rate": 7.59656652360515e-07, |
|
"loss": 0.0016, |
|
"reward": 1.546875, |
|
"reward_std": 0.4236298883333802, |
|
"rewards/validate_answer_with_correct_format": 0.708984375, |
|
"rewards/validate_format": 0.837890625, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 169.640625, |
|
"epoch": 0.25668449197860965, |
|
"grad_norm": 0.2717176675796509, |
|
"kl": 0.03812408447265625, |
|
"learning_rate": 7.424892703862661e-07, |
|
"loss": 0.0015, |
|
"reward": 1.646484375, |
|
"reward_std": 0.35125905089080334, |
|
"rewards/validate_answer_with_correct_format": 0.783203125, |
|
"rewards/validate_format": 0.86328125, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 169.064453125, |
|
"epoch": 0.2737967914438503, |
|
"grad_norm": 0.2818020284175873, |
|
"kl": 0.04370880126953125, |
|
"learning_rate": 7.253218884120171e-07, |
|
"loss": 0.0017, |
|
"reward": 1.6171875, |
|
"reward_std": 0.3997631352394819, |
|
"rewards/validate_answer_with_correct_format": 0.744140625, |
|
"rewards/validate_format": 0.873046875, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 172.17578125, |
|
"epoch": 0.2909090909090909, |
|
"grad_norm": 0.35611966252326965, |
|
"kl": 0.0448455810546875, |
|
"learning_rate": 7.081545064377682e-07, |
|
"loss": 0.0018, |
|
"reward": 1.611328125, |
|
"reward_std": 0.3241206342354417, |
|
"rewards/validate_answer_with_correct_format": 0.75, |
|
"rewards/validate_format": 0.861328125, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 163.498046875, |
|
"epoch": 0.30802139037433157, |
|
"grad_norm": 0.2753399908542633, |
|
"kl": 0.04538726806640625, |
|
"learning_rate": 6.909871244635192e-07, |
|
"loss": 0.0018, |
|
"reward": 1.701171875, |
|
"reward_std": 0.3228708282113075, |
|
"rewards/validate_answer_with_correct_format": 0.802734375, |
|
"rewards/validate_format": 0.8984375, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 166.0234375, |
|
"epoch": 0.3251336898395722, |
|
"grad_norm": 0.401327520608902, |
|
"kl": 0.053253173828125, |
|
"learning_rate": 6.738197424892703e-07, |
|
"loss": 0.0021, |
|
"reward": 1.728515625, |
|
"reward_std": 0.3109824899584055, |
|
"rewards/validate_answer_with_correct_format": 0.826171875, |
|
"rewards/validate_format": 0.90234375, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 168.322265625, |
|
"epoch": 0.3422459893048128, |
|
"grad_norm": 0.4816475510597229, |
|
"kl": 0.0675201416015625, |
|
"learning_rate": 6.566523605150214e-07, |
|
"loss": 0.0027, |
|
"reward": 1.568359375, |
|
"reward_std": 0.36935339495539665, |
|
"rewards/validate_answer_with_correct_format": 0.748046875, |
|
"rewards/validate_format": 0.8203125, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 173.77734375, |
|
"epoch": 0.3593582887700535, |
|
"grad_norm": 1.3737239837646484, |
|
"kl": 0.1086883544921875, |
|
"learning_rate": 6.394849785407725e-07, |
|
"loss": 0.0043, |
|
"reward": 1.595703125, |
|
"reward_std": 0.3840404311195016, |
|
"rewards/validate_answer_with_correct_format": 0.771484375, |
|
"rewards/validate_format": 0.82421875, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 174.169921875, |
|
"epoch": 0.3764705882352941, |
|
"grad_norm": 1.1611127853393555, |
|
"kl": 0.1685333251953125, |
|
"learning_rate": 6.223175965665236e-07, |
|
"loss": 0.0067, |
|
"reward": 1.494140625, |
|
"reward_std": 0.4635760700330138, |
|
"rewards/validate_answer_with_correct_format": 0.724609375, |
|
"rewards/validate_format": 0.76953125, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 188.8828125, |
|
"epoch": 0.39358288770053473, |
|
"grad_norm": 1.5201658010482788, |
|
"kl": 0.60052490234375, |
|
"learning_rate": 6.051502145922746e-07, |
|
"loss": 0.024, |
|
"reward": 1.169921875, |
|
"reward_std": 0.6621273942291737, |
|
"rewards/validate_answer_with_correct_format": 0.587890625, |
|
"rewards/validate_format": 0.58203125, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 193.64453125, |
|
"epoch": 0.4106951871657754, |
|
"grad_norm": 4.456446170806885, |
|
"kl": 1.027099609375, |
|
"learning_rate": 5.879828326180257e-07, |
|
"loss": 0.041, |
|
"reward": 1.052734375, |
|
"reward_std": 0.6838541068136692, |
|
"rewards/validate_answer_with_correct_format": 0.51953125, |
|
"rewards/validate_format": 0.533203125, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 179.8125, |
|
"epoch": 0.42780748663101603, |
|
"grad_norm": 0.6613827347755432, |
|
"kl": 0.86444091796875, |
|
"learning_rate": 5.708154506437767e-07, |
|
"loss": 0.0346, |
|
"reward": 1.2578125, |
|
"reward_std": 0.540657652541995, |
|
"rewards/validate_answer_with_correct_format": 0.609375, |
|
"rewards/validate_format": 0.6484375, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 169.509765625, |
|
"epoch": 0.4449197860962567, |
|
"grad_norm": 5.908604621887207, |
|
"kl": 0.630767822265625, |
|
"learning_rate": 5.536480686695278e-07, |
|
"loss": 0.0252, |
|
"reward": 1.537109375, |
|
"reward_std": 0.4994529504328966, |
|
"rewards/validate_answer_with_correct_format": 0.736328125, |
|
"rewards/validate_format": 0.80078125, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 160.07421875, |
|
"epoch": 0.46203208556149733, |
|
"grad_norm": 1.3123823404312134, |
|
"kl": 0.44525146484375, |
|
"learning_rate": 5.364806866952789e-07, |
|
"loss": 0.0178, |
|
"reward": 1.591796875, |
|
"reward_std": 0.42215616535395384, |
|
"rewards/validate_answer_with_correct_format": 0.744140625, |
|
"rewards/validate_format": 0.84765625, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 162.609375, |
|
"epoch": 0.47914438502673795, |
|
"grad_norm": 3.7777585983276367, |
|
"kl": 0.751312255859375, |
|
"learning_rate": 5.193133047210299e-07, |
|
"loss": 0.03, |
|
"reward": 1.533203125, |
|
"reward_std": 0.5010853223502636, |
|
"rewards/validate_answer_with_correct_format": 0.72265625, |
|
"rewards/validate_format": 0.810546875, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 176.544921875, |
|
"epoch": 0.49625668449197863, |
|
"grad_norm": 1.9199061393737793, |
|
"kl": 1.5675048828125, |
|
"learning_rate": 5.021459227467812e-07, |
|
"loss": 0.0628, |
|
"reward": 1.33984375, |
|
"reward_std": 0.5347601640969515, |
|
"rewards/validate_answer_with_correct_format": 0.619140625, |
|
"rewards/validate_format": 0.720703125, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 174.685546875, |
|
"epoch": 0.5133689839572193, |
|
"grad_norm": 1.4069411754608154, |
|
"kl": 1.360107421875, |
|
"learning_rate": 4.849785407725322e-07, |
|
"loss": 0.0544, |
|
"reward": 1.25, |
|
"reward_std": 0.5469169113785028, |
|
"rewards/validate_answer_with_correct_format": 0.537109375, |
|
"rewards/validate_format": 0.712890625, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 168.380859375, |
|
"epoch": 0.5304812834224599, |
|
"grad_norm": 1.0460318326950073, |
|
"kl": 0.49078369140625, |
|
"learning_rate": 4.6781115879828326e-07, |
|
"loss": 0.0196, |
|
"reward": 1.337890625, |
|
"reward_std": 0.4842473194003105, |
|
"rewards/validate_answer_with_correct_format": 0.564453125, |
|
"rewards/validate_format": 0.7734375, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 167.39453125, |
|
"epoch": 0.5475935828877005, |
|
"grad_norm": 0.717617392539978, |
|
"kl": 0.53839111328125, |
|
"learning_rate": 4.506437768240343e-07, |
|
"loss": 0.0216, |
|
"reward": 1.43359375, |
|
"reward_std": 0.47893994580954313, |
|
"rewards/validate_answer_with_correct_format": 0.623046875, |
|
"rewards/validate_format": 0.810546875, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 179.7890625, |
|
"epoch": 0.5647058823529412, |
|
"grad_norm": 1.0693079233169556, |
|
"kl": 1.01556396484375, |
|
"learning_rate": 4.3347639484978536e-07, |
|
"loss": 0.0407, |
|
"reward": 1.298828125, |
|
"reward_std": 0.5207763016223907, |
|
"rewards/validate_answer_with_correct_format": 0.58203125, |
|
"rewards/validate_format": 0.716796875, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 176.11328125, |
|
"epoch": 0.5818181818181818, |
|
"grad_norm": 1.5559654235839844, |
|
"kl": 1.3896484375, |
|
"learning_rate": 4.163090128755364e-07, |
|
"loss": 0.0556, |
|
"reward": 1.31640625, |
|
"reward_std": 0.6244864724576473, |
|
"rewards/validate_answer_with_correct_format": 0.580078125, |
|
"rewards/validate_format": 0.736328125, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 171.017578125, |
|
"epoch": 0.5989304812834224, |
|
"grad_norm": 1.3196905851364136, |
|
"kl": 1.96533203125, |
|
"learning_rate": 3.991416309012876e-07, |
|
"loss": 0.0786, |
|
"reward": 1.373046875, |
|
"reward_std": 0.5217751991003752, |
|
"rewards/validate_answer_with_correct_format": 0.63671875, |
|
"rewards/validate_format": 0.736328125, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 181.798828125, |
|
"epoch": 0.6160427807486631, |
|
"grad_norm": 0.8346861004829407, |
|
"kl": 2.1689453125, |
|
"learning_rate": 3.819742489270386e-07, |
|
"loss": 0.0867, |
|
"reward": 1.2734375, |
|
"reward_std": 0.5743660591542721, |
|
"rewards/validate_answer_with_correct_format": 0.58984375, |
|
"rewards/validate_format": 0.68359375, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 175.63671875, |
|
"epoch": 0.6331550802139038, |
|
"grad_norm": 1.6434240341186523, |
|
"kl": 1.395263671875, |
|
"learning_rate": 3.648068669527897e-07, |
|
"loss": 0.0559, |
|
"reward": 1.3671875, |
|
"reward_std": 0.4787444490939379, |
|
"rewards/validate_answer_with_correct_format": 0.6015625, |
|
"rewards/validate_format": 0.765625, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 166.970703125, |
|
"epoch": 0.6502673796791444, |
|
"grad_norm": 2.3217926025390625, |
|
"kl": 0.79571533203125, |
|
"learning_rate": 3.4763948497854073e-07, |
|
"loss": 0.0318, |
|
"reward": 1.486328125, |
|
"reward_std": 0.4878078643232584, |
|
"rewards/validate_answer_with_correct_format": 0.6640625, |
|
"rewards/validate_format": 0.822265625, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 172.341796875, |
|
"epoch": 0.667379679144385, |
|
"grad_norm": 2.0409505367279053, |
|
"kl": 1.2677001953125, |
|
"learning_rate": 3.3047210300429184e-07, |
|
"loss": 0.0507, |
|
"reward": 1.443359375, |
|
"reward_std": 0.5103737181052566, |
|
"rewards/validate_answer_with_correct_format": 0.65234375, |
|
"rewards/validate_format": 0.791015625, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 173.0703125, |
|
"epoch": 0.6844919786096256, |
|
"grad_norm": 0.673163115978241, |
|
"kl": 1.423583984375, |
|
"learning_rate": 3.133047210300429e-07, |
|
"loss": 0.057, |
|
"reward": 1.40625, |
|
"reward_std": 0.4554907586425543, |
|
"rewards/validate_answer_with_correct_format": 0.638671875, |
|
"rewards/validate_format": 0.767578125, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 172.994140625, |
|
"epoch": 0.7016042780748664, |
|
"grad_norm": 0.7499634623527527, |
|
"kl": 1.368408203125, |
|
"learning_rate": 2.96137339055794e-07, |
|
"loss": 0.0548, |
|
"reward": 1.427734375, |
|
"reward_std": 0.48304858803749084, |
|
"rewards/validate_answer_with_correct_format": 0.671875, |
|
"rewards/validate_format": 0.755859375, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 174.158203125, |
|
"epoch": 0.718716577540107, |
|
"grad_norm": 0.622071385383606, |
|
"kl": 1.55810546875, |
|
"learning_rate": 2.7896995708154505e-07, |
|
"loss": 0.0623, |
|
"reward": 1.375, |
|
"reward_std": 0.46824304293841124, |
|
"rewards/validate_answer_with_correct_format": 0.646484375, |
|
"rewards/validate_format": 0.728515625, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 172.513671875, |
|
"epoch": 0.7358288770053476, |
|
"grad_norm": 1.0885876417160034, |
|
"kl": 1.2890625, |
|
"learning_rate": 2.6180257510729615e-07, |
|
"loss": 0.0516, |
|
"reward": 1.34375, |
|
"reward_std": 0.5326037332415581, |
|
"rewards/validate_answer_with_correct_format": 0.609375, |
|
"rewards/validate_format": 0.734375, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 177.36328125, |
|
"epoch": 0.7529411764705882, |
|
"grad_norm": 1.3465913534164429, |
|
"kl": 1.707763671875, |
|
"learning_rate": 2.446351931330472e-07, |
|
"loss": 0.0683, |
|
"reward": 1.29296875, |
|
"reward_std": 0.5761713199317455, |
|
"rewards/validate_answer_with_correct_format": 0.595703125, |
|
"rewards/validate_format": 0.697265625, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 175.986328125, |
|
"epoch": 0.7700534759358288, |
|
"grad_norm": 1.9746425151824951, |
|
"kl": 1.796875, |
|
"learning_rate": 2.2746781115879825e-07, |
|
"loss": 0.0718, |
|
"reward": 1.291015625, |
|
"reward_std": 0.5592877455055714, |
|
"rewards/validate_answer_with_correct_format": 0.609375, |
|
"rewards/validate_format": 0.681640625, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 181.4921875, |
|
"epoch": 0.7871657754010695, |
|
"grad_norm": 1.2594670057296753, |
|
"kl": 2.2677001953125, |
|
"learning_rate": 2.1030042918454936e-07, |
|
"loss": 0.0906, |
|
"reward": 1.29296875, |
|
"reward_std": 0.5261205593124032, |
|
"rewards/validate_answer_with_correct_format": 0.59765625, |
|
"rewards/validate_format": 0.6953125, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 182.068359375, |
|
"epoch": 0.8042780748663102, |
|
"grad_norm": 0.9596216678619385, |
|
"kl": 1.4912109375, |
|
"learning_rate": 1.931330472103004e-07, |
|
"loss": 0.0597, |
|
"reward": 1.2734375, |
|
"reward_std": 0.5831566601991653, |
|
"rewards/validate_answer_with_correct_format": 0.580078125, |
|
"rewards/validate_format": 0.693359375, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 176.224609375, |
|
"epoch": 0.8213903743315508, |
|
"grad_norm": 1.0331825017929077, |
|
"kl": 1.3037109375, |
|
"learning_rate": 1.759656652360515e-07, |
|
"loss": 0.0522, |
|
"reward": 1.384765625, |
|
"reward_std": 0.48507228679955006, |
|
"rewards/validate_answer_with_correct_format": 0.640625, |
|
"rewards/validate_format": 0.744140625, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 170.205078125, |
|
"epoch": 0.8385026737967914, |
|
"grad_norm": 1.1908960342407227, |
|
"kl": 1.3134765625, |
|
"learning_rate": 1.5879828326180257e-07, |
|
"loss": 0.0526, |
|
"reward": 1.44140625, |
|
"reward_std": 0.5417510252445936, |
|
"rewards/validate_answer_with_correct_format": 0.658203125, |
|
"rewards/validate_format": 0.783203125, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 175.8125, |
|
"epoch": 0.8556149732620321, |
|
"grad_norm": 2.4777722358703613, |
|
"kl": 1.801025390625, |
|
"learning_rate": 1.4163090128755365e-07, |
|
"loss": 0.072, |
|
"reward": 1.306640625, |
|
"reward_std": 0.5542392712086439, |
|
"rewards/validate_answer_with_correct_format": 0.595703125, |
|
"rewards/validate_format": 0.7109375, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 177.724609375, |
|
"epoch": 0.8727272727272727, |
|
"grad_norm": 0.8101657032966614, |
|
"kl": 2.00537109375, |
|
"learning_rate": 1.2446351931330473e-07, |
|
"loss": 0.0802, |
|
"reward": 1.28125, |
|
"reward_std": 0.5756953954696655, |
|
"rewards/validate_answer_with_correct_format": 0.595703125, |
|
"rewards/validate_format": 0.685546875, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 178.109375, |
|
"epoch": 0.8898395721925134, |
|
"grad_norm": 4.290409564971924, |
|
"kl": 2.436279296875, |
|
"learning_rate": 1.0729613733905579e-07, |
|
"loss": 0.0974, |
|
"reward": 1.2265625, |
|
"reward_std": 0.5841891095042229, |
|
"rewards/validate_answer_with_correct_format": 0.54296875, |
|
"rewards/validate_format": 0.68359375, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 182.623046875, |
|
"epoch": 0.906951871657754, |
|
"grad_norm": 1.0480223894119263, |
|
"kl": 2.49560546875, |
|
"learning_rate": 9.012875536480687e-08, |
|
"loss": 0.0999, |
|
"reward": 1.173828125, |
|
"reward_std": 0.6358627937734127, |
|
"rewards/validate_answer_with_correct_format": 0.54296875, |
|
"rewards/validate_format": 0.630859375, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 180.333984375, |
|
"epoch": 0.9240641711229947, |
|
"grad_norm": 1.4673086404800415, |
|
"kl": 2.2333984375, |
|
"learning_rate": 7.296137339055794e-08, |
|
"loss": 0.0895, |
|
"reward": 1.158203125, |
|
"reward_std": 0.6267144195735455, |
|
"rewards/validate_answer_with_correct_format": 0.50390625, |
|
"rewards/validate_format": 0.654296875, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 177.82421875, |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 1.3507096767425537, |
|
"kl": 2.12353515625, |
|
"learning_rate": 5.5793991416309014e-08, |
|
"loss": 0.0849, |
|
"reward": 1.212890625, |
|
"reward_std": 0.6212888453155756, |
|
"rewards/validate_answer_with_correct_format": 0.5546875, |
|
"rewards/validate_format": 0.658203125, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 176.25390625, |
|
"epoch": 0.9582887700534759, |
|
"grad_norm": 1.1437978744506836, |
|
"kl": 1.736328125, |
|
"learning_rate": 3.8626609442060086e-08, |
|
"loss": 0.0695, |
|
"reward": 1.244140625, |
|
"reward_std": 0.5541076026856899, |
|
"rewards/validate_answer_with_correct_format": 0.568359375, |
|
"rewards/validate_format": 0.67578125, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 172.615234375, |
|
"epoch": 0.9754010695187165, |
|
"grad_norm": 2.013885974884033, |
|
"kl": 1.874267578125, |
|
"learning_rate": 2.1459227467811158e-08, |
|
"loss": 0.075, |
|
"reward": 1.31640625, |
|
"reward_std": 0.5559380035847425, |
|
"rewards/validate_answer_with_correct_format": 0.6015625, |
|
"rewards/validate_format": 0.71484375, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 173.4765625, |
|
"epoch": 0.9925133689839573, |
|
"grad_norm": 0.9704756140708923, |
|
"kl": 1.89306640625, |
|
"learning_rate": 4.291845493562231e-09, |
|
"loss": 0.0757, |
|
"reward": 1.271484375, |
|
"reward_std": 0.5627450533211231, |
|
"rewards/validate_answer_with_correct_format": 0.59375, |
|
"rewards/validate_format": 0.677734375, |
|
"step": 232 |
|
} |
|
], |
|
"logging_steps": 4, |
|
"max_steps": 233, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|