|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.4203446826397646, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 582.8750228881836, |
|
"epoch": 0.0016813787305590584, |
|
"grad_norm": 0.32669930150754334, |
|
"kl": 0.0, |
|
"learning_rate": 6.666666666666667e-08, |
|
"loss": 0.0, |
|
"reward": 0.6666666865348816, |
|
"reward_std": 0.3535533882677555, |
|
"rewards/format_reward_func": 0.5416666865348816, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 699.4583587646484, |
|
"epoch": 0.003362757461118117, |
|
"grad_norm": 0.5103613895194884, |
|
"kl": 0.0002980232238769531, |
|
"learning_rate": 1.3333333333333334e-07, |
|
"loss": 0.0, |
|
"reward": 0.5833333507180214, |
|
"reward_std": 0.4714045189321041, |
|
"rewards/format_reward_func": 0.541666679084301, |
|
"rewards/solution_reward_func": 0.0416666679084301, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 575.2083511352539, |
|
"epoch": 0.005044136191677175, |
|
"grad_norm": 0.6433224536587134, |
|
"kl": 0.0003399848937988281, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 0.7083333656191826, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 828.2083435058594, |
|
"epoch": 0.006725514922236234, |
|
"grad_norm": 0.5211006593141526, |
|
"kl": 0.00028324127197265625, |
|
"learning_rate": 2.6666666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.708333358168602, |
|
"reward_std": 0.4124789573252201, |
|
"rewards/format_reward_func": 0.6250000149011612, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 572.6666870117188, |
|
"epoch": 0.008406893652795292, |
|
"grad_norm": 0.7556094738301846, |
|
"kl": 0.0003337860107421875, |
|
"learning_rate": 3.333333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.833333358168602, |
|
"reward_std": 0.3535533882677555, |
|
"rewards/format_reward_func": 0.6666666716337204, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 538.7083511352539, |
|
"epoch": 0.01008827238335435, |
|
"grad_norm": 0.7513982192873419, |
|
"kl": 0.0002989768981933594, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0, |
|
"reward": 0.6666666939854622, |
|
"reward_std": 0.4714045189321041, |
|
"rewards/format_reward_func": 0.5833333358168602, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 673.7500152587891, |
|
"epoch": 0.011769651113913409, |
|
"grad_norm": 0.5852912050950652, |
|
"kl": 0.0003135204315185547, |
|
"learning_rate": 4.6666666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.4714045189321041, |
|
"rewards/format_reward_func": 0.7916666865348816, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 711.3750305175781, |
|
"epoch": 0.013451029844472467, |
|
"grad_norm": 0.9204289802606287, |
|
"kl": 0.0003161430358886719, |
|
"learning_rate": 4.999947552503497e-07, |
|
"loss": 0.0, |
|
"reward": 0.6250000074505806, |
|
"reward_std": 0.5303300842642784, |
|
"rewards/format_reward_func": 0.5000000074505806, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 552.1250152587891, |
|
"epoch": 0.015132408575031526, |
|
"grad_norm": 0.9100701777961465, |
|
"kl": 0.0004177093505859375, |
|
"learning_rate": 4.999527985734931e-07, |
|
"loss": 0.0, |
|
"reward": 0.833333358168602, |
|
"reward_std": 0.3535533882677555, |
|
"rewards/format_reward_func": 0.75, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 540.1250228881836, |
|
"epoch": 0.016813787305590584, |
|
"grad_norm": 0.6037876280305763, |
|
"kl": 0.0005059242248535156, |
|
"learning_rate": 4.998688922613787e-07, |
|
"loss": 0.0, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.6666666865348816, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 590.2500152587891, |
|
"epoch": 0.018495166036149643, |
|
"grad_norm": 0.9748922522783966, |
|
"kl": 0.0004825592041015625, |
|
"learning_rate": 4.997430503960219e-07, |
|
"loss": 0.0, |
|
"reward": 0.6666666939854622, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 0.5416666679084301, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 689.5416717529297, |
|
"epoch": 0.0201765447667087, |
|
"grad_norm": 0.4822668116388023, |
|
"kl": 0.000446319580078125, |
|
"learning_rate": 4.995752940974918e-07, |
|
"loss": 0.0, |
|
"reward": 0.7916666865348816, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.7083333432674408, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 556.9166870117188, |
|
"epoch": 0.02185792349726776, |
|
"grad_norm": 0.46041536357233115, |
|
"kl": 0.0004811286926269531, |
|
"learning_rate": 4.993656515203662e-07, |
|
"loss": 0.0, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 0.7500000149011612, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 597.2916717529297, |
|
"epoch": 0.023539302227826818, |
|
"grad_norm": 0.5772106772675187, |
|
"kl": 0.0006680488586425781, |
|
"learning_rate": 4.991141578490066e-07, |
|
"loss": 0.0, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.2357022576034069, |
|
"rewards/format_reward_func": 0.7916666865348816, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 610.5416717529297, |
|
"epoch": 0.025220680958385876, |
|
"grad_norm": 0.8453841880983054, |
|
"kl": 0.0008635520935058594, |
|
"learning_rate": 4.988208552916535e-07, |
|
"loss": 0.0, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.4124789498746395, |
|
"rewards/format_reward_func": 0.7916666865348816, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 672.7083587646484, |
|
"epoch": 0.026902059688944935, |
|
"grad_norm": 0.5771700431960171, |
|
"kl": 0.000789642333984375, |
|
"learning_rate": 4.984857930733419e-07, |
|
"loss": 0.0, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 648.0416717529297, |
|
"epoch": 0.028583438419503993, |
|
"grad_norm": 0.6098823109986267, |
|
"kl": 0.0009946823120117188, |
|
"learning_rate": 4.981090274276405e-07, |
|
"loss": 0.0, |
|
"reward": 0.8333333730697632, |
|
"reward_std": 0.3535533882677555, |
|
"rewards/format_reward_func": 0.708333358168602, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 821.3333587646484, |
|
"epoch": 0.03026481715006305, |
|
"grad_norm": 0.6980212217774439, |
|
"kl": 0.0006208419799804688, |
|
"learning_rate": 4.976906215872137e-07, |
|
"loss": 0.0, |
|
"reward": 0.8333333432674408, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 0.7500000149011612, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 620.2500152587891, |
|
"epoch": 0.031946195880622114, |
|
"grad_norm": 0.0003861918457850463, |
|
"kl": 0.0010890960693359375, |
|
"learning_rate": 4.97230645773209e-07, |
|
"loss": 0.0, |
|
"reward": 0.833333358168602, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 0.7916667014360428, |
|
"rewards/solution_reward_func": 0.0416666679084301, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 533.5833435058594, |
|
"epoch": 0.03362757461118117, |
|
"grad_norm": 0.19582302389190498, |
|
"kl": 0.0016956329345703125, |
|
"learning_rate": 4.967291771834726e-07, |
|
"loss": 0.0, |
|
"reward": 1.0416666865348816, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"rewards/solution_reward_func": 0.2083333358168602, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 554.9166793823242, |
|
"epoch": 0.03530895334174023, |
|
"grad_norm": 0.43787338966334477, |
|
"kl": 0.0012445449829101562, |
|
"learning_rate": 4.961862999795923e-07, |
|
"loss": 0.0, |
|
"reward": 1.0, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 631.4166946411133, |
|
"epoch": 0.036990332072299285, |
|
"grad_norm": 0.7633407511221333, |
|
"kl": 0.0015544891357421875, |
|
"learning_rate": 4.956021052727731e-07, |
|
"loss": 0.0, |
|
"reward": 0.8750000149011612, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.7916666865348816, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 623.9166870117188, |
|
"epoch": 0.03867171080285835, |
|
"grad_norm": 0.9409898030779542, |
|
"kl": 0.0011081695556640625, |
|
"learning_rate": 4.949766911085461e-07, |
|
"loss": 0.0, |
|
"reward": 0.9166666865348816, |
|
"reward_std": 0.3535533882677555, |
|
"rewards/format_reward_func": 0.7500000149011612, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 576.1250152587891, |
|
"epoch": 0.0403530895334174, |
|
"grad_norm": 0.5174245567750604, |
|
"kl": 0.0010805130004882812, |
|
"learning_rate": 4.943101624503132e-07, |
|
"loss": 0.0, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 464.91668701171875, |
|
"epoch": 0.042034468263976464, |
|
"grad_norm": 0.721286337233584, |
|
"kl": 0.0024700164794921875, |
|
"learning_rate": 4.936026311617316e-07, |
|
"loss": 0.0, |
|
"reward": 0.9166667014360428, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 0.8333333432674408, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 604.5000228881836, |
|
"epoch": 0.04371584699453552, |
|
"grad_norm": 0.4391469354984254, |
|
"kl": 0.0014371871948242188, |
|
"learning_rate": 4.928542159879385e-07, |
|
"loss": 0.0, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.3535533882677555, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"rewards/solution_reward_func": 0.2083333395421505, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 487.04168701171875, |
|
"epoch": 0.04539722572509458, |
|
"grad_norm": 0.5796863501691008, |
|
"kl": 0.001621246337890625, |
|
"learning_rate": 4.920650425356239e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000596046448, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 702.7917022705078, |
|
"epoch": 0.047078604455653636, |
|
"grad_norm": 0.45499979356207115, |
|
"kl": 0.0010595321655273438, |
|
"learning_rate": 4.912352432519484e-07, |
|
"loss": 0.0, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"rewards/solution_reward_func": 0.0416666679084301, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 432.5416793823242, |
|
"epoch": 0.0487599831862127, |
|
"grad_norm": 0.7200412766464303, |
|
"kl": 0.002208709716796875, |
|
"learning_rate": 4.90364957402315e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083333432674408, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2500000037252903, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 504.87501525878906, |
|
"epoch": 0.05044136191677175, |
|
"grad_norm": 0.69519393769251, |
|
"kl": 0.00209808349609375, |
|
"learning_rate": 4.894543310469967e-07, |
|
"loss": 0.0, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.8750000298023224, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 665.458366394043, |
|
"epoch": 0.052122740647330815, |
|
"grad_norm": 0.7276068452685034, |
|
"kl": 0.0016222000122070312, |
|
"learning_rate": 4.885035170166228e-07, |
|
"loss": 0.0, |
|
"reward": 0.958333358168602, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.8333333432674408, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 628.0000152587891, |
|
"epoch": 0.05380411937788987, |
|
"grad_norm": 0.5896398911030171, |
|
"kl": 0.0023822784423828125, |
|
"learning_rate": 4.875126748865289e-07, |
|
"loss": 0.0, |
|
"reward": 1.0416667014360428, |
|
"reward_std": 0.4124789535999298, |
|
"rewards/format_reward_func": 0.833333358168602, |
|
"rewards/solution_reward_func": 0.2083333358168602, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 583.1250228881836, |
|
"epoch": 0.05548549810844893, |
|
"grad_norm": 0.3607454748525709, |
|
"kl": 0.0025081634521484375, |
|
"learning_rate": 4.864819709499761e-07, |
|
"loss": 0.0, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 0.8750000298023224, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 734.5833435058594, |
|
"epoch": 0.057166876839007986, |
|
"grad_norm": 0.5186256820202049, |
|
"kl": 0.0015344619750976562, |
|
"learning_rate": 4.854115781902414e-07, |
|
"loss": 0.0, |
|
"reward": 0.916666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 0.8750000298023224, |
|
"rewards/solution_reward_func": 0.0416666679084301, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 738.0833511352539, |
|
"epoch": 0.05884825556956705, |
|
"grad_norm": 0.39707553003489315, |
|
"kl": 0.0017681121826171875, |
|
"learning_rate": 4.843016762515859e-07, |
|
"loss": 0.0, |
|
"reward": 1.0416666865348816, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 625.6250305175781, |
|
"epoch": 0.0605296343001261, |
|
"grad_norm": 0.30316010027843815, |
|
"kl": 0.001819610595703125, |
|
"learning_rate": 4.831524514091056e-07, |
|
"loss": 0.0, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.2357022576034069, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 636.8750228881836, |
|
"epoch": 0.062211013030685165, |
|
"grad_norm": 0.4721947459888053, |
|
"kl": 0.0018463134765625, |
|
"learning_rate": 4.81964096537468e-07, |
|
"loss": 0.0, |
|
"reward": 1.0416666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.8750000149011612, |
|
"rewards/solution_reward_func": 0.1666666679084301, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 508.4166793823242, |
|
"epoch": 0.06389239176124423, |
|
"grad_norm": 0.0005282376556457386, |
|
"kl": 0.0027256011962890625, |
|
"learning_rate": 4.80736811078543e-07, |
|
"loss": 0.0, |
|
"reward": 1.1666666865348816, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2083333395421505, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 460.00001525878906, |
|
"epoch": 0.06557377049180328, |
|
"grad_norm": 0.8701537519403687, |
|
"kl": 0.0024814605712890625, |
|
"learning_rate": 4.794708010079288e-07, |
|
"loss": 0.0, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 602.1666870117188, |
|
"epoch": 0.06725514922236234, |
|
"grad_norm": 0.5672242931158207, |
|
"kl": 0.005603790283203125, |
|
"learning_rate": 4.78166278800385e-07, |
|
"loss": 0.0, |
|
"reward": 1.041666716337204, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 547.5416870117188, |
|
"epoch": 0.06893652795292139, |
|
"grad_norm": 0.6276836580523573, |
|
"kl": 0.004207611083984375, |
|
"learning_rate": 4.7682346339417157e-07, |
|
"loss": 0.0, |
|
"reward": 0.9583333432674408, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.8750000298023224, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 668.9583587646484, |
|
"epoch": 0.07061790668348046, |
|
"grad_norm": 0.00047527262535576603, |
|
"kl": 0.00447845458984375, |
|
"learning_rate": 4.754425801543046e-07, |
|
"loss": 0.0, |
|
"reward": 1.0, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.0416666679084301, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 542.7083511352539, |
|
"epoch": 0.07229928541403952, |
|
"grad_norm": 0.6328447561571717, |
|
"kl": 0.002597808837890625, |
|
"learning_rate": 4.7402386083473364e-07, |
|
"loss": 0.0, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2083333395421505, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 570.7500305175781, |
|
"epoch": 0.07398066414459857, |
|
"grad_norm": 0.48341787098839506, |
|
"kl": 0.0023593902587890625, |
|
"learning_rate": 4.72567543539446e-07, |
|
"loss": 0.0, |
|
"reward": 0.9583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"rewards/solution_reward_func": 0.0416666679084301, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 515.6666793823242, |
|
"epoch": 0.07566204287515763, |
|
"grad_norm": 0.4411126705303185, |
|
"kl": 0.003017425537109375, |
|
"learning_rate": 4.7107387268250586e-07, |
|
"loss": 0.0, |
|
"reward": 1.0416666865348816, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 608.3333435058594, |
|
"epoch": 0.0773434216057167, |
|
"grad_norm": 0.3226336554890738, |
|
"kl": 0.0033473968505859375, |
|
"learning_rate": 4.6954309894703426e-07, |
|
"loss": 0.0, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 456.4166793823242, |
|
"epoch": 0.07902480033627575, |
|
"grad_norm": 0.7644602389120536, |
|
"kl": 0.0032806396484375, |
|
"learning_rate": 4.6797547924313673e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 551.4583587646484, |
|
"epoch": 0.0807061790668348, |
|
"grad_norm": 0.773881603923625, |
|
"kl": 0.003635406494140625, |
|
"learning_rate": 4.6637127666478617e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 494.4583511352539, |
|
"epoch": 0.08238755779739386, |
|
"grad_norm": 0.6252744428861863, |
|
"kl": 0.00386810302734375, |
|
"learning_rate": 4.647307604456674e-07, |
|
"loss": 0.0, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 540.9166870117188, |
|
"epoch": 0.08406893652795293, |
|
"grad_norm": 0.0005101931995823585, |
|
"kl": 0.003147125244140625, |
|
"learning_rate": 4.630542059139923e-07, |
|
"loss": 0.0, |
|
"reward": 1.0, |
|
"reward_std": 0.0, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 447.2916793823242, |
|
"epoch": 0.08575031525851198, |
|
"grad_norm": 0.7334515099745728, |
|
"kl": 0.00638580322265625, |
|
"learning_rate": 4.613418944462906e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000074505806, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 465.45835876464844, |
|
"epoch": 0.08743169398907104, |
|
"grad_norm": 0.2608326039080234, |
|
"kl": 0.004886627197265625, |
|
"learning_rate": 4.5959411342018704e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.1666666679084301, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 479.9166946411133, |
|
"epoch": 0.0891130727196301, |
|
"grad_norm": 0.2883550467956107, |
|
"kl": 0.00536346435546875, |
|
"learning_rate": 4.578111561661702e-07, |
|
"loss": 0.0, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 424.5416793823242, |
|
"epoch": 0.09079445145018916, |
|
"grad_norm": 0.6124946815591971, |
|
"kl": 0.00507354736328125, |
|
"learning_rate": 4.559933219183631e-07, |
|
"loss": 0.0, |
|
"reward": 1.0000000298023224, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.0416666679084301, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 374.3333435058594, |
|
"epoch": 0.09247583018074822, |
|
"grad_norm": 0.003755671104748954, |
|
"kl": 0.01043701171875, |
|
"learning_rate": 4.541409157643027e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 527.3750228881836, |
|
"epoch": 0.09415720891130727, |
|
"grad_norm": 0.5338777369576395, |
|
"kl": 0.00551605224609375, |
|
"learning_rate": 4.5225424859373684e-07, |
|
"loss": 0.0, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2083333395421505, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 381.62500762939453, |
|
"epoch": 0.09583858764186633, |
|
"grad_norm": 0.47889287389319934, |
|
"kl": 0.007801055908203125, |
|
"learning_rate": 4.503336370464475e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 464.70835876464844, |
|
"epoch": 0.0975199663724254, |
|
"grad_norm": 0.5542831400022328, |
|
"kl": 0.005016326904296875, |
|
"learning_rate": 4.4837940345910917e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 537.0000076293945, |
|
"epoch": 0.09920134510298445, |
|
"grad_norm": 0.32822474431275217, |
|
"kl": 0.005290985107421875, |
|
"learning_rate": 4.4639187581119116e-07, |
|
"loss": 0.0, |
|
"reward": 1.0, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.0416666679084301, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 449.8333435058594, |
|
"epoch": 0.1008827238335435, |
|
"grad_norm": 0.6078651118658578, |
|
"kl": 0.005950927734375, |
|
"learning_rate": 4.443713876699123e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000596046448, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 487.08335876464844, |
|
"epoch": 0.10256410256410256, |
|
"grad_norm": 0.0015859284209239941, |
|
"kl": 0.00882720947265625, |
|
"learning_rate": 4.423182781342588e-07, |
|
"loss": 0.0, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 477.7916793823242, |
|
"epoch": 0.10424548129466163, |
|
"grad_norm": 0.8316436306146491, |
|
"kl": 0.00827789306640625, |
|
"learning_rate": 4.402328917780728e-07, |
|
"loss": 0.0, |
|
"reward": 1.041666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 360.37500762939453, |
|
"epoch": 0.10592686002522068, |
|
"grad_norm": 0.5068283502332778, |
|
"kl": 0.0106048583984375, |
|
"learning_rate": 4.381155785922225e-07, |
|
"loss": 0.0, |
|
"reward": 1.166666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 373.25000762939453, |
|
"epoch": 0.10760823875577974, |
|
"grad_norm": 0.0008593493041121279, |
|
"kl": 0.0091400146484375, |
|
"learning_rate": 4.3596669392586363e-07, |
|
"loss": 0.0, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 373.3333435058594, |
|
"epoch": 0.1092896174863388, |
|
"grad_norm": 0.7307467398455231, |
|
"kl": 0.0075836181640625, |
|
"learning_rate": 4.337865984268001e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.1250000037252903, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 394.6666717529297, |
|
"epoch": 0.11097099621689786, |
|
"grad_norm": 0.8811859841007285, |
|
"kl": 0.010345458984375, |
|
"learning_rate": 4.3157565798095746e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000074505806, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 341.2916793823242, |
|
"epoch": 0.11265237494745692, |
|
"grad_norm": 0.7652707329784861, |
|
"kl": 0.0125885009765625, |
|
"learning_rate": 4.293342436509756e-07, |
|
"loss": 0.0, |
|
"reward": 1.1666666865348816, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"rewards/solution_reward_func": 0.2500000037252903, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 478.3333435058594, |
|
"epoch": 0.11433375367801597, |
|
"grad_norm": 0.6268940088226042, |
|
"kl": 0.0119781494140625, |
|
"learning_rate": 4.2706273161393326e-07, |
|
"loss": 0.0, |
|
"reward": 1.1666667461395264, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.1666666716337204, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 357.8333435058594, |
|
"epoch": 0.11601513240857503, |
|
"grad_norm": 0.6940515666291348, |
|
"kl": 0.0140228271484375, |
|
"learning_rate": 4.2476150309821437e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2083333395421505, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 398.0833435058594, |
|
"epoch": 0.1176965111391341, |
|
"grad_norm": 0.4068021983578087, |
|
"kl": 0.0218048095703125, |
|
"learning_rate": 4.2243094431952607e-07, |
|
"loss": 0.0, |
|
"reward": 1.2916666865348816, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.3333333395421505, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 391.5416717529297, |
|
"epoch": 0.11937788986969315, |
|
"grad_norm": 1.054400750652676, |
|
"kl": 0.0133209228515625, |
|
"learning_rate": 4.2007144641608035e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000074505806, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 362.9166717529297, |
|
"epoch": 0.1210592686002522, |
|
"grad_norm": 0.7202533244508015, |
|
"kl": 0.0169525146484375, |
|
"learning_rate": 4.1768340538294914e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2500000037252903, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 392.75000762939453, |
|
"epoch": 0.12274064733081126, |
|
"grad_norm": 0.5668355112824374, |
|
"kl": 0.0153656005859375, |
|
"learning_rate": 4.1526722200560436e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083333432674408, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2083333358168602, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 488.00001525878906, |
|
"epoch": 0.12442202606137033, |
|
"grad_norm": 0.4109654167954109, |
|
"kl": 0.0104522705078125, |
|
"learning_rate": 4.1282330179265377e-07, |
|
"loss": 0.0, |
|
"reward": 1.0833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 379.00000762939453, |
|
"epoch": 0.12610340479192939, |
|
"grad_norm": 0.6511266004361977, |
|
"kl": 0.0153656005859375, |
|
"learning_rate": 4.1035205490778496e-07, |
|
"loss": 0.0, |
|
"reward": 1.291666716337204, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2916666716337204, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 477.4583511352539, |
|
"epoch": 0.12778478352248845, |
|
"grad_norm": 0.41009026844574226, |
|
"kl": 0.0298004150390625, |
|
"learning_rate": 4.078538961009268e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000037252903, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 380.6666793823242, |
|
"epoch": 0.1294661622530475, |
|
"grad_norm": 0.41679517817122624, |
|
"kl": 0.0112762451171875, |
|
"learning_rate": 4.0532924463864214e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000298023224, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000037252903, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 331.00001525878906, |
|
"epoch": 0.13114754098360656, |
|
"grad_norm": 0.002301312664670671, |
|
"kl": 0.0154876708984375, |
|
"learning_rate": 4.027785242337625e-07, |
|
"loss": 0.0, |
|
"reward": 1.291666716337204, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2916666753590107, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 337.1666717529297, |
|
"epoch": 0.1328289197141656, |
|
"grad_norm": 0.41037493061839153, |
|
"kl": 0.01825714111328125, |
|
"learning_rate": 4.002021629742759e-07, |
|
"loss": 0.0, |
|
"reward": 1.25, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.25, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 355.3333435058594, |
|
"epoch": 0.13451029844472467, |
|
"grad_norm": 0.4834324481747559, |
|
"kl": 0.0101470947265625, |
|
"learning_rate": 3.9760059325148063e-07, |
|
"loss": 0.0, |
|
"reward": 1.291666716337204, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2916666753590107, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 475.8333511352539, |
|
"epoch": 0.13619167717528374, |
|
"grad_norm": 0.0007131492797933627, |
|
"kl": 0.00870513916015625, |
|
"learning_rate": 3.949742516874175e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333432674408, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 400.0416793823242, |
|
"epoch": 0.13787305590584278, |
|
"grad_norm": 0.6704272888706837, |
|
"kl": 0.01324462890625, |
|
"learning_rate": 3.9232357906159065e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000074505806, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 391.62501525878906, |
|
"epoch": 0.13955443463640185, |
|
"grad_norm": 0.8927607188893651, |
|
"kl": 0.00870513916015625, |
|
"learning_rate": 3.8964902023699234e-07, |
|
"loss": 0.0, |
|
"reward": 1.2916666865348816, |
|
"reward_std": 0.4124789498746395, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2916666679084301, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 432.1666793823242, |
|
"epoch": 0.14123581336696092, |
|
"grad_norm": 0.5356539936839141, |
|
"kl": 0.00945281982421875, |
|
"learning_rate": 3.869510240854407e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000037252903, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 400.9583511352539, |
|
"epoch": 0.14291719209751996, |
|
"grad_norm": 0.6797066476960871, |
|
"kl": 0.0103912353515625, |
|
"learning_rate": 3.8423004341224595e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083334028720856, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2083333395421505, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 465.2083511352539, |
|
"epoch": 0.14459857082807903, |
|
"grad_norm": 0.7805415268970916, |
|
"kl": 0.00925445556640625, |
|
"learning_rate": 3.8148653488021566e-07, |
|
"loss": 0.0, |
|
"reward": 1.3333333730697632, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3333333395421505, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 435.0833435058594, |
|
"epoch": 0.14627994955863807, |
|
"grad_norm": 0.4294043427967522, |
|
"kl": 0.006561279296875, |
|
"learning_rate": 3.787209589330134e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2500000037252903, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 474.00001525878906, |
|
"epoch": 0.14796132828919714, |
|
"grad_norm": 0.5780056567965537, |
|
"kl": 0.0077667236328125, |
|
"learning_rate": 3.759337797178816e-07, |
|
"loss": 0.0, |
|
"reward": 1.2916667461395264, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2916666753590107, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 483.50000762939453, |
|
"epoch": 0.1496427070197562, |
|
"grad_norm": 0.7711416518392432, |
|
"kl": 0.00598907470703125, |
|
"learning_rate": 3.7312546500774455e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583334028720856, |
|
"reward_std": 0.4124789573252201, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333469927311, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 560.7916793823242, |
|
"epoch": 0.15132408575031525, |
|
"grad_norm": 0.6275316044622387, |
|
"kl": 0.007049560546875, |
|
"learning_rate": 3.7029648612270123e-07, |
|
"loss": 0.0, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.1666666679084301, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 493.2916793823242, |
|
"epoch": 0.15300546448087432, |
|
"grad_norm": 0.41941279841402085, |
|
"kl": 0.00750732421875, |
|
"learning_rate": 3.6744731785092393e-07, |
|
"loss": 0.0, |
|
"reward": 1.1666666865348816, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2083333395421505, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 431.25001525878906, |
|
"epoch": 0.1546868432114334, |
|
"grad_norm": 0.5134185107434652, |
|
"kl": 0.0072479248046875, |
|
"learning_rate": 3.6457843836897417e-07, |
|
"loss": 0.0, |
|
"reward": 1.416666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4166666753590107, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 427.2916717529297, |
|
"epoch": 0.15636822194199243, |
|
"grad_norm": 0.498752511394473, |
|
"kl": 0.01192474365234375, |
|
"learning_rate": 3.6169032916155055e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083334028720856, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2083333395421505, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 529.5833435058594, |
|
"epoch": 0.1580496006725515, |
|
"grad_norm": 0.484964125720315, |
|
"kl": 0.0084381103515625, |
|
"learning_rate": 3.587834749406808e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2500000074505806, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 442.75000762939453, |
|
"epoch": 0.15973097940311054, |
|
"grad_norm": 0.29670849204677346, |
|
"kl": 0.00942230224609375, |
|
"learning_rate": 3.558583635643726e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000298023224, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000111758709, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 461.12501525878906, |
|
"epoch": 0.1614123581336696, |
|
"grad_norm": 0.8668624278803033, |
|
"kl": 0.010345458984375, |
|
"learning_rate": 3.52915485954736e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000074505806, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 552.1250076293945, |
|
"epoch": 0.16309373686422868, |
|
"grad_norm": 0.5686603271249641, |
|
"kl": 0.00679779052734375, |
|
"learning_rate": 3.4995533601559225e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2916666753590107, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 489.75000762939453, |
|
"epoch": 0.16477511559478772, |
|
"grad_norm": 0.9001750933871568, |
|
"kl": 0.01209259033203125, |
|
"learning_rate": 3.469784105495816e-07, |
|
"loss": 0.0, |
|
"reward": 1.3750000298023224, |
|
"reward_std": 0.4124789573252201, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.4166666716337204, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 504.29168701171875, |
|
"epoch": 0.1664564943253468, |
|
"grad_norm": 0.351872496477236, |
|
"kl": 0.01596832275390625, |
|
"learning_rate": 3.4398520917478476e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000298023224, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000037252903, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 551.7083511352539, |
|
"epoch": 0.16813787305590586, |
|
"grad_norm": 0.7904185495449961, |
|
"kl": 0.0072021484375, |
|
"learning_rate": 3.409762342408719e-07, |
|
"loss": 0.0, |
|
"reward": 1.2916666865348816, |
|
"reward_std": 0.4124789535999298, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2916666716337204, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 436.7916793823242, |
|
"epoch": 0.1698192517864649, |
|
"grad_norm": 0.30374732572574603, |
|
"kl": 0.01061248779296875, |
|
"learning_rate": 3.379519907447931e-07, |
|
"loss": 0.0, |
|
"reward": 1.2916666865348816, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2916666679084301, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 494.2083511352539, |
|
"epoch": 0.17150063051702397, |
|
"grad_norm": 0.4216305294562641, |
|
"kl": 0.00971221923828125, |
|
"learning_rate": 3.349129862460251e-07, |
|
"loss": 0.0, |
|
"reward": 1.2500000596046448, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2500000074505806, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 436.58333587646484, |
|
"epoch": 0.173182009247583, |
|
"grad_norm": 0.34898976819652716, |
|
"kl": 0.00868988037109375, |
|
"learning_rate": 3.318597307813866e-07, |
|
"loss": 0.0, |
|
"reward": 1.5416666865348816, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.541666679084301, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 552.3333511352539, |
|
"epoch": 0.17486338797814208, |
|
"grad_norm": 0.001036227801857544, |
|
"kl": 0.00815582275390625, |
|
"learning_rate": 3.287927367794397e-07, |
|
"loss": 0.0, |
|
"reward": 1.125, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.125, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 541.2500228881836, |
|
"epoch": 0.17654476670870115, |
|
"grad_norm": 0.6275771663735162, |
|
"kl": 0.11170196533203125, |
|
"learning_rate": 3.2571251897448763e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3333333432674408, |
|
"reward_std": 0.2357022576034069, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3333333358168602, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 530.9166717529297, |
|
"epoch": 0.1782261454392602, |
|
"grad_norm": 0.5568701150128608, |
|
"kl": 0.011932373046875, |
|
"learning_rate": 3.226195943201883e-07, |
|
"loss": 0.0, |
|
"reward": 1.041666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.0833333358168602, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 503.0416717529297, |
|
"epoch": 0.17990752416981926, |
|
"grad_norm": 0.668445204981218, |
|
"kl": 0.01616668701171875, |
|
"learning_rate": 3.1951448190279253e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583334028720856, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333469927311, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 493.5416717529297, |
|
"epoch": 0.18158890290037832, |
|
"grad_norm": 0.8478573951713925, |
|
"kl": 0.00855255126953125, |
|
"learning_rate": 3.163977028540263e-07, |
|
"loss": 0.0, |
|
"reward": 1.416666716337204, |
|
"reward_std": 0.4714045189321041, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4166666753590107, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 591.0416870117188, |
|
"epoch": 0.18327028163093737, |
|
"grad_norm": 0.7348765645347503, |
|
"kl": 0.0094451904296875, |
|
"learning_rate": 3.1326978026362905e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.4124789535999298, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2500000074505806, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 406.12500762939453, |
|
"epoch": 0.18495166036149643, |
|
"grad_norm": 0.6283226728886679, |
|
"kl": 0.00975799560546875, |
|
"learning_rate": 3.101312390915634e-07, |
|
"loss": 0.0, |
|
"reward": 1.2916667461395264, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2916666753590107, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 434.6666793823242, |
|
"epoch": 0.18663303909205547, |
|
"grad_norm": 0.46740684202706906, |
|
"kl": 0.0135955810546875, |
|
"learning_rate": 3.069826060799109e-07, |
|
"loss": 0.0, |
|
"reward": 1.2083333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.2500000074505806, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 512.3750152587891, |
|
"epoch": 0.18831441782261454, |
|
"grad_norm": 0.6829987852089712, |
|
"kl": 0.0085601806640625, |
|
"learning_rate": 3.038244096644687e-07, |
|
"loss": 0.0, |
|
"reward": 1.3750000298023224, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3750000037252903, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 443.3333435058594, |
|
"epoch": 0.1899957965531736, |
|
"grad_norm": 0.7302277802124593, |
|
"kl": 0.0476226806640625, |
|
"learning_rate": 3.0065717988606256e-07, |
|
"loss": 0.0, |
|
"reward": 1.3750000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3750000149011612, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 450.37500762939453, |
|
"epoch": 0.19167717528373265, |
|
"grad_norm": 0.0033526514387488097, |
|
"kl": 0.0142059326171875, |
|
"learning_rate": 2.974814483015892e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333432674408, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 484.16668701171875, |
|
"epoch": 0.19335855401429172, |
|
"grad_norm": 0.4236067078997056, |
|
"kl": 0.01483154296875, |
|
"learning_rate": 2.942977478948057e-07, |
|
"loss": 0.0, |
|
"reward": 1.291666716337204, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.3333333395421505, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 470.75001525878906, |
|
"epoch": 0.1950399327448508, |
|
"grad_norm": 0.5734136074701052, |
|
"kl": 0.0141448974609375, |
|
"learning_rate": 2.911066129868782e-07, |
|
"loss": 0.0, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000074505806, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 539.8750152587891, |
|
"epoch": 0.19672131147540983, |
|
"grad_norm": 0.8539808804727996, |
|
"kl": 0.02154541015625, |
|
"learning_rate": 2.87908579146707e-07, |
|
"loss": 0.0, |
|
"reward": 1.3333333730697632, |
|
"reward_std": 0.3535533882677555, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.3750000074505806, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 491.7500228881836, |
|
"epoch": 0.1984026902059689, |
|
"grad_norm": 0.3028340455454943, |
|
"kl": 0.0213470458984375, |
|
"learning_rate": 2.847041831010417e-07, |
|
"loss": 0.0, |
|
"reward": 1.3750000596046448, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3750000074505806, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 376.50000762939453, |
|
"epoch": 0.20008406893652794, |
|
"grad_norm": 0.31222669099640876, |
|
"kl": 0.020355224609375, |
|
"learning_rate": 2.8149396264440227e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333432674408, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 343.5833435058594, |
|
"epoch": 0.201765447667087, |
|
"grad_norm": 0.7628977182906207, |
|
"kl": 0.0173187255859375, |
|
"learning_rate": 2.782784565488211e-07, |
|
"loss": 0.0, |
|
"reward": 1.4166666865348816, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4166666679084301, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 347.7916717529297, |
|
"epoch": 0.20344682639764608, |
|
"grad_norm": 0.9647796082832435, |
|
"kl": 0.033447265625, |
|
"learning_rate": 2.7505820447342024e-07, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5416666828095913, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 367.0833435058594, |
|
"epoch": 0.20512820512820512, |
|
"grad_norm": 0.5497011882389701, |
|
"kl": 0.01873779296875, |
|
"learning_rate": 2.7183374687384096e-07, |
|
"loss": 0.0, |
|
"reward": 1.3750000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3750000111758709, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 386.12500762939453, |
|
"epoch": 0.2068095838587642, |
|
"grad_norm": 0.7366647205721275, |
|
"kl": 0.028411865234375, |
|
"learning_rate": 2.686056249115385e-07, |
|
"loss": 0.0, |
|
"reward": 1.3333333432674408, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3333333358168602, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 490.3333435058594, |
|
"epoch": 0.20849096258932326, |
|
"grad_norm": 0.3640781084742551, |
|
"kl": 0.0124053955078125, |
|
"learning_rate": 2.653743803629587e-07, |
|
"loss": 0.0, |
|
"reward": 1.3750000298023224, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3750000037252903, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 496.6666793823242, |
|
"epoch": 0.2101723413198823, |
|
"grad_norm": 0.4955566369463096, |
|
"kl": 0.02813720703125, |
|
"learning_rate": 2.621405555286121e-07, |
|
"loss": 0.0, |
|
"reward": 1.3750000596046448, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3750000111758709, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 403.0000114440918, |
|
"epoch": 0.21185372005044137, |
|
"grad_norm": 0.4129623000055268, |
|
"kl": 0.02422332763671875, |
|
"learning_rate": 2.589046931420589e-07, |
|
"loss": 0.0, |
|
"reward": 1.5000000596046448, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000149011612, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 381.75000762939453, |
|
"epoch": 0.2135350987810004, |
|
"grad_norm": 0.3516450115328973, |
|
"kl": 0.0178985595703125, |
|
"learning_rate": 2.556673362788225e-07, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.541666679084301, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 458.0416793823242, |
|
"epoch": 0.21521647751155948, |
|
"grad_norm": 0.4125605646498622, |
|
"kl": 0.0144500732421875, |
|
"learning_rate": 2.524290282652443e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333395421505, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 514.2083511352539, |
|
"epoch": 0.21689785624211855, |
|
"grad_norm": 0.2564484003528315, |
|
"kl": 0.0143585205078125, |
|
"learning_rate": 2.4919031258729785e-07, |
|
"loss": 0.0, |
|
"reward": 1.291666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.2916666716337204, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 437.8333511352539, |
|
"epoch": 0.2185792349726776, |
|
"grad_norm": 1.0909769396928994, |
|
"kl": 0.015350341796875, |
|
"learning_rate": 2.459517327993746e-07, |
|
"loss": 0.0, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.3535533882677555, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000074505806, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 408.3333435058594, |
|
"epoch": 0.22026061370323666, |
|
"grad_norm": 0.4944126035221868, |
|
"kl": 0.036376953125, |
|
"learning_rate": 2.427138324330601e-07, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666939854622, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 484.45835876464844, |
|
"epoch": 0.22194199243379573, |
|
"grad_norm": 0.5710232427079407, |
|
"kl": 0.0914764404296875, |
|
"learning_rate": 2.3947715490591203e-07, |
|
"loss": 0.0001, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.541666679084301, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 381.6666793823242, |
|
"epoch": 0.22362337116435477, |
|
"grad_norm": 0.5414781111970816, |
|
"kl": 0.0176849365234375, |
|
"learning_rate": 2.3624224343025876e-07, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.2357022576034069, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333432674408, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 381.9166793823242, |
|
"epoch": 0.22530474989491384, |
|
"grad_norm": 0.2752143657107066, |
|
"kl": 0.015380859375, |
|
"learning_rate": 2.3300964092203203e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333432674408, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 409.4166717529297, |
|
"epoch": 0.22698612862547288, |
|
"grad_norm": 0.7197256320386043, |
|
"kl": 0.012451171875, |
|
"learning_rate": 2.2977988990964896e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333432674408, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 402.5416717529297, |
|
"epoch": 0.22866750735603195, |
|
"grad_norm": 1.0400541151794251, |
|
"kl": 0.0240020751953125, |
|
"learning_rate": 2.2655353244295927e-07, |
|
"loss": 0.0, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.3535533882677555, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000074505806, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 459.62501525878906, |
|
"epoch": 0.23034888608659101, |
|
"grad_norm": 0.5084744997876609, |
|
"kl": 0.020263671875, |
|
"learning_rate": 2.233311100022734e-07, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5416666828095913, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 366.4583435058594, |
|
"epoch": 0.23203026481715006, |
|
"grad_norm": 0.7424502755105113, |
|
"kl": 0.1666107177734375, |
|
"learning_rate": 2.2011316340748528e-07, |
|
"loss": 0.0002, |
|
"reward": 1.6250000298023224, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6250000149011612, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 495.16668701171875, |
|
"epoch": 0.23371164354770912, |
|
"grad_norm": 0.8257211078506724, |
|
"kl": 0.0146484375, |
|
"learning_rate": 2.1690023272730678e-07, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.5303300879895687, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.5833333432674408, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 457.7916793823242, |
|
"epoch": 0.2353930222782682, |
|
"grad_norm": 0.42375918647040944, |
|
"kl": 0.0121307373046875, |
|
"learning_rate": 2.1369285718862748e-07, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.541666679084301, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 420.37500762939453, |
|
"epoch": 0.23707440100882723, |
|
"grad_norm": 0.5152043630269939, |
|
"kl": 0.0162506103515625, |
|
"learning_rate": 2.104915750860164e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333395421505, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 385.8333511352539, |
|
"epoch": 0.2387557797393863, |
|
"grad_norm": 0.49294668816422704, |
|
"kl": 0.0164794921875, |
|
"learning_rate": 2.072969236913799e-07, |
|
"loss": 0.0, |
|
"reward": 1.3333333730697632, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3333333358168602, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 392.62500762939453, |
|
"epoch": 0.24043715846994534, |
|
"grad_norm": 0.6512225875746797, |
|
"kl": 0.01849365234375, |
|
"learning_rate": 2.0410943916379097e-07, |
|
"loss": 0.0, |
|
"reward": 1.416666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4166666753590107, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 412.12501525878906, |
|
"epoch": 0.2421185372005044, |
|
"grad_norm": 0.3660817551390846, |
|
"kl": 0.010711669921875, |
|
"learning_rate": 2.0092965645950564e-07, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333395421505, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 430.12500762939453, |
|
"epoch": 0.24379991593106348, |
|
"grad_norm": 0.626448385607845, |
|
"kl": 0.0183258056640625, |
|
"learning_rate": 1.977581092421812e-07, |
|
"loss": 0.0, |
|
"reward": 1.416666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4166666753590107, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 396.7083435058594, |
|
"epoch": 0.24548129466162252, |
|
"grad_norm": 0.004913345703168958, |
|
"kl": 0.020263671875, |
|
"learning_rate": 1.9459532979331148e-07, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5416666865348816, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 493.9583511352539, |
|
"epoch": 0.2471626733921816, |
|
"grad_norm": 0.5565359495913534, |
|
"kl": 0.0181427001953125, |
|
"learning_rate": 1.9144184892289336e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333432674408, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.5000000074505806, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 422.45835876464844, |
|
"epoch": 0.24884405212274066, |
|
"grad_norm": 0.449063011244765, |
|
"kl": 0.0212249755859375, |
|
"learning_rate": 1.882981958803414e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333432674408, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333358168602, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 511.58335876464844, |
|
"epoch": 0.2505254308532997, |
|
"grad_norm": 0.5020099782167112, |
|
"kl": 0.011138916015625, |
|
"learning_rate": 1.8516489826566374e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333432674408, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333358168602, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 412.25000762939453, |
|
"epoch": 0.25220680958385877, |
|
"grad_norm": 0.0014012414260649606, |
|
"kl": 0.02069091796875, |
|
"learning_rate": 1.8204248194091425e-07, |
|
"loss": 0.0, |
|
"reward": 1.7083333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7083333507180214, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 450.8333435058594, |
|
"epoch": 0.25388818831441784, |
|
"grad_norm": 0.24821505940896457, |
|
"kl": 0.013641357421875, |
|
"learning_rate": 1.7893147094193784e-07, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.5833333432674408, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 407.37501525878906, |
|
"epoch": 0.2555695670449769, |
|
"grad_norm": 0.7504631135855148, |
|
"kl": 0.0157470703125, |
|
"learning_rate": 1.7583238739042084e-07, |
|
"loss": 0.0, |
|
"reward": 1.7083333432674408, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7083333432674408, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 454.37500762939453, |
|
"epoch": 0.2572509457755359, |
|
"grad_norm": 0.6337846991609941, |
|
"kl": 0.0143890380859375, |
|
"learning_rate": 1.7274575140626315e-07, |
|
"loss": 0.0, |
|
"reward": 1.7083334028720856, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.708333358168602, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 444.0833435058594, |
|
"epoch": 0.258932324506095, |
|
"grad_norm": 0.6680546776528479, |
|
"kl": 0.0210113525390625, |
|
"learning_rate": 1.6967208102028696e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333432674408, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 415.62500762939453, |
|
"epoch": 0.26061370323665406, |
|
"grad_norm": 0.390703161764533, |
|
"kl": 0.054595947265625, |
|
"learning_rate": 1.6661189208729489e-07, |
|
"loss": 0.0001, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000149011612, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 390.83333587646484, |
|
"epoch": 0.26229508196721313, |
|
"grad_norm": 0.42044370258908614, |
|
"kl": 0.016571044921875, |
|
"learning_rate": 1.6356569819949427e-07, |
|
"loss": 0.0, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000149011612, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 430.87501525878906, |
|
"epoch": 0.2639764606977722, |
|
"grad_norm": 0.30802749969680127, |
|
"kl": 0.020721435546875, |
|
"learning_rate": 1.6053401060030097e-07, |
|
"loss": 0.0, |
|
"reward": 1.7500000596046448, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7500000298023224, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 381.2916793823242, |
|
"epoch": 0.2656578394283312, |
|
"grad_norm": 0.4549527318175164, |
|
"kl": 0.0202789306640625, |
|
"learning_rate": 1.57517338098537e-07, |
|
"loss": 0.0, |
|
"reward": 1.5833333432674408, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333358168602, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 365.08333587646484, |
|
"epoch": 0.2673392181588903, |
|
"grad_norm": 0.46488928219712783, |
|
"kl": 0.032012939453125, |
|
"learning_rate": 1.545161869830371e-07, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666939854622, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 353.62501525878906, |
|
"epoch": 0.26902059688944935, |
|
"grad_norm": 0.8439115565804695, |
|
"kl": 0.018096923828125, |
|
"learning_rate": 1.5153106093767825e-07, |
|
"loss": 0.0, |
|
"reward": 1.7500000596046448, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7500000298023224, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 425.3333511352539, |
|
"epoch": 0.2707019756200084, |
|
"grad_norm": 0.0030299941880457203, |
|
"kl": 0.016815185546875, |
|
"learning_rate": 1.4856246095684622e-07, |
|
"loss": 0.0, |
|
"reward": 1.8750000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.8750000149011612, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 408.4583435058594, |
|
"epoch": 0.2723833543505675, |
|
"grad_norm": 0.45480636781617306, |
|
"kl": 0.024200439453125, |
|
"learning_rate": 1.4561088526135374e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333432674408, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333358168602, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 354.9583435058594, |
|
"epoch": 0.2740647330811265, |
|
"grad_norm": 0.3157474969297678, |
|
"kl": 0.019775390625, |
|
"learning_rate": 1.4267682921482356e-07, |
|
"loss": 0.0, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000074505806, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 461.16668701171875, |
|
"epoch": 0.27574611181168557, |
|
"grad_norm": 0.8364092517911256, |
|
"kl": 0.0169219970703125, |
|
"learning_rate": 1.3976078524055203e-07, |
|
"loss": 0.0, |
|
"reward": 1.3333333730697632, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3333333395421505, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 403.95833587646484, |
|
"epoch": 0.27742749054224464, |
|
"grad_norm": 0.8372601176868383, |
|
"kl": 0.0169219970703125, |
|
"learning_rate": 1.3686324273886528e-07, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333469927311, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 381.5833435058594, |
|
"epoch": 0.2791088692728037, |
|
"grad_norm": 0.7788446171406045, |
|
"kl": 0.026214599609375, |
|
"learning_rate": 1.339846880049829e-07, |
|
"loss": 0.0, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000111758709, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 487.7916793823242, |
|
"epoch": 0.2807902480033628, |
|
"grad_norm": 0.9953471506857543, |
|
"kl": 0.0202484130859375, |
|
"learning_rate": 1.3112560414740313e-07, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.541666679084301, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 499.62500762939453, |
|
"epoch": 0.28247162673392184, |
|
"grad_norm": 0.35616817874834467, |
|
"kl": 0.02581787109375, |
|
"learning_rate": 1.2828647100682261e-07, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.666666679084301, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 454.2083435058594, |
|
"epoch": 0.28415300546448086, |
|
"grad_norm": 0.5428149223244878, |
|
"kl": 0.014923095703125, |
|
"learning_rate": 1.2546776507560467e-07, |
|
"loss": 0.0, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000149011612, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 399.4166717529297, |
|
"epoch": 0.2858343841950399, |
|
"grad_norm": 0.9031786183823988, |
|
"kl": 0.0172576904296875, |
|
"learning_rate": 1.2266995941780933e-07, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.541666679084301, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 453.66667556762695, |
|
"epoch": 0.287515762925599, |
|
"grad_norm": 0.6316532508922427, |
|
"kl": 0.040191650390625, |
|
"learning_rate": 1.1989352358979888e-07, |
|
"loss": 0.0, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.541666679084301, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 392.25000762939453, |
|
"epoch": 0.28919714165615806, |
|
"grad_norm": 0.7866237262244063, |
|
"kl": 0.0247802734375, |
|
"learning_rate": 1.1713892356143238e-07, |
|
"loss": 0.0, |
|
"reward": 1.8333333730697632, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.8333333730697632, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 382.58333587646484, |
|
"epoch": 0.29087852038671713, |
|
"grad_norm": 0.8580927088394277, |
|
"kl": 0.0238189697265625, |
|
"learning_rate": 1.1440662163786166e-07, |
|
"loss": 0.0, |
|
"reward": 1.416666716337204, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.416666679084301, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 476.0416793823242, |
|
"epoch": 0.29255989911727615, |
|
"grad_norm": 0.3362942892798578, |
|
"kl": 0.0136871337890625, |
|
"learning_rate": 1.1169707638194237e-07, |
|
"loss": 0.0, |
|
"reward": 1.7500000596046448, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7500000298023224, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 538.2083511352539, |
|
"epoch": 0.2942412778478352, |
|
"grad_norm": 0.2941167148030311, |
|
"kl": 0.01708984375, |
|
"learning_rate": 1.0901074253727336e-07, |
|
"loss": 0.0, |
|
"reward": 1.6666666865348816, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.666666679084301, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 380.75, |
|
"epoch": 0.2959226565783943, |
|
"grad_norm": 0.5138201935880581, |
|
"kl": 0.023223876953125, |
|
"learning_rate": 1.0634807095187737e-07, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333432674408, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 380.4583435058594, |
|
"epoch": 0.29760403530895335, |
|
"grad_norm": 0.6613574228632922, |
|
"kl": 0.14385986328125, |
|
"learning_rate": 1.0370950850253449e-07, |
|
"loss": 0.0001, |
|
"reward": 1.6666666865348816, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666865348816, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 424.87500762939453, |
|
"epoch": 0.2992854140395124, |
|
"grad_norm": 0.4785401178732256, |
|
"kl": 0.0345916748046875, |
|
"learning_rate": 1.0109549801978304e-07, |
|
"loss": 0.0, |
|
"reward": 1.7083333432674408, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7083333432674408, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 348.7916793823242, |
|
"epoch": 0.30096679277007143, |
|
"grad_norm": 0.0020426538152803417, |
|
"kl": 0.024261474609375, |
|
"learning_rate": 9.850647821359917e-08, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333432674408, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 475.79168701171875, |
|
"epoch": 0.3026481715006305, |
|
"grad_norm": 0.429195015613074, |
|
"kl": 0.0204010009765625, |
|
"learning_rate": 9.594288359976815e-08, |
|
"loss": 0.0, |
|
"reward": 1.6250000596046448, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6250000149011612, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 494.25001525878906, |
|
"epoch": 0.30432955023118957, |
|
"grad_norm": 0.8473545963282998, |
|
"kl": 0.0186004638671875, |
|
"learning_rate": 9.340514442695952e-08, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.4124789573252201, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5416666865348816, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 492.8750228881836, |
|
"epoch": 0.30601092896174864, |
|
"grad_norm": 0.3234727873687572, |
|
"kl": 0.01751708984375, |
|
"learning_rate": 9.089368660451798e-08, |
|
"loss": 0.0, |
|
"reward": 1.5833333432674408, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333358168602, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 452.2916793823242, |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 0.40422101797827664, |
|
"kl": 0.025787353515625, |
|
"learning_rate": 8.840893163098332e-08, |
|
"loss": 0.0, |
|
"reward": 1.6666666865348816, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666865348816, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 368.6666793823242, |
|
"epoch": 0.3093736864228668, |
|
"grad_norm": 0.7245593032700843, |
|
"kl": 0.0194549560546875, |
|
"learning_rate": 8.595129652335017e-08, |
|
"loss": 0.0, |
|
"reward": 1.7083334028720856, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.708333358168602, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 397.08334732055664, |
|
"epoch": 0.3110550651534258, |
|
"grad_norm": 0.8562497096589754, |
|
"kl": 0.0177154541015625, |
|
"learning_rate": 8.352119374707977e-08, |
|
"loss": 0.0, |
|
"reward": 1.7916666865348816, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7916666865348816, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 443.2083435058594, |
|
"epoch": 0.31273644388398486, |
|
"grad_norm": 0.43124200301124715, |
|
"kl": 0.020538330078125, |
|
"learning_rate": 8.11190311468759e-08, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666939854622, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 469.9583435058594, |
|
"epoch": 0.31441782261454393, |
|
"grad_norm": 0.6436306808211317, |
|
"kl": 0.076263427734375, |
|
"learning_rate": 7.87452118782363e-08, |
|
"loss": 0.0001, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333507180214, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 424.7083511352539, |
|
"epoch": 0.316099201345103, |
|
"grad_norm": 0.8624260825303681, |
|
"kl": 0.0174713134765625, |
|
"learning_rate": 7.640013433979093e-08, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666865348816, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 480.4583435058594, |
|
"epoch": 0.31778058007566207, |
|
"grad_norm": 0.4768870927728356, |
|
"kl": 0.019195556640625, |
|
"learning_rate": 7.408419210643846e-08, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.583333358168602, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 457.7083435058594, |
|
"epoch": 0.3194619588062211, |
|
"grad_norm": 0.7325139897043152, |
|
"kl": 0.0226287841796875, |
|
"learning_rate": 7.179777386329275e-08, |
|
"loss": 0.0, |
|
"reward": 1.7083333730697632, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.708333358168602, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 492.0416793823242, |
|
"epoch": 0.32114333753678015, |
|
"grad_norm": 0.6812039461038883, |
|
"kl": 0.0167388916015625, |
|
"learning_rate": 6.954126334044949e-08, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333432674408, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 427.75001525878906, |
|
"epoch": 0.3228247162673392, |
|
"grad_norm": 0.26789925678872634, |
|
"kl": 0.0200653076171875, |
|
"learning_rate": 6.731503924858516e-08, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333507180214, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 460.37500762939453, |
|
"epoch": 0.3245060949978983, |
|
"grad_norm": 0.4139795134217995, |
|
"kl": 0.0171051025390625, |
|
"learning_rate": 6.511947521539737e-08, |
|
"loss": 0.0, |
|
"reward": 1.8750000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.8750000298023224, |
|
"step": 386 |
|
}, |
|
{ |
|
"completion_length": 401.37500762939453, |
|
"epoch": 0.32618747372845736, |
|
"grad_norm": 0.9550140447715619, |
|
"kl": 0.04052734375, |
|
"learning_rate": 6.295493972289903e-08, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5416666828095913, |
|
"step": 388 |
|
}, |
|
{ |
|
"completion_length": 381.4583435058594, |
|
"epoch": 0.32786885245901637, |
|
"grad_norm": 0.8642430155063329, |
|
"kl": 0.018341064453125, |
|
"learning_rate": 6.082179604557616e-08, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333507180214, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 504.0000228881836, |
|
"epoch": 0.32955023118957544, |
|
"grad_norm": 0.45869091032068066, |
|
"kl": 0.0639801025390625, |
|
"learning_rate": 5.8720402189419286e-08, |
|
"loss": 0.0001, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333544433117, |
|
"step": 392 |
|
}, |
|
{ |
|
"completion_length": 499.0833511352539, |
|
"epoch": 0.3312316099201345, |
|
"grad_norm": 0.001226330191092669, |
|
"kl": 0.0165252685546875, |
|
"learning_rate": 5.6651110831839046e-08, |
|
"loss": 0.0, |
|
"reward": 1.6666667461395264, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666865348816, |
|
"step": 394 |
|
}, |
|
{ |
|
"completion_length": 416.79168701171875, |
|
"epoch": 0.3329129886506936, |
|
"grad_norm": 0.23769632812912322, |
|
"kl": 0.0198211669921875, |
|
"learning_rate": 5.461426926247639e-08, |
|
"loss": 0.0, |
|
"reward": 1.6250000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6250000149011612, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 510.0416793823242, |
|
"epoch": 0.33459436738125264, |
|
"grad_norm": 0.4301315282000898, |
|
"kl": 0.0159149169921875, |
|
"learning_rate": 5.261021932491713e-08, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5416666865348816, |
|
"step": 398 |
|
}, |
|
{ |
|
"completion_length": 507.0000228881836, |
|
"epoch": 0.3362757461118117, |
|
"grad_norm": 0.2327539317664912, |
|
"kl": 0.016632080078125, |
|
"learning_rate": 5.0639297359319846e-08, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.583333358168602, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 398.75000762939453, |
|
"epoch": 0.3379571248423707, |
|
"grad_norm": 0.6603293560697683, |
|
"kl": 0.01715087890625, |
|
"learning_rate": 4.870183414596793e-08, |
|
"loss": 0.0, |
|
"reward": 1.7916666865348816, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7916666865348816, |
|
"step": 402 |
|
}, |
|
{ |
|
"completion_length": 453.9166793823242, |
|
"epoch": 0.3396385035729298, |
|
"grad_norm": 0.357065731955349, |
|
"kl": 0.0204010009765625, |
|
"learning_rate": 4.679815484975505e-08, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5416666828095913, |
|
"step": 404 |
|
}, |
|
{ |
|
"completion_length": 329.2916793823242, |
|
"epoch": 0.34131988230348886, |
|
"grad_norm": 0.6145494404346448, |
|
"kl": 0.208160400390625, |
|
"learning_rate": 4.492857896561203e-08, |
|
"loss": 0.0002, |
|
"reward": 1.5416666865348816, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.541666679084301, |
|
"step": 406 |
|
}, |
|
{ |
|
"completion_length": 482.50001525878906, |
|
"epoch": 0.34300126103404793, |
|
"grad_norm": 0.58571987604954, |
|
"kl": 0.041656494140625, |
|
"learning_rate": 4.309342026488652e-08, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666865348816, |
|
"step": 408 |
|
}, |
|
{ |
|
"completion_length": 441.2916717529297, |
|
"epoch": 0.344682639764607, |
|
"grad_norm": 1.06985487849842, |
|
"kl": 0.0160675048828125, |
|
"learning_rate": 4.1292986742682254e-08, |
|
"loss": 0.0, |
|
"reward": 1.6250000596046448, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6250000223517418, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 389.4166717529297, |
|
"epoch": 0.346364018495166, |
|
"grad_norm": 0.7270657915048854, |
|
"kl": 0.019683837890625, |
|
"learning_rate": 3.952758056616826e-08, |
|
"loss": 0.0, |
|
"reward": 1.5833334028720856, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333544433117, |
|
"step": 412 |
|
}, |
|
{ |
|
"completion_length": 406.33333587646484, |
|
"epoch": 0.3480453972257251, |
|
"grad_norm": 0.27758659667364055, |
|
"kl": 0.0177001953125, |
|
"learning_rate": 3.7797498023866395e-08, |
|
"loss": 0.0, |
|
"reward": 1.8333333432674408, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.8333333432674408, |
|
"step": 414 |
|
}, |
|
{ |
|
"completion_length": 488.58335876464844, |
|
"epoch": 0.34972677595628415, |
|
"grad_norm": 0.3438592498625851, |
|
"kl": 0.0125274658203125, |
|
"learning_rate": 3.6103029475924727e-08, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333507180214, |
|
"step": 416 |
|
}, |
|
{ |
|
"completion_length": 467.9166717529297, |
|
"epoch": 0.3514081546868432, |
|
"grad_norm": 0.7068973819897363, |
|
"kl": 0.0189666748046875, |
|
"learning_rate": 3.4444459305386504e-08, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.6250000111758709, |
|
"step": 418 |
|
}, |
|
{ |
|
"completion_length": 532.6666793823242, |
|
"epoch": 0.3530895334174023, |
|
"grad_norm": 0.30882726803388616, |
|
"kl": 0.013458251953125, |
|
"learning_rate": 3.2822065870462215e-08, |
|
"loss": 0.0, |
|
"reward": 1.7500000298023224, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7500000223517418, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 393.5416717529297, |
|
"epoch": 0.3547709121479613, |
|
"grad_norm": 0.5034685025489649, |
|
"kl": 0.020172119140625, |
|
"learning_rate": 3.1236121457812545e-08, |
|
"loss": 0.0, |
|
"reward": 1.7083333432674408, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7083333432674408, |
|
"step": 422 |
|
}, |
|
{ |
|
"completion_length": 373.75000762939453, |
|
"epoch": 0.3564522908785204, |
|
"grad_norm": 0.7676592102825319, |
|
"kl": 0.021453857421875, |
|
"learning_rate": 2.9686892236850336e-08, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333432674408, |
|
"step": 424 |
|
}, |
|
{ |
|
"completion_length": 386.0416793823242, |
|
"epoch": 0.35813366960907944, |
|
"grad_norm": 1.0126228541091091, |
|
"kl": 0.039794921875, |
|
"learning_rate": 2.817463821506949e-08, |
|
"loss": 0.0, |
|
"reward": 1.5416666865348816, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5416666716337204, |
|
"step": 426 |
|
}, |
|
{ |
|
"completion_length": 414.1666793823242, |
|
"epoch": 0.3598150483396385, |
|
"grad_norm": 0.4701389090006604, |
|
"kl": 0.0238494873046875, |
|
"learning_rate": 2.6699613194407723e-08, |
|
"loss": 0.0, |
|
"reward": 1.6666666865348816, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666865348816, |
|
"step": 428 |
|
}, |
|
{ |
|
"completion_length": 474.4166793823242, |
|
"epoch": 0.3614964270701976, |
|
"grad_norm": 0.5957470677297103, |
|
"kl": 0.019805908203125, |
|
"learning_rate": 2.5262064728651194e-08, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666865348816, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 438.8333435058594, |
|
"epoch": 0.36317780580075665, |
|
"grad_norm": 0.23260450142169511, |
|
"kl": 0.016265869140625, |
|
"learning_rate": 2.3862234081887033e-08, |
|
"loss": 0.0, |
|
"reward": 1.8333333432674408, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.8333333432674408, |
|
"step": 432 |
|
}, |
|
{ |
|
"completion_length": 376.25000762939453, |
|
"epoch": 0.36485918453131566, |
|
"grad_norm": 0.4819133256066341, |
|
"kl": 0.02789306640625, |
|
"learning_rate": 2.250035618801241e-08, |
|
"loss": 0.0, |
|
"reward": 1.541666716337204, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5416666753590107, |
|
"step": 434 |
|
}, |
|
{ |
|
"completion_length": 395.3333511352539, |
|
"epoch": 0.36654056326187473, |
|
"grad_norm": 0.4753927376230513, |
|
"kl": 0.0201263427734375, |
|
"learning_rate": 2.117665961130513e-08, |
|
"loss": 0.0, |
|
"reward": 1.791666716337204, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7916666865348816, |
|
"step": 436 |
|
}, |
|
{ |
|
"completion_length": 480.2083511352539, |
|
"epoch": 0.3682219419924338, |
|
"grad_norm": 0.5069210021394791, |
|
"kl": 0.02008056640625, |
|
"learning_rate": 1.9891366508064e-08, |
|
"loss": 0.0, |
|
"reward": 1.6250000298023224, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.666666679084301, |
|
"step": 438 |
|
}, |
|
{ |
|
"completion_length": 404.62500762939453, |
|
"epoch": 0.36990332072299287, |
|
"grad_norm": 0.5579175174593816, |
|
"kl": 0.0225830078125, |
|
"learning_rate": 1.8644692589323967e-08, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333544433117, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 450.7083435058594, |
|
"epoch": 0.37158469945355194, |
|
"grad_norm": 0.0011962781703181325, |
|
"kl": 0.022216796875, |
|
"learning_rate": 1.7436847084653456e-08, |
|
"loss": 0.0, |
|
"reward": 1.6666666865348816, |
|
"reward_std": 0.2357022576034069, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666716337204, |
|
"step": 442 |
|
}, |
|
{ |
|
"completion_length": 403.62500762939453, |
|
"epoch": 0.37326607818411095, |
|
"grad_norm": 0.5038101762676807, |
|
"kl": 0.020782470703125, |
|
"learning_rate": 1.626803270703936e-08, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666865348816, |
|
"step": 444 |
|
}, |
|
{ |
|
"completion_length": 305.6666717529297, |
|
"epoch": 0.37494745691467, |
|
"grad_norm": 1.1478452212927168, |
|
"kl": 0.02532958984375, |
|
"learning_rate": 1.513844561886554e-08, |
|
"loss": 0.0, |
|
"reward": 1.8333333730697632, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.8333333432674408, |
|
"step": 446 |
|
}, |
|
{ |
|
"completion_length": 440.91668701171875, |
|
"epoch": 0.3766288356452291, |
|
"grad_norm": 0.5410906835431928, |
|
"kl": 0.025360107421875, |
|
"learning_rate": 1.4048275398990894e-08, |
|
"loss": 0.0, |
|
"reward": 1.5000000298023224, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5000000111758709, |
|
"step": 448 |
|
}, |
|
{ |
|
"completion_length": 520.8333435058594, |
|
"epoch": 0.37831021437578816, |
|
"grad_norm": 0.6494890901957951, |
|
"kl": 0.0169219970703125, |
|
"learning_rate": 1.2997705010932391e-08, |
|
"loss": 0.0, |
|
"reward": 1.7083333730697632, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7083333432674408, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 504.0833435058594, |
|
"epoch": 0.3799915931063472, |
|
"grad_norm": 0.11376590313112087, |
|
"kl": 0.043548583984375, |
|
"learning_rate": 1.1986910772158105e-08, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.5000000074505806, |
|
"step": 452 |
|
}, |
|
{ |
|
"completion_length": 436.7916717529297, |
|
"epoch": 0.38167297183690624, |
|
"grad_norm": 0.5652200449027903, |
|
"kl": 0.03338623046875, |
|
"learning_rate": 1.1016062324496007e-08, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333507180214, |
|
"step": 454 |
|
}, |
|
{ |
|
"completion_length": 405.7083511352539, |
|
"epoch": 0.3833543505674653, |
|
"grad_norm": 0.43158182108263865, |
|
"kl": 0.0216064453125, |
|
"learning_rate": 1.0085322605662666e-08, |
|
"loss": 0.0, |
|
"reward": 1.7500000596046448, |
|
"reward_std": 0.3535533919930458, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7500000149011612, |
|
"step": 456 |
|
}, |
|
{ |
|
"completion_length": 472.0833435058594, |
|
"epoch": 0.3850357292980244, |
|
"grad_norm": 0.717982397429151, |
|
"kl": 0.018798828125, |
|
"learning_rate": 9.194847821917623e-09, |
|
"loss": 0.0, |
|
"reward": 1.7083333730697632, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.708333358168602, |
|
"step": 458 |
|
}, |
|
{ |
|
"completion_length": 433.37501525878906, |
|
"epoch": 0.38671710802858345, |
|
"grad_norm": 0.3929283618854557, |
|
"kl": 0.021942138671875, |
|
"learning_rate": 8.344787421847216e-09, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.5833333469927311, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 546.5833435058594, |
|
"epoch": 0.3883984867591425, |
|
"grad_norm": 0.36389040620504864, |
|
"kl": 0.018341064453125, |
|
"learning_rate": 7.535284071282455e-09, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666939854622, |
|
"step": 462 |
|
}, |
|
{ |
|
"completion_length": 397.7916717529297, |
|
"epoch": 0.3900798654897016, |
|
"grad_norm": 0.4936781871754855, |
|
"kl": 0.0218505859375, |
|
"learning_rate": 6.766473629355452e-09, |
|
"loss": 0.0, |
|
"reward": 1.7916666865348816, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7916666865348816, |
|
"step": 464 |
|
}, |
|
{ |
|
"completion_length": 449.37501525878906, |
|
"epoch": 0.3917612442202606, |
|
"grad_norm": 0.416937865685922, |
|
"kl": 0.0172119140625, |
|
"learning_rate": 6.038485125698295e-09, |
|
"loss": 0.0, |
|
"reward": 1.6250000298023224, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6250000074505806, |
|
"step": 466 |
|
}, |
|
{ |
|
"completion_length": 445.41668701171875, |
|
"epoch": 0.39344262295081966, |
|
"grad_norm": 0.7420333816373434, |
|
"kl": 0.020751953125, |
|
"learning_rate": 5.3514407387877936e-09, |
|
"loss": 0.0, |
|
"reward": 1.6666666865348816, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666716337204, |
|
"step": 468 |
|
}, |
|
{ |
|
"completion_length": 414.62501525878906, |
|
"epoch": 0.39512400168137873, |
|
"grad_norm": 0.0068766679961718095, |
|
"kl": 0.020599365234375, |
|
"learning_rate": 4.705455775440237e-09, |
|
"loss": 0.0, |
|
"reward": 1.7500000298023224, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7500000149011612, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 509.6666717529297, |
|
"epoch": 0.3968053804119378, |
|
"grad_norm": 0.5120985795788513, |
|
"kl": 0.049163818359375, |
|
"learning_rate": 4.100638651459542e-09, |
|
"loss": 0.0, |
|
"reward": 1.7083334028720856, |
|
"reward_std": 0.4124789573252201, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.708333358168602, |
|
"step": 472 |
|
}, |
|
{ |
|
"completion_length": 416.00000762939453, |
|
"epoch": 0.39848675914249687, |
|
"grad_norm": 0.001027025835790879, |
|
"kl": 0.0158233642578125, |
|
"learning_rate": 3.5370908734417006e-09, |
|
"loss": 0.0, |
|
"reward": 1.7500000596046448, |
|
"reward_std": 0.0, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7500000149011612, |
|
"step": 474 |
|
}, |
|
{ |
|
"completion_length": 397.25000762939453, |
|
"epoch": 0.4001681378730559, |
|
"grad_norm": 0.021320987422443205, |
|
"kl": 0.038848876953125, |
|
"learning_rate": 3.0149070217390106e-09, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.0589255653321743, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333395421505, |
|
"step": 476 |
|
}, |
|
{ |
|
"completion_length": 429.4583511352539, |
|
"epoch": 0.40184951660361495, |
|
"grad_norm": 0.4002345142208856, |
|
"kl": 0.0300445556640625, |
|
"learning_rate": 2.5341747345865026e-09, |
|
"loss": 0.0, |
|
"reward": 1.7500000298023224, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7500000298023224, |
|
"step": 478 |
|
}, |
|
{ |
|
"completion_length": 461.50001525878906, |
|
"epoch": 0.403530895334174, |
|
"grad_norm": 0.0029418687663408513, |
|
"kl": 0.021148681640625, |
|
"learning_rate": 2.094974693393731e-09, |
|
"loss": 0.0, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.1178511306643486, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.583333358168602, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 464.3333435058594, |
|
"epoch": 0.4052122740647331, |
|
"grad_norm": 0.17330056984154224, |
|
"kl": 0.0167388916015625, |
|
"learning_rate": 1.6973806092038523e-09, |
|
"loss": 0.0, |
|
"reward": 1.7083333432674408, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7083333432674408, |
|
"step": 482 |
|
}, |
|
{ |
|
"completion_length": 494.3333511352539, |
|
"epoch": 0.40689365279529216, |
|
"grad_norm": 0.6291866431405297, |
|
"kl": 0.01507568359375, |
|
"learning_rate": 1.3414592103228594e-09, |
|
"loss": 0.0, |
|
"reward": 1.7083333730697632, |
|
"reward_std": 0.1767766922712326, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7083333432674408, |
|
"step": 484 |
|
}, |
|
{ |
|
"completion_length": 407.79168701171875, |
|
"epoch": 0.4085750315258512, |
|
"grad_norm": 0.7527510846206448, |
|
"kl": 0.02587890625, |
|
"learning_rate": 1.0272702311203695e-09, |
|
"loss": 0.0, |
|
"reward": 1.6250000298023224, |
|
"reward_std": 0.4124789535999298, |
|
"rewards/format_reward_func": 0.9583333432674408, |
|
"rewards/solution_reward_func": 0.6666666865348816, |
|
"step": 486 |
|
}, |
|
{ |
|
"completion_length": 463.5000228881836, |
|
"epoch": 0.41025641025641024, |
|
"grad_norm": 0.5925315928631419, |
|
"kl": 0.020721435546875, |
|
"learning_rate": 7.548664020045059e-10, |
|
"loss": 0.0, |
|
"reward": 1.4583333730697632, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.4583333507180214, |
|
"step": 488 |
|
}, |
|
{ |
|
"completion_length": 437.2083511352539, |
|
"epoch": 0.4119377889869693, |
|
"grad_norm": 0.7831672518645952, |
|
"kl": 0.034820556640625, |
|
"learning_rate": 5.242934405720878e-10, |
|
"loss": 0.0, |
|
"reward": 1.7500000596046448, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7500000298023224, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 444.3333435058594, |
|
"epoch": 0.4136191677175284, |
|
"grad_norm": 0.5783716082619003, |
|
"kl": 0.019439697265625, |
|
"learning_rate": 3.355900439359072e-10, |
|
"loss": 0.0, |
|
"reward": 1.7083333730697632, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7083333432674408, |
|
"step": 492 |
|
}, |
|
{ |
|
"completion_length": 474.0833511352539, |
|
"epoch": 0.41530054644808745, |
|
"grad_norm": 0.5030728051783439, |
|
"kl": 0.02392578125, |
|
"learning_rate": 1.8878788223009035e-10, |
|
"loss": 0.0, |
|
"reward": 1.3750000298023224, |
|
"reward_std": 0.1767766959965229, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.3750000037252903, |
|
"step": 494 |
|
}, |
|
{ |
|
"completion_length": 471.3333435058594, |
|
"epoch": 0.4169819251786465, |
|
"grad_norm": 0.40396130292565613, |
|
"kl": 0.02294921875, |
|
"learning_rate": 8.391159329496079e-11, |
|
"loss": 0.0, |
|
"reward": 1.791666716337204, |
|
"reward_std": 0.2946278266608715, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.7916667014360428, |
|
"step": 496 |
|
}, |
|
{ |
|
"completion_length": 438.1666793823242, |
|
"epoch": 0.41866330390920553, |
|
"grad_norm": 0.4863347024264504, |
|
"kl": 0.01949310302734375, |
|
"learning_rate": 2.097877854204122e-11, |
|
"loss": 0.0, |
|
"reward": 1.666666716337204, |
|
"reward_std": 0.2357022613286972, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6666666939854622, |
|
"step": 498 |
|
}, |
|
{ |
|
"completion_length": 391.7083435058594, |
|
"epoch": 0.4203446826397646, |
|
"grad_norm": 0.6480363403962828, |
|
"kl": 0.0219573974609375, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 1.6250000298023224, |
|
"reward_std": 0.2946278229355812, |
|
"rewards/format_reward_func": 1.0, |
|
"rewards/solution_reward_func": 0.6250000149011612, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4203446826397646, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 1.7704009043086445e-05, |
|
"train_runtime": 16459.3672, |
|
"train_samples_per_second": 0.182, |
|
"train_steps_per_second": 0.03 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|