qwen2.5-3b-r1-rearc-stage1 / trainer_state.json
spinech's picture
Upload the rest of the model
4496985 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4203446826397646,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 582.8750228881836,
"epoch": 0.0016813787305590584,
"grad_norm": 0.32669930150754334,
"kl": 0.0,
"learning_rate": 6.666666666666667e-08,
"loss": 0.0,
"reward": 0.6666666865348816,
"reward_std": 0.3535533882677555,
"rewards/format_reward_func": 0.5416666865348816,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 2
},
{
"completion_length": 699.4583587646484,
"epoch": 0.003362757461118117,
"grad_norm": 0.5103613895194884,
"kl": 0.0002980232238769531,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.0,
"reward": 0.5833333507180214,
"reward_std": 0.4714045189321041,
"rewards/format_reward_func": 0.541666679084301,
"rewards/solution_reward_func": 0.0416666679084301,
"step": 4
},
{
"completion_length": 575.2083511352539,
"epoch": 0.005044136191677175,
"grad_norm": 0.6433224536587134,
"kl": 0.0003399848937988281,
"learning_rate": 2e-07,
"loss": 0.0,
"reward": 0.7916666865348816,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 0.7083333656191826,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 6
},
{
"completion_length": 828.2083435058594,
"epoch": 0.006725514922236234,
"grad_norm": 0.5211006593141526,
"kl": 0.00028324127197265625,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.0,
"reward": 0.708333358168602,
"reward_std": 0.4124789573252201,
"rewards/format_reward_func": 0.6250000149011612,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 8
},
{
"completion_length": 572.6666870117188,
"epoch": 0.008406893652795292,
"grad_norm": 0.7556094738301846,
"kl": 0.0003337860107421875,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0,
"reward": 0.833333358168602,
"reward_std": 0.3535533882677555,
"rewards/format_reward_func": 0.6666666716337204,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 10
},
{
"completion_length": 538.7083511352539,
"epoch": 0.01008827238335435,
"grad_norm": 0.7513982192873419,
"kl": 0.0002989768981933594,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 0.6666666939854622,
"reward_std": 0.4714045189321041,
"rewards/format_reward_func": 0.5833333358168602,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 12
},
{
"completion_length": 673.7500152587891,
"epoch": 0.011769651113913409,
"grad_norm": 0.5852912050950652,
"kl": 0.0003135204315185547,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.0,
"reward": 0.9166666865348816,
"reward_std": 0.4714045189321041,
"rewards/format_reward_func": 0.7916666865348816,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 14
},
{
"completion_length": 711.3750305175781,
"epoch": 0.013451029844472467,
"grad_norm": 0.9204289802606287,
"kl": 0.0003161430358886719,
"learning_rate": 4.999947552503497e-07,
"loss": 0.0,
"reward": 0.6250000074505806,
"reward_std": 0.5303300842642784,
"rewards/format_reward_func": 0.5000000074505806,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 16
},
{
"completion_length": 552.1250152587891,
"epoch": 0.015132408575031526,
"grad_norm": 0.9100701777961465,
"kl": 0.0004177093505859375,
"learning_rate": 4.999527985734931e-07,
"loss": 0.0,
"reward": 0.833333358168602,
"reward_std": 0.3535533882677555,
"rewards/format_reward_func": 0.75,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 18
},
{
"completion_length": 540.1250228881836,
"epoch": 0.016813787305590584,
"grad_norm": 0.6037876280305763,
"kl": 0.0005059242248535156,
"learning_rate": 4.998688922613787e-07,
"loss": 0.0,
"reward": 0.7916666865348816,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.6666666865348816,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 20
},
{
"completion_length": 590.2500152587891,
"epoch": 0.018495166036149643,
"grad_norm": 0.9748922522783966,
"kl": 0.0004825592041015625,
"learning_rate": 4.997430503960219e-07,
"loss": 0.0,
"reward": 0.6666666939854622,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 0.5416666679084301,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 22
},
{
"completion_length": 689.5416717529297,
"epoch": 0.0201765447667087,
"grad_norm": 0.4822668116388023,
"kl": 0.000446319580078125,
"learning_rate": 4.995752940974918e-07,
"loss": 0.0,
"reward": 0.7916666865348816,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.7083333432674408,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 24
},
{
"completion_length": 556.9166870117188,
"epoch": 0.02185792349726776,
"grad_norm": 0.46041536357233115,
"kl": 0.0004811286926269531,
"learning_rate": 4.993656515203662e-07,
"loss": 0.0,
"reward": 0.8333333432674408,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 0.7500000149011612,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 26
},
{
"completion_length": 597.2916717529297,
"epoch": 0.023539302227826818,
"grad_norm": 0.5772106772675187,
"kl": 0.0006680488586425781,
"learning_rate": 4.991141578490066e-07,
"loss": 0.0,
"reward": 0.9166666865348816,
"reward_std": 0.2357022576034069,
"rewards/format_reward_func": 0.7916666865348816,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 28
},
{
"completion_length": 610.5416717529297,
"epoch": 0.025220680958385876,
"grad_norm": 0.8453841880983054,
"kl": 0.0008635520935058594,
"learning_rate": 4.988208552916535e-07,
"loss": 0.0,
"reward": 0.9583333730697632,
"reward_std": 0.4124789498746395,
"rewards/format_reward_func": 0.7916666865348816,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 30
},
{
"completion_length": 672.7083587646484,
"epoch": 0.026902059688944935,
"grad_norm": 0.5771700431960171,
"kl": 0.000789642333984375,
"learning_rate": 4.984857930733419e-07,
"loss": 0.0,
"reward": 0.9166666865348816,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 0.833333358168602,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 32
},
{
"completion_length": 648.0416717529297,
"epoch": 0.028583438419503993,
"grad_norm": 0.6098823109986267,
"kl": 0.0009946823120117188,
"learning_rate": 4.981090274276405e-07,
"loss": 0.0,
"reward": 0.8333333730697632,
"reward_std": 0.3535533882677555,
"rewards/format_reward_func": 0.708333358168602,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 34
},
{
"completion_length": 821.3333587646484,
"epoch": 0.03026481715006305,
"grad_norm": 0.6980212217774439,
"kl": 0.0006208419799804688,
"learning_rate": 4.976906215872137e-07,
"loss": 0.0,
"reward": 0.8333333432674408,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 0.7500000149011612,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 36
},
{
"completion_length": 620.2500152587891,
"epoch": 0.031946195880622114,
"grad_norm": 0.0003861918457850463,
"kl": 0.0010890960693359375,
"learning_rate": 4.97230645773209e-07,
"loss": 0.0,
"reward": 0.833333358168602,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 0.7916667014360428,
"rewards/solution_reward_func": 0.0416666679084301,
"step": 38
},
{
"completion_length": 533.5833435058594,
"epoch": 0.03362757461118117,
"grad_norm": 0.19582302389190498,
"kl": 0.0016956329345703125,
"learning_rate": 4.967291771834726e-07,
"loss": 0.0,
"reward": 1.0416666865348816,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.833333358168602,
"rewards/solution_reward_func": 0.2083333358168602,
"step": 40
},
{
"completion_length": 554.9166793823242,
"epoch": 0.03530895334174023,
"grad_norm": 0.43787338966334477,
"kl": 0.0012445449829101562,
"learning_rate": 4.961862999795923e-07,
"loss": 0.0,
"reward": 1.0,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 0.833333358168602,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 42
},
{
"completion_length": 631.4166946411133,
"epoch": 0.036990332072299285,
"grad_norm": 0.7633407511221333,
"kl": 0.0015544891357421875,
"learning_rate": 4.956021052727731e-07,
"loss": 0.0,
"reward": 0.8750000149011612,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.7916666865348816,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 44
},
{
"completion_length": 623.9166870117188,
"epoch": 0.03867171080285835,
"grad_norm": 0.9409898030779542,
"kl": 0.0011081695556640625,
"learning_rate": 4.949766911085461e-07,
"loss": 0.0,
"reward": 0.9166666865348816,
"reward_std": 0.3535533882677555,
"rewards/format_reward_func": 0.7500000149011612,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 46
},
{
"completion_length": 576.1250152587891,
"epoch": 0.0403530895334174,
"grad_norm": 0.5174245567750604,
"kl": 0.0010805130004882812,
"learning_rate": 4.943101624503132e-07,
"loss": 0.0,
"reward": 1.0000000298023224,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 0.833333358168602,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 48
},
{
"completion_length": 464.91668701171875,
"epoch": 0.042034468263976464,
"grad_norm": 0.721286337233584,
"kl": 0.0024700164794921875,
"learning_rate": 4.936026311617316e-07,
"loss": 0.0,
"reward": 0.9166667014360428,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 0.8333333432674408,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 50
},
{
"completion_length": 604.5000228881836,
"epoch": 0.04371584699453552,
"grad_norm": 0.4391469354984254,
"kl": 0.0014371871948242188,
"learning_rate": 4.928542159879385e-07,
"loss": 0.0,
"reward": 1.0833333730697632,
"reward_std": 0.3535533882677555,
"rewards/format_reward_func": 0.8750000149011612,
"rewards/solution_reward_func": 0.2083333395421505,
"step": 52
},
{
"completion_length": 487.04168701171875,
"epoch": 0.04539722572509458,
"grad_norm": 0.5796863501691008,
"kl": 0.001621246337890625,
"learning_rate": 4.920650425356239e-07,
"loss": 0.0,
"reward": 1.1250000596046448,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 54
},
{
"completion_length": 702.7917022705078,
"epoch": 0.047078604455653636,
"grad_norm": 0.45499979356207115,
"kl": 0.0010595321655273438,
"learning_rate": 4.912352432519484e-07,
"loss": 0.0,
"reward": 0.9583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9166666865348816,
"rewards/solution_reward_func": 0.0416666679084301,
"step": 56
},
{
"completion_length": 432.5416793823242,
"epoch": 0.0487599831862127,
"grad_norm": 0.7200412766464303,
"kl": 0.002208709716796875,
"learning_rate": 4.90364957402315e-07,
"loss": 0.0,
"reward": 1.2083333432674408,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2500000037252903,
"step": 58
},
{
"completion_length": 504.87501525878906,
"epoch": 0.05044136191677175,
"grad_norm": 0.69519393769251,
"kl": 0.00209808349609375,
"learning_rate": 4.894543310469967e-07,
"loss": 0.0,
"reward": 0.9583333730697632,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.8750000298023224,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 60
},
{
"completion_length": 665.458366394043,
"epoch": 0.052122740647330815,
"grad_norm": 0.7276068452685034,
"kl": 0.0016222000122070312,
"learning_rate": 4.885035170166228e-07,
"loss": 0.0,
"reward": 0.958333358168602,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.8333333432674408,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 62
},
{
"completion_length": 628.0000152587891,
"epoch": 0.05380411937788987,
"grad_norm": 0.5896398911030171,
"kl": 0.0023822784423828125,
"learning_rate": 4.875126748865289e-07,
"loss": 0.0,
"reward": 1.0416667014360428,
"reward_std": 0.4124789535999298,
"rewards/format_reward_func": 0.833333358168602,
"rewards/solution_reward_func": 0.2083333358168602,
"step": 64
},
{
"completion_length": 583.1250228881836,
"epoch": 0.05548549810844893,
"grad_norm": 0.3607454748525709,
"kl": 0.0025081634521484375,
"learning_rate": 4.864819709499761e-07,
"loss": 0.0,
"reward": 1.0000000298023224,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 0.8750000298023224,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 66
},
{
"completion_length": 734.5833435058594,
"epoch": 0.057166876839007986,
"grad_norm": 0.5186256820202049,
"kl": 0.0015344619750976562,
"learning_rate": 4.854115781902414e-07,
"loss": 0.0,
"reward": 0.916666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 0.8750000298023224,
"rewards/solution_reward_func": 0.0416666679084301,
"step": 68
},
{
"completion_length": 738.0833511352539,
"epoch": 0.05884825556956705,
"grad_norm": 0.39707553003489315,
"kl": 0.0017681121826171875,
"learning_rate": 4.843016762515859e-07,
"loss": 0.0,
"reward": 1.0416666865348816,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 70
},
{
"completion_length": 625.6250305175781,
"epoch": 0.0605296343001261,
"grad_norm": 0.30316010027843815,
"kl": 0.001819610595703125,
"learning_rate": 4.831524514091056e-07,
"loss": 0.0,
"reward": 1.0000000298023224,
"reward_std": 0.2357022576034069,
"rewards/format_reward_func": 0.9166666865348816,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 72
},
{
"completion_length": 636.8750228881836,
"epoch": 0.062211013030685165,
"grad_norm": 0.4721947459888053,
"kl": 0.0018463134765625,
"learning_rate": 4.81964096537468e-07,
"loss": 0.0,
"reward": 1.0416666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.8750000149011612,
"rewards/solution_reward_func": 0.1666666679084301,
"step": 74
},
{
"completion_length": 508.4166793823242,
"epoch": 0.06389239176124423,
"grad_norm": 0.0005282376556457386,
"kl": 0.0027256011962890625,
"learning_rate": 4.80736811078543e-07,
"loss": 0.0,
"reward": 1.1666666865348816,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2083333395421505,
"step": 76
},
{
"completion_length": 460.00001525878906,
"epoch": 0.06557377049180328,
"grad_norm": 0.8701537519403687,
"kl": 0.0024814605712890625,
"learning_rate": 4.794708010079288e-07,
"loss": 0.0,
"reward": 1.0833333730697632,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 0.9166666865348816,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 78
},
{
"completion_length": 602.1666870117188,
"epoch": 0.06725514922236234,
"grad_norm": 0.5672242931158207,
"kl": 0.005603790283203125,
"learning_rate": 4.78166278800385e-07,
"loss": 0.0,
"reward": 1.041666716337204,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.9166666865348816,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 80
},
{
"completion_length": 547.5416870117188,
"epoch": 0.06893652795292139,
"grad_norm": 0.6276836580523573,
"kl": 0.004207611083984375,
"learning_rate": 4.7682346339417157e-07,
"loss": 0.0,
"reward": 0.9583333432674408,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.8750000298023224,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 82
},
{
"completion_length": 668.9583587646484,
"epoch": 0.07061790668348046,
"grad_norm": 0.00047527262535576603,
"kl": 0.00447845458984375,
"learning_rate": 4.754425801543046e-07,
"loss": 0.0,
"reward": 1.0,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.0416666679084301,
"step": 84
},
{
"completion_length": 542.7083511352539,
"epoch": 0.07229928541403952,
"grad_norm": 0.6328447561571717,
"kl": 0.002597808837890625,
"learning_rate": 4.7402386083473364e-07,
"loss": 0.0,
"reward": 1.166666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2083333395421505,
"step": 86
},
{
"completion_length": 570.7500305175781,
"epoch": 0.07398066414459857,
"grad_norm": 0.48341787098839506,
"kl": 0.0023593902587890625,
"learning_rate": 4.72567543539446e-07,
"loss": 0.0,
"reward": 0.9583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9166666865348816,
"rewards/solution_reward_func": 0.0416666679084301,
"step": 88
},
{
"completion_length": 515.6666793823242,
"epoch": 0.07566204287515763,
"grad_norm": 0.4411126705303185,
"kl": 0.003017425537109375,
"learning_rate": 4.7107387268250586e-07,
"loss": 0.0,
"reward": 1.0416666865348816,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.9166666865348816,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 90
},
{
"completion_length": 608.3333435058594,
"epoch": 0.0773434216057167,
"grad_norm": 0.3226336554890738,
"kl": 0.0033473968505859375,
"learning_rate": 4.6954309894703426e-07,
"loss": 0.0,
"reward": 1.166666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 92
},
{
"completion_length": 456.4166793823242,
"epoch": 0.07902480033627575,
"grad_norm": 0.7644602389120536,
"kl": 0.0032806396484375,
"learning_rate": 4.6797547924313673e-07,
"loss": 0.0,
"reward": 1.1250000298023224,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 94
},
{
"completion_length": 551.4583587646484,
"epoch": 0.0807061790668348,
"grad_norm": 0.773881603923625,
"kl": 0.003635406494140625,
"learning_rate": 4.6637127666478617e-07,
"loss": 0.0,
"reward": 1.1250000298023224,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 96
},
{
"completion_length": 494.4583511352539,
"epoch": 0.08238755779739386,
"grad_norm": 0.6252744428861863,
"kl": 0.00386810302734375,
"learning_rate": 4.647307604456674e-07,
"loss": 0.0,
"reward": 1.166666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 98
},
{
"completion_length": 540.9166870117188,
"epoch": 0.08406893652795293,
"grad_norm": 0.0005101931995823585,
"kl": 0.003147125244140625,
"learning_rate": 4.630542059139923e-07,
"loss": 0.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.0,
"step": 100
},
{
"completion_length": 447.2916793823242,
"epoch": 0.08575031525851198,
"grad_norm": 0.7334515099745728,
"kl": 0.00638580322265625,
"learning_rate": 4.613418944462906e-07,
"loss": 0.0,
"reward": 1.2500000596046448,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000074505806,
"step": 102
},
{
"completion_length": 465.45835876464844,
"epoch": 0.08743169398907104,
"grad_norm": 0.2608326039080234,
"kl": 0.004886627197265625,
"learning_rate": 4.5959411342018704e-07,
"loss": 0.0,
"reward": 1.1250000298023224,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.1666666679084301,
"step": 104
},
{
"completion_length": 479.9166946411133,
"epoch": 0.0891130727196301,
"grad_norm": 0.2883550467956107,
"kl": 0.00536346435546875,
"learning_rate": 4.578111561661702e-07,
"loss": 0.0,
"reward": 1.0833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 106
},
{
"completion_length": 424.5416793823242,
"epoch": 0.09079445145018916,
"grad_norm": 0.6124946815591971,
"kl": 0.00507354736328125,
"learning_rate": 4.559933219183631e-07,
"loss": 0.0,
"reward": 1.0000000298023224,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.0416666679084301,
"step": 108
},
{
"completion_length": 374.3333435058594,
"epoch": 0.09247583018074822,
"grad_norm": 0.003755671104748954,
"kl": 0.01043701171875,
"learning_rate": 4.541409157643027e-07,
"loss": 0.0,
"reward": 1.1250000298023224,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 110
},
{
"completion_length": 527.3750228881836,
"epoch": 0.09415720891130727,
"grad_norm": 0.5338777369576395,
"kl": 0.00551605224609375,
"learning_rate": 4.5225424859373684e-07,
"loss": 0.0,
"reward": 1.166666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2083333395421505,
"step": 112
},
{
"completion_length": 381.62500762939453,
"epoch": 0.09583858764186633,
"grad_norm": 0.47889287389319934,
"kl": 0.007801055908203125,
"learning_rate": 4.503336370464475e-07,
"loss": 0.0,
"reward": 1.1250000298023224,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 114
},
{
"completion_length": 464.70835876464844,
"epoch": 0.0975199663724254,
"grad_norm": 0.5542831400022328,
"kl": 0.005016326904296875,
"learning_rate": 4.4837940345910917e-07,
"loss": 0.0,
"reward": 1.1250000298023224,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 116
},
{
"completion_length": 537.0000076293945,
"epoch": 0.09920134510298445,
"grad_norm": 0.32822474431275217,
"kl": 0.005290985107421875,
"learning_rate": 4.4639187581119116e-07,
"loss": 0.0,
"reward": 1.0,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.0416666679084301,
"step": 118
},
{
"completion_length": 449.8333435058594,
"epoch": 0.1008827238335435,
"grad_norm": 0.6078651118658578,
"kl": 0.005950927734375,
"learning_rate": 4.443713876699123e-07,
"loss": 0.0,
"reward": 1.1250000596046448,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 120
},
{
"completion_length": 487.08335876464844,
"epoch": 0.10256410256410256,
"grad_norm": 0.0015859284209239941,
"kl": 0.00882720947265625,
"learning_rate": 4.423182781342588e-07,
"loss": 0.0,
"reward": 1.0833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 122
},
{
"completion_length": 477.7916793823242,
"epoch": 0.10424548129466163,
"grad_norm": 0.8316436306146491,
"kl": 0.00827789306640625,
"learning_rate": 4.402328917780728e-07,
"loss": 0.0,
"reward": 1.041666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 124
},
{
"completion_length": 360.37500762939453,
"epoch": 0.10592686002522068,
"grad_norm": 0.5068283502332778,
"kl": 0.0106048583984375,
"learning_rate": 4.381155785922225e-07,
"loss": 0.0,
"reward": 1.166666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 126
},
{
"completion_length": 373.25000762939453,
"epoch": 0.10760823875577974,
"grad_norm": 0.0008593493041121279,
"kl": 0.0091400146484375,
"learning_rate": 4.3596669392586363e-07,
"loss": 0.0,
"reward": 1.0833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 128
},
{
"completion_length": 373.3333435058594,
"epoch": 0.1092896174863388,
"grad_norm": 0.7307467398455231,
"kl": 0.0075836181640625,
"learning_rate": 4.337865984268001e-07,
"loss": 0.0,
"reward": 1.1250000298023224,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.1250000037252903,
"step": 130
},
{
"completion_length": 394.6666717529297,
"epoch": 0.11097099621689786,
"grad_norm": 0.8811859841007285,
"kl": 0.010345458984375,
"learning_rate": 4.3157565798095746e-07,
"loss": 0.0,
"reward": 1.2500000596046448,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000074505806,
"step": 132
},
{
"completion_length": 341.2916793823242,
"epoch": 0.11265237494745692,
"grad_norm": 0.7652707329784861,
"kl": 0.0125885009765625,
"learning_rate": 4.293342436509756e-07,
"loss": 0.0,
"reward": 1.1666666865348816,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 0.9166666865348816,
"rewards/solution_reward_func": 0.2500000037252903,
"step": 134
},
{
"completion_length": 478.3333435058594,
"epoch": 0.11433375367801597,
"grad_norm": 0.6268940088226042,
"kl": 0.0119781494140625,
"learning_rate": 4.2706273161393326e-07,
"loss": 0.0,
"reward": 1.1666667461395264,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.1666666716337204,
"step": 136
},
{
"completion_length": 357.8333435058594,
"epoch": 0.11601513240857503,
"grad_norm": 0.6940515666291348,
"kl": 0.0140228271484375,
"learning_rate": 4.2476150309821437e-07,
"loss": 0.0,
"reward": 1.2083333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2083333395421505,
"step": 138
},
{
"completion_length": 398.0833435058594,
"epoch": 0.1176965111391341,
"grad_norm": 0.4068021983578087,
"kl": 0.0218048095703125,
"learning_rate": 4.2243094431952607e-07,
"loss": 0.0,
"reward": 1.2916666865348816,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.3333333395421505,
"step": 140
},
{
"completion_length": 391.5416717529297,
"epoch": 0.11937788986969315,
"grad_norm": 1.054400750652676,
"kl": 0.0133209228515625,
"learning_rate": 4.2007144641608035e-07,
"loss": 0.0,
"reward": 1.2500000596046448,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000074505806,
"step": 142
},
{
"completion_length": 362.9166717529297,
"epoch": 0.1210592686002522,
"grad_norm": 0.7202533244508015,
"kl": 0.0169525146484375,
"learning_rate": 4.1768340538294914e-07,
"loss": 0.0,
"reward": 1.2083333730697632,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2500000037252903,
"step": 144
},
{
"completion_length": 392.75000762939453,
"epoch": 0.12274064733081126,
"grad_norm": 0.5668355112824374,
"kl": 0.0153656005859375,
"learning_rate": 4.1526722200560436e-07,
"loss": 0.0,
"reward": 1.2083333432674408,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2083333358168602,
"step": 146
},
{
"completion_length": 488.00001525878906,
"epoch": 0.12442202606137033,
"grad_norm": 0.4109654167954109,
"kl": 0.0104522705078125,
"learning_rate": 4.1282330179265377e-07,
"loss": 0.0,
"reward": 1.0833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 148
},
{
"completion_length": 379.00000762939453,
"epoch": 0.12610340479192939,
"grad_norm": 0.6511266004361977,
"kl": 0.0153656005859375,
"learning_rate": 4.1035205490778496e-07,
"loss": 0.0,
"reward": 1.291666716337204,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2916666716337204,
"step": 150
},
{
"completion_length": 477.4583511352539,
"epoch": 0.12778478352248845,
"grad_norm": 0.41009026844574226,
"kl": 0.0298004150390625,
"learning_rate": 4.078538961009268e-07,
"loss": 0.0,
"reward": 1.2500000596046448,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000037252903,
"step": 152
},
{
"completion_length": 380.6666793823242,
"epoch": 0.1294661622530475,
"grad_norm": 0.41679517817122624,
"kl": 0.0112762451171875,
"learning_rate": 4.0532924463864214e-07,
"loss": 0.0,
"reward": 1.2500000298023224,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000037252903,
"step": 154
},
{
"completion_length": 331.00001525878906,
"epoch": 0.13114754098360656,
"grad_norm": 0.002301312664670671,
"kl": 0.0154876708984375,
"learning_rate": 4.027785242337625e-07,
"loss": 0.0,
"reward": 1.291666716337204,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2916666753590107,
"step": 156
},
{
"completion_length": 337.1666717529297,
"epoch": 0.1328289197141656,
"grad_norm": 0.41037493061839153,
"kl": 0.01825714111328125,
"learning_rate": 4.002021629742759e-07,
"loss": 0.0,
"reward": 1.25,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.25,
"step": 158
},
{
"completion_length": 355.3333435058594,
"epoch": 0.13451029844472467,
"grad_norm": 0.4834324481747559,
"kl": 0.0101470947265625,
"learning_rate": 3.9760059325148063e-07,
"loss": 0.0,
"reward": 1.291666716337204,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2916666753590107,
"step": 160
},
{
"completion_length": 475.8333511352539,
"epoch": 0.13619167717528374,
"grad_norm": 0.0007131492797933627,
"kl": 0.00870513916015625,
"learning_rate": 3.949742516874175e-07,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333432674408,
"step": 162
},
{
"completion_length": 400.0416793823242,
"epoch": 0.13787305590584278,
"grad_norm": 0.6704272888706837,
"kl": 0.01324462890625,
"learning_rate": 3.9232357906159065e-07,
"loss": 0.0,
"reward": 1.2500000596046448,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000074505806,
"step": 164
},
{
"completion_length": 391.62501525878906,
"epoch": 0.13955443463640185,
"grad_norm": 0.8927607188893651,
"kl": 0.00870513916015625,
"learning_rate": 3.8964902023699234e-07,
"loss": 0.0,
"reward": 1.2916666865348816,
"reward_std": 0.4124789498746395,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2916666679084301,
"step": 166
},
{
"completion_length": 432.1666793823242,
"epoch": 0.14123581336696092,
"grad_norm": 0.5356539936839141,
"kl": 0.00945281982421875,
"learning_rate": 3.869510240854407e-07,
"loss": 0.0,
"reward": 1.2500000596046448,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000037252903,
"step": 168
},
{
"completion_length": 400.9583511352539,
"epoch": 0.14291719209751996,
"grad_norm": 0.6797066476960871,
"kl": 0.0103912353515625,
"learning_rate": 3.8423004341224595e-07,
"loss": 0.0,
"reward": 1.2083334028720856,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2083333395421505,
"step": 170
},
{
"completion_length": 465.2083511352539,
"epoch": 0.14459857082807903,
"grad_norm": 0.7805415268970916,
"kl": 0.00925445556640625,
"learning_rate": 3.8148653488021566e-07,
"loss": 0.0,
"reward": 1.3333333730697632,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3333333395421505,
"step": 172
},
{
"completion_length": 435.0833435058594,
"epoch": 0.14627994955863807,
"grad_norm": 0.4294043427967522,
"kl": 0.006561279296875,
"learning_rate": 3.787209589330134e-07,
"loss": 0.0,
"reward": 1.2083333730697632,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2500000037252903,
"step": 174
},
{
"completion_length": 474.00001525878906,
"epoch": 0.14796132828919714,
"grad_norm": 0.5780056567965537,
"kl": 0.0077667236328125,
"learning_rate": 3.759337797178816e-07,
"loss": 0.0,
"reward": 1.2916667461395264,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2916666753590107,
"step": 176
},
{
"completion_length": 483.50000762939453,
"epoch": 0.1496427070197562,
"grad_norm": 0.7711416518392432,
"kl": 0.00598907470703125,
"learning_rate": 3.7312546500774455e-07,
"loss": 0.0,
"reward": 1.4583334028720856,
"reward_std": 0.4124789573252201,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333469927311,
"step": 178
},
{
"completion_length": 560.7916793823242,
"epoch": 0.15132408575031525,
"grad_norm": 0.6275316044622387,
"kl": 0.007049560546875,
"learning_rate": 3.7029648612270123e-07,
"loss": 0.0,
"reward": 1.1250000298023224,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.1666666679084301,
"step": 180
},
{
"completion_length": 493.2916793823242,
"epoch": 0.15300546448087432,
"grad_norm": 0.41941279841402085,
"kl": 0.00750732421875,
"learning_rate": 3.6744731785092393e-07,
"loss": 0.0,
"reward": 1.1666666865348816,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2083333395421505,
"step": 182
},
{
"completion_length": 431.25001525878906,
"epoch": 0.1546868432114334,
"grad_norm": 0.5134185107434652,
"kl": 0.0072479248046875,
"learning_rate": 3.6457843836897417e-07,
"loss": 0.0,
"reward": 1.416666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4166666753590107,
"step": 184
},
{
"completion_length": 427.2916717529297,
"epoch": 0.15636822194199243,
"grad_norm": 0.498752511394473,
"kl": 0.01192474365234375,
"learning_rate": 3.6169032916155055e-07,
"loss": 0.0,
"reward": 1.2083334028720856,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2083333395421505,
"step": 186
},
{
"completion_length": 529.5833435058594,
"epoch": 0.1580496006725515,
"grad_norm": 0.484964125720315,
"kl": 0.0084381103515625,
"learning_rate": 3.587834749406808e-07,
"loss": 0.0,
"reward": 1.2083333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2500000074505806,
"step": 188
},
{
"completion_length": 442.75000762939453,
"epoch": 0.15973097940311054,
"grad_norm": 0.29670849204677346,
"kl": 0.00942230224609375,
"learning_rate": 3.558583635643726e-07,
"loss": 0.0,
"reward": 1.2500000298023224,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000111758709,
"step": 190
},
{
"completion_length": 461.12501525878906,
"epoch": 0.1614123581336696,
"grad_norm": 0.8668624278803033,
"kl": 0.010345458984375,
"learning_rate": 3.52915485954736e-07,
"loss": 0.0,
"reward": 1.2500000596046448,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000074505806,
"step": 192
},
{
"completion_length": 552.1250076293945,
"epoch": 0.16309373686422868,
"grad_norm": 0.5686603271249641,
"kl": 0.00679779052734375,
"learning_rate": 3.4995533601559225e-07,
"loss": 0.0,
"reward": 1.2500000596046448,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2916666753590107,
"step": 194
},
{
"completion_length": 489.75000762939453,
"epoch": 0.16477511559478772,
"grad_norm": 0.9001750933871568,
"kl": 0.01209259033203125,
"learning_rate": 3.469784105495816e-07,
"loss": 0.0,
"reward": 1.3750000298023224,
"reward_std": 0.4124789573252201,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.4166666716337204,
"step": 196
},
{
"completion_length": 504.29168701171875,
"epoch": 0.1664564943253468,
"grad_norm": 0.351872496477236,
"kl": 0.01596832275390625,
"learning_rate": 3.4398520917478476e-07,
"loss": 0.0,
"reward": 1.2500000298023224,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000037252903,
"step": 198
},
{
"completion_length": 551.7083511352539,
"epoch": 0.16813787305590586,
"grad_norm": 0.7904185495449961,
"kl": 0.0072021484375,
"learning_rate": 3.409762342408719e-07,
"loss": 0.0,
"reward": 1.2916666865348816,
"reward_std": 0.4124789535999298,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2916666716337204,
"step": 200
},
{
"completion_length": 436.7916793823242,
"epoch": 0.1698192517864649,
"grad_norm": 0.30374732572574603,
"kl": 0.01061248779296875,
"learning_rate": 3.379519907447931e-07,
"loss": 0.0,
"reward": 1.2916666865348816,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2916666679084301,
"step": 202
},
{
"completion_length": 494.2083511352539,
"epoch": 0.17150063051702397,
"grad_norm": 0.4216305294562641,
"kl": 0.00971221923828125,
"learning_rate": 3.349129862460251e-07,
"loss": 0.0,
"reward": 1.2500000596046448,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2500000074505806,
"step": 204
},
{
"completion_length": 436.58333587646484,
"epoch": 0.173182009247583,
"grad_norm": 0.34898976819652716,
"kl": 0.00868988037109375,
"learning_rate": 3.318597307813866e-07,
"loss": 0.0,
"reward": 1.5416666865348816,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.541666679084301,
"step": 206
},
{
"completion_length": 552.3333511352539,
"epoch": 0.17486338797814208,
"grad_norm": 0.001036227801857544,
"kl": 0.00815582275390625,
"learning_rate": 3.287927367794397e-07,
"loss": 0.0,
"reward": 1.125,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.125,
"step": 208
},
{
"completion_length": 541.2500228881836,
"epoch": 0.17654476670870115,
"grad_norm": 0.6275771663735162,
"kl": 0.11170196533203125,
"learning_rate": 3.2571251897448763e-07,
"loss": 0.0001,
"reward": 1.3333333432674408,
"reward_std": 0.2357022576034069,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3333333358168602,
"step": 210
},
{
"completion_length": 530.9166717529297,
"epoch": 0.1782261454392602,
"grad_norm": 0.5568701150128608,
"kl": 0.011932373046875,
"learning_rate": 3.226195943201883e-07,
"loss": 0.0,
"reward": 1.041666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.0833333358168602,
"step": 212
},
{
"completion_length": 503.0416717529297,
"epoch": 0.17990752416981926,
"grad_norm": 0.668445204981218,
"kl": 0.01616668701171875,
"learning_rate": 3.1951448190279253e-07,
"loss": 0.0,
"reward": 1.4583334028720856,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333469927311,
"step": 214
},
{
"completion_length": 493.5416717529297,
"epoch": 0.18158890290037832,
"grad_norm": 0.8478573951713925,
"kl": 0.00855255126953125,
"learning_rate": 3.163977028540263e-07,
"loss": 0.0,
"reward": 1.416666716337204,
"reward_std": 0.4714045189321041,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4166666753590107,
"step": 216
},
{
"completion_length": 591.0416870117188,
"epoch": 0.18327028163093737,
"grad_norm": 0.7348765645347503,
"kl": 0.0094451904296875,
"learning_rate": 3.1326978026362905e-07,
"loss": 0.0,
"reward": 1.2083333730697632,
"reward_std": 0.4124789535999298,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2500000074505806,
"step": 218
},
{
"completion_length": 406.12500762939453,
"epoch": 0.18495166036149643,
"grad_norm": 0.6283226728886679,
"kl": 0.00975799560546875,
"learning_rate": 3.101312390915634e-07,
"loss": 0.0,
"reward": 1.2916667461395264,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2916666753590107,
"step": 220
},
{
"completion_length": 434.6666793823242,
"epoch": 0.18663303909205547,
"grad_norm": 0.46740684202706906,
"kl": 0.0135955810546875,
"learning_rate": 3.069826060799109e-07,
"loss": 0.0,
"reward": 1.2083333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.2500000074505806,
"step": 222
},
{
"completion_length": 512.3750152587891,
"epoch": 0.18831441782261454,
"grad_norm": 0.6829987852089712,
"kl": 0.0085601806640625,
"learning_rate": 3.038244096644687e-07,
"loss": 0.0,
"reward": 1.3750000298023224,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3750000037252903,
"step": 224
},
{
"completion_length": 443.3333435058594,
"epoch": 0.1899957965531736,
"grad_norm": 0.7302277802124593,
"kl": 0.0476226806640625,
"learning_rate": 3.0065717988606256e-07,
"loss": 0.0,
"reward": 1.3750000298023224,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3750000149011612,
"step": 226
},
{
"completion_length": 450.37500762939453,
"epoch": 0.19167717528373265,
"grad_norm": 0.0033526514387488097,
"kl": 0.0142059326171875,
"learning_rate": 2.974814483015892e-07,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333432674408,
"step": 228
},
{
"completion_length": 484.16668701171875,
"epoch": 0.19335855401429172,
"grad_norm": 0.4236067078997056,
"kl": 0.01483154296875,
"learning_rate": 2.942977478948057e-07,
"loss": 0.0,
"reward": 1.291666716337204,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.3333333395421505,
"step": 230
},
{
"completion_length": 470.75001525878906,
"epoch": 0.1950399327448508,
"grad_norm": 0.5734136074701052,
"kl": 0.0141448974609375,
"learning_rate": 2.911066129868782e-07,
"loss": 0.0,
"reward": 1.5000000298023224,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000074505806,
"step": 232
},
{
"completion_length": 539.8750152587891,
"epoch": 0.19672131147540983,
"grad_norm": 0.8539808804727996,
"kl": 0.02154541015625,
"learning_rate": 2.87908579146707e-07,
"loss": 0.0,
"reward": 1.3333333730697632,
"reward_std": 0.3535533882677555,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.3750000074505806,
"step": 234
},
{
"completion_length": 491.7500228881836,
"epoch": 0.1984026902059689,
"grad_norm": 0.3028340455454943,
"kl": 0.0213470458984375,
"learning_rate": 2.847041831010417e-07,
"loss": 0.0,
"reward": 1.3750000596046448,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3750000074505806,
"step": 236
},
{
"completion_length": 376.50000762939453,
"epoch": 0.20008406893652794,
"grad_norm": 0.31222669099640876,
"kl": 0.020355224609375,
"learning_rate": 2.8149396264440227e-07,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333432674408,
"step": 238
},
{
"completion_length": 343.5833435058594,
"epoch": 0.201765447667087,
"grad_norm": 0.7628977182906207,
"kl": 0.0173187255859375,
"learning_rate": 2.782784565488211e-07,
"loss": 0.0,
"reward": 1.4166666865348816,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4166666679084301,
"step": 240
},
{
"completion_length": 347.7916717529297,
"epoch": 0.20344682639764608,
"grad_norm": 0.9647796082832435,
"kl": 0.033447265625,
"learning_rate": 2.7505820447342024e-07,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5416666828095913,
"step": 242
},
{
"completion_length": 367.0833435058594,
"epoch": 0.20512820512820512,
"grad_norm": 0.5497011882389701,
"kl": 0.01873779296875,
"learning_rate": 2.7183374687384096e-07,
"loss": 0.0,
"reward": 1.3750000298023224,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3750000111758709,
"step": 244
},
{
"completion_length": 386.12500762939453,
"epoch": 0.2068095838587642,
"grad_norm": 0.7366647205721275,
"kl": 0.028411865234375,
"learning_rate": 2.686056249115385e-07,
"loss": 0.0,
"reward": 1.3333333432674408,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3333333358168602,
"step": 246
},
{
"completion_length": 490.3333435058594,
"epoch": 0.20849096258932326,
"grad_norm": 0.3640781084742551,
"kl": 0.0124053955078125,
"learning_rate": 2.653743803629587e-07,
"loss": 0.0,
"reward": 1.3750000298023224,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3750000037252903,
"step": 248
},
{
"completion_length": 496.6666793823242,
"epoch": 0.2101723413198823,
"grad_norm": 0.4955566369463096,
"kl": 0.02813720703125,
"learning_rate": 2.621405555286121e-07,
"loss": 0.0,
"reward": 1.3750000596046448,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3750000111758709,
"step": 250
},
{
"completion_length": 403.0000114440918,
"epoch": 0.21185372005044137,
"grad_norm": 0.4129623000055268,
"kl": 0.02422332763671875,
"learning_rate": 2.589046931420589e-07,
"loss": 0.0,
"reward": 1.5000000596046448,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000149011612,
"step": 252
},
{
"completion_length": 381.75000762939453,
"epoch": 0.2135350987810004,
"grad_norm": 0.3516450115328973,
"kl": 0.0178985595703125,
"learning_rate": 2.556673362788225e-07,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.541666679084301,
"step": 254
},
{
"completion_length": 458.0416793823242,
"epoch": 0.21521647751155948,
"grad_norm": 0.4125605646498622,
"kl": 0.0144500732421875,
"learning_rate": 2.524290282652443e-07,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333395421505,
"step": 256
},
{
"completion_length": 514.2083511352539,
"epoch": 0.21689785624211855,
"grad_norm": 0.2564484003528315,
"kl": 0.0143585205078125,
"learning_rate": 2.4919031258729785e-07,
"loss": 0.0,
"reward": 1.291666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.2916666716337204,
"step": 258
},
{
"completion_length": 437.8333511352539,
"epoch": 0.2185792349726776,
"grad_norm": 1.0909769396928994,
"kl": 0.015350341796875,
"learning_rate": 2.459517327993746e-07,
"loss": 0.0,
"reward": 1.5000000298023224,
"reward_std": 0.3535533882677555,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000074505806,
"step": 260
},
{
"completion_length": 408.3333435058594,
"epoch": 0.22026061370323666,
"grad_norm": 0.4944126035221868,
"kl": 0.036376953125,
"learning_rate": 2.427138324330601e-07,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666939854622,
"step": 262
},
{
"completion_length": 484.45835876464844,
"epoch": 0.22194199243379573,
"grad_norm": 0.5710232427079407,
"kl": 0.0914764404296875,
"learning_rate": 2.3947715490591203e-07,
"loss": 0.0001,
"reward": 1.541666716337204,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.541666679084301,
"step": 264
},
{
"completion_length": 381.6666793823242,
"epoch": 0.22362337116435477,
"grad_norm": 0.5414781111970816,
"kl": 0.0176849365234375,
"learning_rate": 2.3624224343025876e-07,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.2357022576034069,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333432674408,
"step": 266
},
{
"completion_length": 381.9166793823242,
"epoch": 0.22530474989491384,
"grad_norm": 0.2752143657107066,
"kl": 0.015380859375,
"learning_rate": 2.3300964092203203e-07,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333432674408,
"step": 268
},
{
"completion_length": 409.4166717529297,
"epoch": 0.22698612862547288,
"grad_norm": 0.7197256320386043,
"kl": 0.012451171875,
"learning_rate": 2.2977988990964896e-07,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333432674408,
"step": 270
},
{
"completion_length": 402.5416717529297,
"epoch": 0.22866750735603195,
"grad_norm": 1.0400541151794251,
"kl": 0.0240020751953125,
"learning_rate": 2.2655353244295927e-07,
"loss": 0.0,
"reward": 1.5000000298023224,
"reward_std": 0.3535533882677555,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000074505806,
"step": 272
},
{
"completion_length": 459.62501525878906,
"epoch": 0.23034888608659101,
"grad_norm": 0.5084744997876609,
"kl": 0.020263671875,
"learning_rate": 2.233311100022734e-07,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5416666828095913,
"step": 274
},
{
"completion_length": 366.4583435058594,
"epoch": 0.23203026481715006,
"grad_norm": 0.7424502755105113,
"kl": 0.1666107177734375,
"learning_rate": 2.2011316340748528e-07,
"loss": 0.0002,
"reward": 1.6250000298023224,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6250000149011612,
"step": 276
},
{
"completion_length": 495.16668701171875,
"epoch": 0.23371164354770912,
"grad_norm": 0.8257211078506724,
"kl": 0.0146484375,
"learning_rate": 2.1690023272730678e-07,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.5303300879895687,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.5833333432674408,
"step": 278
},
{
"completion_length": 457.7916793823242,
"epoch": 0.2353930222782682,
"grad_norm": 0.42375918647040944,
"kl": 0.0121307373046875,
"learning_rate": 2.1369285718862748e-07,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.541666679084301,
"step": 280
},
{
"completion_length": 420.37500762939453,
"epoch": 0.23707440100882723,
"grad_norm": 0.5152043630269939,
"kl": 0.0162506103515625,
"learning_rate": 2.104915750860164e-07,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333395421505,
"step": 282
},
{
"completion_length": 385.8333511352539,
"epoch": 0.2387557797393863,
"grad_norm": 0.49294668816422704,
"kl": 0.0164794921875,
"learning_rate": 2.072969236913799e-07,
"loss": 0.0,
"reward": 1.3333333730697632,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3333333358168602,
"step": 284
},
{
"completion_length": 392.62500762939453,
"epoch": 0.24043715846994534,
"grad_norm": 0.6512225875746797,
"kl": 0.01849365234375,
"learning_rate": 2.0410943916379097e-07,
"loss": 0.0,
"reward": 1.416666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4166666753590107,
"step": 286
},
{
"completion_length": 412.12501525878906,
"epoch": 0.2421185372005044,
"grad_norm": 0.3660817551390846,
"kl": 0.010711669921875,
"learning_rate": 2.0092965645950564e-07,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333395421505,
"step": 288
},
{
"completion_length": 430.12500762939453,
"epoch": 0.24379991593106348,
"grad_norm": 0.626448385607845,
"kl": 0.0183258056640625,
"learning_rate": 1.977581092421812e-07,
"loss": 0.0,
"reward": 1.416666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4166666753590107,
"step": 290
},
{
"completion_length": 396.7083435058594,
"epoch": 0.24548129466162252,
"grad_norm": 0.004913345703168958,
"kl": 0.020263671875,
"learning_rate": 1.9459532979331148e-07,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5416666865348816,
"step": 292
},
{
"completion_length": 493.9583511352539,
"epoch": 0.2471626733921816,
"grad_norm": 0.5565359495913534,
"kl": 0.0181427001953125,
"learning_rate": 1.9144184892289336e-07,
"loss": 0.0,
"reward": 1.4583333432674408,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.5000000074505806,
"step": 294
},
{
"completion_length": 422.45835876464844,
"epoch": 0.24884405212274066,
"grad_norm": 0.449063011244765,
"kl": 0.0212249755859375,
"learning_rate": 1.882981958803414e-07,
"loss": 0.0,
"reward": 1.4583333432674408,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333358168602,
"step": 296
},
{
"completion_length": 511.58335876464844,
"epoch": 0.2505254308532997,
"grad_norm": 0.5020099782167112,
"kl": 0.011138916015625,
"learning_rate": 1.8516489826566374e-07,
"loss": 0.0,
"reward": 1.4583333432674408,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333358168602,
"step": 298
},
{
"completion_length": 412.25000762939453,
"epoch": 0.25220680958385877,
"grad_norm": 0.0014012414260649606,
"kl": 0.02069091796875,
"learning_rate": 1.8204248194091425e-07,
"loss": 0.0,
"reward": 1.7083333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7083333507180214,
"step": 300
},
{
"completion_length": 450.8333435058594,
"epoch": 0.25388818831441784,
"grad_norm": 0.24821505940896457,
"kl": 0.013641357421875,
"learning_rate": 1.7893147094193784e-07,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.5833333432674408,
"step": 302
},
{
"completion_length": 407.37501525878906,
"epoch": 0.2555695670449769,
"grad_norm": 0.7504631135855148,
"kl": 0.0157470703125,
"learning_rate": 1.7583238739042084e-07,
"loss": 0.0,
"reward": 1.7083333432674408,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7083333432674408,
"step": 304
},
{
"completion_length": 454.37500762939453,
"epoch": 0.2572509457755359,
"grad_norm": 0.6337846991609941,
"kl": 0.0143890380859375,
"learning_rate": 1.7274575140626315e-07,
"loss": 0.0,
"reward": 1.7083334028720856,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.708333358168602,
"step": 306
},
{
"completion_length": 444.0833435058594,
"epoch": 0.258932324506095,
"grad_norm": 0.6680546776528479,
"kl": 0.0210113525390625,
"learning_rate": 1.6967208102028696e-07,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333432674408,
"step": 308
},
{
"completion_length": 415.62500762939453,
"epoch": 0.26061370323665406,
"grad_norm": 0.390703161764533,
"kl": 0.054595947265625,
"learning_rate": 1.6661189208729489e-07,
"loss": 0.0001,
"reward": 1.5000000298023224,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000149011612,
"step": 310
},
{
"completion_length": 390.83333587646484,
"epoch": 0.26229508196721313,
"grad_norm": 0.42044370258908614,
"kl": 0.016571044921875,
"learning_rate": 1.6356569819949427e-07,
"loss": 0.0,
"reward": 1.5000000298023224,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000149011612,
"step": 312
},
{
"completion_length": 430.87501525878906,
"epoch": 0.2639764606977722,
"grad_norm": 0.30802749969680127,
"kl": 0.020721435546875,
"learning_rate": 1.6053401060030097e-07,
"loss": 0.0,
"reward": 1.7500000596046448,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7500000298023224,
"step": 314
},
{
"completion_length": 381.2916793823242,
"epoch": 0.2656578394283312,
"grad_norm": 0.4549527318175164,
"kl": 0.0202789306640625,
"learning_rate": 1.57517338098537e-07,
"loss": 0.0,
"reward": 1.5833333432674408,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333358168602,
"step": 316
},
{
"completion_length": 365.08333587646484,
"epoch": 0.2673392181588903,
"grad_norm": 0.46488928219712783,
"kl": 0.032012939453125,
"learning_rate": 1.545161869830371e-07,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666939854622,
"step": 318
},
{
"completion_length": 353.62501525878906,
"epoch": 0.26902059688944935,
"grad_norm": 0.8439115565804695,
"kl": 0.018096923828125,
"learning_rate": 1.5153106093767825e-07,
"loss": 0.0,
"reward": 1.7500000596046448,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7500000298023224,
"step": 320
},
{
"completion_length": 425.3333511352539,
"epoch": 0.2707019756200084,
"grad_norm": 0.0030299941880457203,
"kl": 0.016815185546875,
"learning_rate": 1.4856246095684622e-07,
"loss": 0.0,
"reward": 1.8750000298023224,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.8750000149011612,
"step": 322
},
{
"completion_length": 408.4583435058594,
"epoch": 0.2723833543505675,
"grad_norm": 0.45480636781617306,
"kl": 0.024200439453125,
"learning_rate": 1.4561088526135374e-07,
"loss": 0.0,
"reward": 1.4583333432674408,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333358168602,
"step": 324
},
{
"completion_length": 354.9583435058594,
"epoch": 0.2740647330811265,
"grad_norm": 0.3157474969297678,
"kl": 0.019775390625,
"learning_rate": 1.4267682921482356e-07,
"loss": 0.0,
"reward": 1.5000000298023224,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000074505806,
"step": 326
},
{
"completion_length": 461.16668701171875,
"epoch": 0.27574611181168557,
"grad_norm": 0.8364092517911256,
"kl": 0.0169219970703125,
"learning_rate": 1.3976078524055203e-07,
"loss": 0.0,
"reward": 1.3333333730697632,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3333333395421505,
"step": 328
},
{
"completion_length": 403.95833587646484,
"epoch": 0.27742749054224464,
"grad_norm": 0.8372601176868383,
"kl": 0.0169219970703125,
"learning_rate": 1.3686324273886528e-07,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333469927311,
"step": 330
},
{
"completion_length": 381.5833435058594,
"epoch": 0.2791088692728037,
"grad_norm": 0.7788446171406045,
"kl": 0.026214599609375,
"learning_rate": 1.339846880049829e-07,
"loss": 0.0,
"reward": 1.5000000298023224,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000111758709,
"step": 332
},
{
"completion_length": 487.7916793823242,
"epoch": 0.2807902480033628,
"grad_norm": 0.9953471506857543,
"kl": 0.0202484130859375,
"learning_rate": 1.3112560414740313e-07,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.541666679084301,
"step": 334
},
{
"completion_length": 499.62500762939453,
"epoch": 0.28247162673392184,
"grad_norm": 0.35616817874834467,
"kl": 0.02581787109375,
"learning_rate": 1.2828647100682261e-07,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.666666679084301,
"step": 336
},
{
"completion_length": 454.2083435058594,
"epoch": 0.28415300546448086,
"grad_norm": 0.5428149223244878,
"kl": 0.014923095703125,
"learning_rate": 1.2546776507560467e-07,
"loss": 0.0,
"reward": 1.5000000298023224,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000149011612,
"step": 338
},
{
"completion_length": 399.4166717529297,
"epoch": 0.2858343841950399,
"grad_norm": 0.9031786183823988,
"kl": 0.0172576904296875,
"learning_rate": 1.2266995941780933e-07,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.541666679084301,
"step": 340
},
{
"completion_length": 453.66667556762695,
"epoch": 0.287515762925599,
"grad_norm": 0.6316532508922427,
"kl": 0.040191650390625,
"learning_rate": 1.1989352358979888e-07,
"loss": 0.0,
"reward": 1.5000000298023224,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.541666679084301,
"step": 342
},
{
"completion_length": 392.25000762939453,
"epoch": 0.28919714165615806,
"grad_norm": 0.7866237262244063,
"kl": 0.0247802734375,
"learning_rate": 1.1713892356143238e-07,
"loss": 0.0,
"reward": 1.8333333730697632,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.8333333730697632,
"step": 344
},
{
"completion_length": 382.58333587646484,
"epoch": 0.29087852038671713,
"grad_norm": 0.8580927088394277,
"kl": 0.0238189697265625,
"learning_rate": 1.1440662163786166e-07,
"loss": 0.0,
"reward": 1.416666716337204,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.416666679084301,
"step": 346
},
{
"completion_length": 476.0416793823242,
"epoch": 0.29255989911727615,
"grad_norm": 0.3362942892798578,
"kl": 0.0136871337890625,
"learning_rate": 1.1169707638194237e-07,
"loss": 0.0,
"reward": 1.7500000596046448,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7500000298023224,
"step": 348
},
{
"completion_length": 538.2083511352539,
"epoch": 0.2942412778478352,
"grad_norm": 0.2941167148030311,
"kl": 0.01708984375,
"learning_rate": 1.0901074253727336e-07,
"loss": 0.0,
"reward": 1.6666666865348816,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.666666679084301,
"step": 350
},
{
"completion_length": 380.75,
"epoch": 0.2959226565783943,
"grad_norm": 0.5138201935880581,
"kl": 0.023223876953125,
"learning_rate": 1.0634807095187737e-07,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333432674408,
"step": 352
},
{
"completion_length": 380.4583435058594,
"epoch": 0.29760403530895335,
"grad_norm": 0.6613574228632922,
"kl": 0.14385986328125,
"learning_rate": 1.0370950850253449e-07,
"loss": 0.0001,
"reward": 1.6666666865348816,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666865348816,
"step": 354
},
{
"completion_length": 424.87500762939453,
"epoch": 0.2992854140395124,
"grad_norm": 0.4785401178732256,
"kl": 0.0345916748046875,
"learning_rate": 1.0109549801978304e-07,
"loss": 0.0,
"reward": 1.7083333432674408,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7083333432674408,
"step": 356
},
{
"completion_length": 348.7916793823242,
"epoch": 0.30096679277007143,
"grad_norm": 0.0020426538152803417,
"kl": 0.024261474609375,
"learning_rate": 9.850647821359917e-08,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333432674408,
"step": 358
},
{
"completion_length": 475.79168701171875,
"epoch": 0.3026481715006305,
"grad_norm": 0.429195015613074,
"kl": 0.0204010009765625,
"learning_rate": 9.594288359976815e-08,
"loss": 0.0,
"reward": 1.6250000596046448,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6250000149011612,
"step": 360
},
{
"completion_length": 494.25001525878906,
"epoch": 0.30432955023118957,
"grad_norm": 0.8473545963282998,
"kl": 0.0186004638671875,
"learning_rate": 9.340514442695952e-08,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.4124789573252201,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5416666865348816,
"step": 362
},
{
"completion_length": 492.8750228881836,
"epoch": 0.30601092896174864,
"grad_norm": 0.3234727873687572,
"kl": 0.01751708984375,
"learning_rate": 9.089368660451798e-08,
"loss": 0.0,
"reward": 1.5833333432674408,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333358168602,
"step": 364
},
{
"completion_length": 452.2916793823242,
"epoch": 0.3076923076923077,
"grad_norm": 0.40422101797827664,
"kl": 0.025787353515625,
"learning_rate": 8.840893163098332e-08,
"loss": 0.0,
"reward": 1.6666666865348816,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666865348816,
"step": 366
},
{
"completion_length": 368.6666793823242,
"epoch": 0.3093736864228668,
"grad_norm": 0.7245593032700843,
"kl": 0.0194549560546875,
"learning_rate": 8.595129652335017e-08,
"loss": 0.0,
"reward": 1.7083334028720856,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.708333358168602,
"step": 368
},
{
"completion_length": 397.08334732055664,
"epoch": 0.3110550651534258,
"grad_norm": 0.8562497096589754,
"kl": 0.0177154541015625,
"learning_rate": 8.352119374707977e-08,
"loss": 0.0,
"reward": 1.7916666865348816,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7916666865348816,
"step": 370
},
{
"completion_length": 443.2083435058594,
"epoch": 0.31273644388398486,
"grad_norm": 0.43124200301124715,
"kl": 0.020538330078125,
"learning_rate": 8.11190311468759e-08,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666939854622,
"step": 372
},
{
"completion_length": 469.9583435058594,
"epoch": 0.31441782261454393,
"grad_norm": 0.6436306808211317,
"kl": 0.076263427734375,
"learning_rate": 7.87452118782363e-08,
"loss": 0.0001,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333507180214,
"step": 374
},
{
"completion_length": 424.7083511352539,
"epoch": 0.316099201345103,
"grad_norm": 0.8624260825303681,
"kl": 0.0174713134765625,
"learning_rate": 7.640013433979093e-08,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666865348816,
"step": 376
},
{
"completion_length": 480.4583435058594,
"epoch": 0.31778058007566207,
"grad_norm": 0.4768870927728356,
"kl": 0.019195556640625,
"learning_rate": 7.408419210643846e-08,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.583333358168602,
"step": 378
},
{
"completion_length": 457.7083435058594,
"epoch": 0.3194619588062211,
"grad_norm": 0.7325139897043152,
"kl": 0.0226287841796875,
"learning_rate": 7.179777386329275e-08,
"loss": 0.0,
"reward": 1.7083333730697632,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.708333358168602,
"step": 380
},
{
"completion_length": 492.0416793823242,
"epoch": 0.32114333753678015,
"grad_norm": 0.6812039461038883,
"kl": 0.0167388916015625,
"learning_rate": 6.954126334044949e-08,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333432674408,
"step": 382
},
{
"completion_length": 427.75001525878906,
"epoch": 0.3228247162673392,
"grad_norm": 0.26789925678872634,
"kl": 0.0200653076171875,
"learning_rate": 6.731503924858516e-08,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333507180214,
"step": 384
},
{
"completion_length": 460.37500762939453,
"epoch": 0.3245060949978983,
"grad_norm": 0.4139795134217995,
"kl": 0.0171051025390625,
"learning_rate": 6.511947521539737e-08,
"loss": 0.0,
"reward": 1.8750000298023224,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.8750000298023224,
"step": 386
},
{
"completion_length": 401.37500762939453,
"epoch": 0.32618747372845736,
"grad_norm": 0.9550140447715619,
"kl": 0.04052734375,
"learning_rate": 6.295493972289903e-08,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5416666828095913,
"step": 388
},
{
"completion_length": 381.4583435058594,
"epoch": 0.32786885245901637,
"grad_norm": 0.8642430155063329,
"kl": 0.018341064453125,
"learning_rate": 6.082179604557616e-08,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333507180214,
"step": 390
},
{
"completion_length": 504.0000228881836,
"epoch": 0.32955023118957544,
"grad_norm": 0.45869091032068066,
"kl": 0.0639801025390625,
"learning_rate": 5.8720402189419286e-08,
"loss": 0.0001,
"reward": 1.5833333730697632,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333544433117,
"step": 392
},
{
"completion_length": 499.0833511352539,
"epoch": 0.3312316099201345,
"grad_norm": 0.001226330191092669,
"kl": 0.0165252685546875,
"learning_rate": 5.6651110831839046e-08,
"loss": 0.0,
"reward": 1.6666667461395264,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666865348816,
"step": 394
},
{
"completion_length": 416.79168701171875,
"epoch": 0.3329129886506936,
"grad_norm": 0.23769632812912322,
"kl": 0.0198211669921875,
"learning_rate": 5.461426926247639e-08,
"loss": 0.0,
"reward": 1.6250000298023224,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6250000149011612,
"step": 396
},
{
"completion_length": 510.0416793823242,
"epoch": 0.33459436738125264,
"grad_norm": 0.4301315282000898,
"kl": 0.0159149169921875,
"learning_rate": 5.261021932491713e-08,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5416666865348816,
"step": 398
},
{
"completion_length": 507.0000228881836,
"epoch": 0.3362757461118117,
"grad_norm": 0.2327539317664912,
"kl": 0.016632080078125,
"learning_rate": 5.0639297359319846e-08,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.583333358168602,
"step": 400
},
{
"completion_length": 398.75000762939453,
"epoch": 0.3379571248423707,
"grad_norm": 0.6603293560697683,
"kl": 0.01715087890625,
"learning_rate": 4.870183414596793e-08,
"loss": 0.0,
"reward": 1.7916666865348816,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7916666865348816,
"step": 402
},
{
"completion_length": 453.9166793823242,
"epoch": 0.3396385035729298,
"grad_norm": 0.357065731955349,
"kl": 0.0204010009765625,
"learning_rate": 4.679815484975505e-08,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5416666828095913,
"step": 404
},
{
"completion_length": 329.2916793823242,
"epoch": 0.34131988230348886,
"grad_norm": 0.6145494404346448,
"kl": 0.208160400390625,
"learning_rate": 4.492857896561203e-08,
"loss": 0.0002,
"reward": 1.5416666865348816,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.541666679084301,
"step": 406
},
{
"completion_length": 482.50001525878906,
"epoch": 0.34300126103404793,
"grad_norm": 0.58571987604954,
"kl": 0.041656494140625,
"learning_rate": 4.309342026488652e-08,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666865348816,
"step": 408
},
{
"completion_length": 441.2916717529297,
"epoch": 0.344682639764607,
"grad_norm": 1.06985487849842,
"kl": 0.0160675048828125,
"learning_rate": 4.1292986742682254e-08,
"loss": 0.0,
"reward": 1.6250000596046448,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6250000223517418,
"step": 410
},
{
"completion_length": 389.4166717529297,
"epoch": 0.346364018495166,
"grad_norm": 0.7270657915048854,
"kl": 0.019683837890625,
"learning_rate": 3.952758056616826e-08,
"loss": 0.0,
"reward": 1.5833334028720856,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333544433117,
"step": 412
},
{
"completion_length": 406.33333587646484,
"epoch": 0.3480453972257251,
"grad_norm": 0.27758659667364055,
"kl": 0.0177001953125,
"learning_rate": 3.7797498023866395e-08,
"loss": 0.0,
"reward": 1.8333333432674408,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.8333333432674408,
"step": 414
},
{
"completion_length": 488.58335876464844,
"epoch": 0.34972677595628415,
"grad_norm": 0.3438592498625851,
"kl": 0.0125274658203125,
"learning_rate": 3.6103029475924727e-08,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333507180214,
"step": 416
},
{
"completion_length": 467.9166717529297,
"epoch": 0.3514081546868432,
"grad_norm": 0.7068973819897363,
"kl": 0.0189666748046875,
"learning_rate": 3.4444459305386504e-08,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.6250000111758709,
"step": 418
},
{
"completion_length": 532.6666793823242,
"epoch": 0.3530895334174023,
"grad_norm": 0.30882726803388616,
"kl": 0.013458251953125,
"learning_rate": 3.2822065870462215e-08,
"loss": 0.0,
"reward": 1.7500000298023224,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7500000223517418,
"step": 420
},
{
"completion_length": 393.5416717529297,
"epoch": 0.3547709121479613,
"grad_norm": 0.5034685025489649,
"kl": 0.020172119140625,
"learning_rate": 3.1236121457812545e-08,
"loss": 0.0,
"reward": 1.7083333432674408,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7083333432674408,
"step": 422
},
{
"completion_length": 373.75000762939453,
"epoch": 0.3564522908785204,
"grad_norm": 0.7676592102825319,
"kl": 0.021453857421875,
"learning_rate": 2.9686892236850336e-08,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333432674408,
"step": 424
},
{
"completion_length": 386.0416793823242,
"epoch": 0.35813366960907944,
"grad_norm": 1.0126228541091091,
"kl": 0.039794921875,
"learning_rate": 2.817463821506949e-08,
"loss": 0.0,
"reward": 1.5416666865348816,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5416666716337204,
"step": 426
},
{
"completion_length": 414.1666793823242,
"epoch": 0.3598150483396385,
"grad_norm": 0.4701389090006604,
"kl": 0.0238494873046875,
"learning_rate": 2.6699613194407723e-08,
"loss": 0.0,
"reward": 1.6666666865348816,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666865348816,
"step": 428
},
{
"completion_length": 474.4166793823242,
"epoch": 0.3614964270701976,
"grad_norm": 0.5957470677297103,
"kl": 0.019805908203125,
"learning_rate": 2.5262064728651194e-08,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666865348816,
"step": 430
},
{
"completion_length": 438.8333435058594,
"epoch": 0.36317780580075665,
"grad_norm": 0.23260450142169511,
"kl": 0.016265869140625,
"learning_rate": 2.3862234081887033e-08,
"loss": 0.0,
"reward": 1.8333333432674408,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.8333333432674408,
"step": 432
},
{
"completion_length": 376.25000762939453,
"epoch": 0.36485918453131566,
"grad_norm": 0.4819133256066341,
"kl": 0.02789306640625,
"learning_rate": 2.250035618801241e-08,
"loss": 0.0,
"reward": 1.541666716337204,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5416666753590107,
"step": 434
},
{
"completion_length": 395.3333511352539,
"epoch": 0.36654056326187473,
"grad_norm": 0.4753927376230513,
"kl": 0.0201263427734375,
"learning_rate": 2.117665961130513e-08,
"loss": 0.0,
"reward": 1.791666716337204,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7916666865348816,
"step": 436
},
{
"completion_length": 480.2083511352539,
"epoch": 0.3682219419924338,
"grad_norm": 0.5069210021394791,
"kl": 0.02008056640625,
"learning_rate": 1.9891366508064e-08,
"loss": 0.0,
"reward": 1.6250000298023224,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.666666679084301,
"step": 438
},
{
"completion_length": 404.62500762939453,
"epoch": 0.36990332072299287,
"grad_norm": 0.5579175174593816,
"kl": 0.0225830078125,
"learning_rate": 1.8644692589323967e-08,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333544433117,
"step": 440
},
{
"completion_length": 450.7083435058594,
"epoch": 0.37158469945355194,
"grad_norm": 0.0011962781703181325,
"kl": 0.022216796875,
"learning_rate": 1.7436847084653456e-08,
"loss": 0.0,
"reward": 1.6666666865348816,
"reward_std": 0.2357022576034069,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666716337204,
"step": 442
},
{
"completion_length": 403.62500762939453,
"epoch": 0.37326607818411095,
"grad_norm": 0.5038101762676807,
"kl": 0.020782470703125,
"learning_rate": 1.626803270703936e-08,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666865348816,
"step": 444
},
{
"completion_length": 305.6666717529297,
"epoch": 0.37494745691467,
"grad_norm": 1.1478452212927168,
"kl": 0.02532958984375,
"learning_rate": 1.513844561886554e-08,
"loss": 0.0,
"reward": 1.8333333730697632,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.8333333432674408,
"step": 446
},
{
"completion_length": 440.91668701171875,
"epoch": 0.3766288356452291,
"grad_norm": 0.5410906835431928,
"kl": 0.025360107421875,
"learning_rate": 1.4048275398990894e-08,
"loss": 0.0,
"reward": 1.5000000298023224,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5000000111758709,
"step": 448
},
{
"completion_length": 520.8333435058594,
"epoch": 0.37831021437578816,
"grad_norm": 0.6494890901957951,
"kl": 0.0169219970703125,
"learning_rate": 1.2997705010932391e-08,
"loss": 0.0,
"reward": 1.7083333730697632,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7083333432674408,
"step": 450
},
{
"completion_length": 504.0833435058594,
"epoch": 0.3799915931063472,
"grad_norm": 0.11376590313112087,
"kl": 0.043548583984375,
"learning_rate": 1.1986910772158105e-08,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.5000000074505806,
"step": 452
},
{
"completion_length": 436.7916717529297,
"epoch": 0.38167297183690624,
"grad_norm": 0.5652200449027903,
"kl": 0.03338623046875,
"learning_rate": 1.1016062324496007e-08,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333507180214,
"step": 454
},
{
"completion_length": 405.7083511352539,
"epoch": 0.3833543505674653,
"grad_norm": 0.43158182108263865,
"kl": 0.0216064453125,
"learning_rate": 1.0085322605662666e-08,
"loss": 0.0,
"reward": 1.7500000596046448,
"reward_std": 0.3535533919930458,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7500000149011612,
"step": 456
},
{
"completion_length": 472.0833435058594,
"epoch": 0.3850357292980244,
"grad_norm": 0.717982397429151,
"kl": 0.018798828125,
"learning_rate": 9.194847821917623e-09,
"loss": 0.0,
"reward": 1.7083333730697632,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.708333358168602,
"step": 458
},
{
"completion_length": 433.37501525878906,
"epoch": 0.38671710802858345,
"grad_norm": 0.3929283618854557,
"kl": 0.021942138671875,
"learning_rate": 8.344787421847216e-09,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.5833333469927311,
"step": 460
},
{
"completion_length": 546.5833435058594,
"epoch": 0.3883984867591425,
"grad_norm": 0.36389040620504864,
"kl": 0.018341064453125,
"learning_rate": 7.535284071282455e-09,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666939854622,
"step": 462
},
{
"completion_length": 397.7916717529297,
"epoch": 0.3900798654897016,
"grad_norm": 0.4936781871754855,
"kl": 0.0218505859375,
"learning_rate": 6.766473629355452e-09,
"loss": 0.0,
"reward": 1.7916666865348816,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7916666865348816,
"step": 464
},
{
"completion_length": 449.37501525878906,
"epoch": 0.3917612442202606,
"grad_norm": 0.416937865685922,
"kl": 0.0172119140625,
"learning_rate": 6.038485125698295e-09,
"loss": 0.0,
"reward": 1.6250000298023224,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6250000074505806,
"step": 466
},
{
"completion_length": 445.41668701171875,
"epoch": 0.39344262295081966,
"grad_norm": 0.7420333816373434,
"kl": 0.020751953125,
"learning_rate": 5.3514407387877936e-09,
"loss": 0.0,
"reward": 1.6666666865348816,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666716337204,
"step": 468
},
{
"completion_length": 414.62501525878906,
"epoch": 0.39512400168137873,
"grad_norm": 0.0068766679961718095,
"kl": 0.020599365234375,
"learning_rate": 4.705455775440237e-09,
"loss": 0.0,
"reward": 1.7500000298023224,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7500000149011612,
"step": 470
},
{
"completion_length": 509.6666717529297,
"epoch": 0.3968053804119378,
"grad_norm": 0.5120985795788513,
"kl": 0.049163818359375,
"learning_rate": 4.100638651459542e-09,
"loss": 0.0,
"reward": 1.7083334028720856,
"reward_std": 0.4124789573252201,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.708333358168602,
"step": 472
},
{
"completion_length": 416.00000762939453,
"epoch": 0.39848675914249687,
"grad_norm": 0.001027025835790879,
"kl": 0.0158233642578125,
"learning_rate": 3.5370908734417006e-09,
"loss": 0.0,
"reward": 1.7500000596046448,
"reward_std": 0.0,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7500000149011612,
"step": 474
},
{
"completion_length": 397.25000762939453,
"epoch": 0.4001681378730559,
"grad_norm": 0.021320987422443205,
"kl": 0.038848876953125,
"learning_rate": 3.0149070217390106e-09,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.0589255653321743,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333395421505,
"step": 476
},
{
"completion_length": 429.4583511352539,
"epoch": 0.40184951660361495,
"grad_norm": 0.4002345142208856,
"kl": 0.0300445556640625,
"learning_rate": 2.5341747345865026e-09,
"loss": 0.0,
"reward": 1.7500000298023224,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7500000298023224,
"step": 478
},
{
"completion_length": 461.50001525878906,
"epoch": 0.403530895334174,
"grad_norm": 0.0029418687663408513,
"kl": 0.021148681640625,
"learning_rate": 2.094974693393731e-09,
"loss": 0.0,
"reward": 1.5833333730697632,
"reward_std": 0.1178511306643486,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.583333358168602,
"step": 480
},
{
"completion_length": 464.3333435058594,
"epoch": 0.4052122740647331,
"grad_norm": 0.17330056984154224,
"kl": 0.0167388916015625,
"learning_rate": 1.6973806092038523e-09,
"loss": 0.0,
"reward": 1.7083333432674408,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7083333432674408,
"step": 482
},
{
"completion_length": 494.3333511352539,
"epoch": 0.40689365279529216,
"grad_norm": 0.6291866431405297,
"kl": 0.01507568359375,
"learning_rate": 1.3414592103228594e-09,
"loss": 0.0,
"reward": 1.7083333730697632,
"reward_std": 0.1767766922712326,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7083333432674408,
"step": 484
},
{
"completion_length": 407.79168701171875,
"epoch": 0.4085750315258512,
"grad_norm": 0.7527510846206448,
"kl": 0.02587890625,
"learning_rate": 1.0272702311203695e-09,
"loss": 0.0,
"reward": 1.6250000298023224,
"reward_std": 0.4124789535999298,
"rewards/format_reward_func": 0.9583333432674408,
"rewards/solution_reward_func": 0.6666666865348816,
"step": 486
},
{
"completion_length": 463.5000228881836,
"epoch": 0.41025641025641024,
"grad_norm": 0.5925315928631419,
"kl": 0.020721435546875,
"learning_rate": 7.548664020045059e-10,
"loss": 0.0,
"reward": 1.4583333730697632,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.4583333507180214,
"step": 488
},
{
"completion_length": 437.2083511352539,
"epoch": 0.4119377889869693,
"grad_norm": 0.7831672518645952,
"kl": 0.034820556640625,
"learning_rate": 5.242934405720878e-10,
"loss": 0.0,
"reward": 1.7500000596046448,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7500000298023224,
"step": 490
},
{
"completion_length": 444.3333435058594,
"epoch": 0.4136191677175284,
"grad_norm": 0.5783716082619003,
"kl": 0.019439697265625,
"learning_rate": 3.355900439359072e-10,
"loss": 0.0,
"reward": 1.7083333730697632,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7083333432674408,
"step": 492
},
{
"completion_length": 474.0833511352539,
"epoch": 0.41530054644808745,
"grad_norm": 0.5030728051783439,
"kl": 0.02392578125,
"learning_rate": 1.8878788223009035e-10,
"loss": 0.0,
"reward": 1.3750000298023224,
"reward_std": 0.1767766959965229,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.3750000037252903,
"step": 494
},
{
"completion_length": 471.3333435058594,
"epoch": 0.4169819251786465,
"grad_norm": 0.40396130292565613,
"kl": 0.02294921875,
"learning_rate": 8.391159329496079e-11,
"loss": 0.0,
"reward": 1.791666716337204,
"reward_std": 0.2946278266608715,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.7916667014360428,
"step": 496
},
{
"completion_length": 438.1666793823242,
"epoch": 0.41866330390920553,
"grad_norm": 0.4863347024264504,
"kl": 0.01949310302734375,
"learning_rate": 2.097877854204122e-11,
"loss": 0.0,
"reward": 1.666666716337204,
"reward_std": 0.2357022613286972,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6666666939854622,
"step": 498
},
{
"completion_length": 391.7083435058594,
"epoch": 0.4203446826397646,
"grad_norm": 0.6480363403962828,
"kl": 0.0219573974609375,
"learning_rate": 0.0,
"loss": 0.0,
"reward": 1.6250000298023224,
"reward_std": 0.2946278229355812,
"rewards/format_reward_func": 1.0,
"rewards/solution_reward_func": 0.6250000149011612,
"step": 500
},
{
"epoch": 0.4203446826397646,
"step": 500,
"total_flos": 0.0,
"train_loss": 1.7704009043086445e-05,
"train_runtime": 16459.3672,
"train_samples_per_second": 0.182,
"train_steps_per_second": 0.03
}
],
"logging_steps": 2,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}