{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4203446826397646, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 582.8750228881836, "epoch": 0.0016813787305590584, "grad_norm": 0.32669930150754334, "kl": 0.0, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "reward": 0.6666666865348816, "reward_std": 0.3535533882677555, "rewards/format_reward_func": 0.5416666865348816, "rewards/solution_reward_func": 0.1250000037252903, "step": 2 }, { "completion_length": 699.4583587646484, "epoch": 0.003362757461118117, "grad_norm": 0.5103613895194884, "kl": 0.0002980232238769531, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "reward": 0.5833333507180214, "reward_std": 0.4714045189321041, "rewards/format_reward_func": 0.541666679084301, "rewards/solution_reward_func": 0.0416666679084301, "step": 4 }, { "completion_length": 575.2083511352539, "epoch": 0.005044136191677175, "grad_norm": 0.6433224536587134, "kl": 0.0003399848937988281, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.7916666865348816, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 0.7083333656191826, "rewards/solution_reward_func": 0.0833333358168602, "step": 6 }, { "completion_length": 828.2083435058594, "epoch": 0.006725514922236234, "grad_norm": 0.5211006593141526, "kl": 0.00028324127197265625, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "reward": 0.708333358168602, "reward_std": 0.4124789573252201, "rewards/format_reward_func": 0.6250000149011612, "rewards/solution_reward_func": 0.0833333358168602, "step": 8 }, { "completion_length": 572.6666870117188, "epoch": 0.008406893652795292, "grad_norm": 0.7556094738301846, "kl": 0.0003337860107421875, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.833333358168602, "reward_std": 0.3535533882677555, "rewards/format_reward_func": 0.6666666716337204, "rewards/solution_reward_func": 0.1666666716337204, "step": 10 }, { "completion_length": 538.7083511352539, "epoch": 0.01008827238335435, "grad_norm": 0.7513982192873419, "kl": 0.0002989768981933594, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.6666666939854622, "reward_std": 0.4714045189321041, "rewards/format_reward_func": 0.5833333358168602, "rewards/solution_reward_func": 0.0833333358168602, "step": 12 }, { "completion_length": 673.7500152587891, "epoch": 0.011769651113913409, "grad_norm": 0.5852912050950652, "kl": 0.0003135204315185547, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "reward": 0.9166666865348816, "reward_std": 0.4714045189321041, "rewards/format_reward_func": 0.7916666865348816, "rewards/solution_reward_func": 0.1250000037252903, "step": 14 }, { "completion_length": 711.3750305175781, "epoch": 0.013451029844472467, "grad_norm": 0.9204289802606287, "kl": 0.0003161430358886719, "learning_rate": 4.999947552503497e-07, "loss": 0.0, "reward": 0.6250000074505806, "reward_std": 0.5303300842642784, "rewards/format_reward_func": 0.5000000074505806, "rewards/solution_reward_func": 0.1250000037252903, "step": 16 }, { "completion_length": 552.1250152587891, "epoch": 0.015132408575031526, "grad_norm": 0.9100701777961465, "kl": 0.0004177093505859375, "learning_rate": 4.999527985734931e-07, "loss": 0.0, "reward": 0.833333358168602, "reward_std": 0.3535533882677555, "rewards/format_reward_func": 0.75, "rewards/solution_reward_func": 0.0833333358168602, "step": 18 }, { "completion_length": 540.1250228881836, "epoch": 0.016813787305590584, "grad_norm": 0.6037876280305763, "kl": 0.0005059242248535156, "learning_rate": 4.998688922613787e-07, "loss": 0.0, "reward": 0.7916666865348816, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.6666666865348816, "rewards/solution_reward_func": 0.1250000037252903, "step": 20 }, { "completion_length": 590.2500152587891, "epoch": 0.018495166036149643, "grad_norm": 0.9748922522783966, "kl": 0.0004825592041015625, "learning_rate": 4.997430503960219e-07, "loss": 0.0, "reward": 0.6666666939854622, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 0.5416666679084301, "rewards/solution_reward_func": 0.1250000037252903, "step": 22 }, { "completion_length": 689.5416717529297, "epoch": 0.0201765447667087, "grad_norm": 0.4822668116388023, "kl": 0.000446319580078125, "learning_rate": 4.995752940974918e-07, "loss": 0.0, "reward": 0.7916666865348816, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.7083333432674408, "rewards/solution_reward_func": 0.0833333358168602, "step": 24 }, { "completion_length": 556.9166870117188, "epoch": 0.02185792349726776, "grad_norm": 0.46041536357233115, "kl": 0.0004811286926269531, "learning_rate": 4.993656515203662e-07, "loss": 0.0, "reward": 0.8333333432674408, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 0.7500000149011612, "rewards/solution_reward_func": 0.0833333358168602, "step": 26 }, { "completion_length": 597.2916717529297, "epoch": 0.023539302227826818, "grad_norm": 0.5772106772675187, "kl": 0.0006680488586425781, "learning_rate": 4.991141578490066e-07, "loss": 0.0, "reward": 0.9166666865348816, "reward_std": 0.2357022576034069, "rewards/format_reward_func": 0.7916666865348816, "rewards/solution_reward_func": 0.1250000037252903, "step": 28 }, { "completion_length": 610.5416717529297, "epoch": 0.025220680958385876, "grad_norm": 0.8453841880983054, "kl": 0.0008635520935058594, "learning_rate": 4.988208552916535e-07, "loss": 0.0, "reward": 0.9583333730697632, "reward_std": 0.4124789498746395, "rewards/format_reward_func": 0.7916666865348816, "rewards/solution_reward_func": 0.1666666716337204, "step": 30 }, { "completion_length": 672.7083587646484, "epoch": 0.026902059688944935, "grad_norm": 0.5771700431960171, "kl": 0.000789642333984375, "learning_rate": 4.984857930733419e-07, "loss": 0.0, "reward": 0.9166666865348816, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 0.833333358168602, "rewards/solution_reward_func": 0.0833333358168602, "step": 32 }, { "completion_length": 648.0416717529297, "epoch": 0.028583438419503993, "grad_norm": 0.6098823109986267, "kl": 0.0009946823120117188, "learning_rate": 4.981090274276405e-07, "loss": 0.0, "reward": 0.8333333730697632, "reward_std": 0.3535533882677555, "rewards/format_reward_func": 0.708333358168602, "rewards/solution_reward_func": 0.1250000037252903, "step": 34 }, { "completion_length": 821.3333587646484, "epoch": 0.03026481715006305, "grad_norm": 0.6980212217774439, "kl": 0.0006208419799804688, "learning_rate": 4.976906215872137e-07, "loss": 0.0, "reward": 0.8333333432674408, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 0.7500000149011612, "rewards/solution_reward_func": 0.0833333358168602, "step": 36 }, { "completion_length": 620.2500152587891, "epoch": 0.031946195880622114, "grad_norm": 0.0003861918457850463, "kl": 0.0010890960693359375, "learning_rate": 4.97230645773209e-07, "loss": 0.0, "reward": 0.833333358168602, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 0.7916667014360428, "rewards/solution_reward_func": 0.0416666679084301, "step": 38 }, { "completion_length": 533.5833435058594, "epoch": 0.03362757461118117, "grad_norm": 0.19582302389190498, "kl": 0.0016956329345703125, "learning_rate": 4.967291771834726e-07, "loss": 0.0, "reward": 1.0416666865348816, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.833333358168602, "rewards/solution_reward_func": 0.2083333358168602, "step": 40 }, { "completion_length": 554.9166793823242, "epoch": 0.03530895334174023, "grad_norm": 0.43787338966334477, "kl": 0.0012445449829101562, "learning_rate": 4.961862999795923e-07, "loss": 0.0, "reward": 1.0, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 0.833333358168602, "rewards/solution_reward_func": 0.1666666716337204, "step": 42 }, { "completion_length": 631.4166946411133, "epoch": 0.036990332072299285, "grad_norm": 0.7633407511221333, "kl": 0.0015544891357421875, "learning_rate": 4.956021052727731e-07, "loss": 0.0, "reward": 0.8750000149011612, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.7916666865348816, "rewards/solution_reward_func": 0.0833333358168602, "step": 44 }, { "completion_length": 623.9166870117188, "epoch": 0.03867171080285835, "grad_norm": 0.9409898030779542, "kl": 0.0011081695556640625, "learning_rate": 4.949766911085461e-07, "loss": 0.0, "reward": 0.9166666865348816, "reward_std": 0.3535533882677555, "rewards/format_reward_func": 0.7500000149011612, "rewards/solution_reward_func": 0.1666666716337204, "step": 46 }, { "completion_length": 576.1250152587891, "epoch": 0.0403530895334174, "grad_norm": 0.5174245567750604, "kl": 0.0010805130004882812, "learning_rate": 4.943101624503132e-07, "loss": 0.0, "reward": 1.0000000298023224, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 0.833333358168602, "rewards/solution_reward_func": 0.1666666716337204, "step": 48 }, { "completion_length": 464.91668701171875, "epoch": 0.042034468263976464, "grad_norm": 0.721286337233584, "kl": 0.0024700164794921875, "learning_rate": 4.936026311617316e-07, "loss": 0.0, "reward": 0.9166667014360428, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 0.8333333432674408, "rewards/solution_reward_func": 0.0833333358168602, "step": 50 }, { "completion_length": 604.5000228881836, "epoch": 0.04371584699453552, "grad_norm": 0.4391469354984254, "kl": 0.0014371871948242188, "learning_rate": 4.928542159879385e-07, "loss": 0.0, "reward": 1.0833333730697632, "reward_std": 0.3535533882677555, "rewards/format_reward_func": 0.8750000149011612, "rewards/solution_reward_func": 0.2083333395421505, "step": 52 }, { "completion_length": 487.04168701171875, "epoch": 0.04539722572509458, "grad_norm": 0.5796863501691008, "kl": 0.001621246337890625, "learning_rate": 4.920650425356239e-07, "loss": 0.0, "reward": 1.1250000596046448, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.1666666716337204, "step": 54 }, { "completion_length": 702.7917022705078, "epoch": 0.047078604455653636, "grad_norm": 0.45499979356207115, "kl": 0.0010595321655273438, "learning_rate": 4.912352432519484e-07, "loss": 0.0, "reward": 0.9583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9166666865348816, "rewards/solution_reward_func": 0.0416666679084301, "step": 56 }, { "completion_length": 432.5416793823242, "epoch": 0.0487599831862127, "grad_norm": 0.7200412766464303, "kl": 0.002208709716796875, "learning_rate": 4.90364957402315e-07, "loss": 0.0, "reward": 1.2083333432674408, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2500000037252903, "step": 58 }, { "completion_length": 504.87501525878906, "epoch": 0.05044136191677175, "grad_norm": 0.69519393769251, "kl": 0.00209808349609375, "learning_rate": 4.894543310469967e-07, "loss": 0.0, "reward": 0.9583333730697632, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.8750000298023224, "rewards/solution_reward_func": 0.0833333358168602, "step": 60 }, { "completion_length": 665.458366394043, "epoch": 0.052122740647330815, "grad_norm": 0.7276068452685034, "kl": 0.0016222000122070312, "learning_rate": 4.885035170166228e-07, "loss": 0.0, "reward": 0.958333358168602, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.8333333432674408, "rewards/solution_reward_func": 0.1250000037252903, "step": 62 }, { "completion_length": 628.0000152587891, "epoch": 0.05380411937788987, "grad_norm": 0.5896398911030171, "kl": 0.0023822784423828125, "learning_rate": 4.875126748865289e-07, "loss": 0.0, "reward": 1.0416667014360428, "reward_std": 0.4124789535999298, "rewards/format_reward_func": 0.833333358168602, "rewards/solution_reward_func": 0.2083333358168602, "step": 64 }, { "completion_length": 583.1250228881836, "epoch": 0.05548549810844893, "grad_norm": 0.3607454748525709, "kl": 0.0025081634521484375, "learning_rate": 4.864819709499761e-07, "loss": 0.0, "reward": 1.0000000298023224, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 0.8750000298023224, "rewards/solution_reward_func": 0.1250000037252903, "step": 66 }, { "completion_length": 734.5833435058594, "epoch": 0.057166876839007986, "grad_norm": 0.5186256820202049, "kl": 0.0015344619750976562, "learning_rate": 4.854115781902414e-07, "loss": 0.0, "reward": 0.916666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 0.8750000298023224, "rewards/solution_reward_func": 0.0416666679084301, "step": 68 }, { "completion_length": 738.0833511352539, "epoch": 0.05884825556956705, "grad_norm": 0.39707553003489315, "kl": 0.0017681121826171875, "learning_rate": 4.843016762515859e-07, "loss": 0.0, "reward": 1.0416666865348816, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.0833333358168602, "step": 70 }, { "completion_length": 625.6250305175781, "epoch": 0.0605296343001261, "grad_norm": 0.30316010027843815, "kl": 0.001819610595703125, "learning_rate": 4.831524514091056e-07, "loss": 0.0, "reward": 1.0000000298023224, "reward_std": 0.2357022576034069, "rewards/format_reward_func": 0.9166666865348816, "rewards/solution_reward_func": 0.0833333358168602, "step": 72 }, { "completion_length": 636.8750228881836, "epoch": 0.062211013030685165, "grad_norm": 0.4721947459888053, "kl": 0.0018463134765625, "learning_rate": 4.81964096537468e-07, "loss": 0.0, "reward": 1.0416666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.8750000149011612, "rewards/solution_reward_func": 0.1666666679084301, "step": 74 }, { "completion_length": 508.4166793823242, "epoch": 0.06389239176124423, "grad_norm": 0.0005282376556457386, "kl": 0.0027256011962890625, "learning_rate": 4.80736811078543e-07, "loss": 0.0, "reward": 1.1666666865348816, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2083333395421505, "step": 76 }, { "completion_length": 460.00001525878906, "epoch": 0.06557377049180328, "grad_norm": 0.8701537519403687, "kl": 0.0024814605712890625, "learning_rate": 4.794708010079288e-07, "loss": 0.0, "reward": 1.0833333730697632, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 0.9166666865348816, "rewards/solution_reward_func": 0.1666666716337204, "step": 78 }, { "completion_length": 602.1666870117188, "epoch": 0.06725514922236234, "grad_norm": 0.5672242931158207, "kl": 0.005603790283203125, "learning_rate": 4.78166278800385e-07, "loss": 0.0, "reward": 1.041666716337204, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.9166666865348816, "rewards/solution_reward_func": 0.1250000037252903, "step": 80 }, { "completion_length": 547.5416870117188, "epoch": 0.06893652795292139, "grad_norm": 0.6276836580523573, "kl": 0.004207611083984375, "learning_rate": 4.7682346339417157e-07, "loss": 0.0, "reward": 0.9583333432674408, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.8750000298023224, "rewards/solution_reward_func": 0.0833333358168602, "step": 82 }, { "completion_length": 668.9583587646484, "epoch": 0.07061790668348046, "grad_norm": 0.00047527262535576603, "kl": 0.00447845458984375, "learning_rate": 4.754425801543046e-07, "loss": 0.0, "reward": 1.0, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.0416666679084301, "step": 84 }, { "completion_length": 542.7083511352539, "epoch": 0.07229928541403952, "grad_norm": 0.6328447561571717, "kl": 0.002597808837890625, "learning_rate": 4.7402386083473364e-07, "loss": 0.0, "reward": 1.166666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2083333395421505, "step": 86 }, { "completion_length": 570.7500305175781, "epoch": 0.07398066414459857, "grad_norm": 0.48341787098839506, "kl": 0.0023593902587890625, "learning_rate": 4.72567543539446e-07, "loss": 0.0, "reward": 0.9583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9166666865348816, "rewards/solution_reward_func": 0.0416666679084301, "step": 88 }, { "completion_length": 515.6666793823242, "epoch": 0.07566204287515763, "grad_norm": 0.4411126705303185, "kl": 0.003017425537109375, "learning_rate": 4.7107387268250586e-07, "loss": 0.0, "reward": 1.0416666865348816, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.9166666865348816, "rewards/solution_reward_func": 0.1250000037252903, "step": 90 }, { "completion_length": 608.3333435058594, "epoch": 0.0773434216057167, "grad_norm": 0.3226336554890738, "kl": 0.0033473968505859375, "learning_rate": 4.6954309894703426e-07, "loss": 0.0, "reward": 1.166666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.1666666716337204, "step": 92 }, { "completion_length": 456.4166793823242, "epoch": 0.07902480033627575, "grad_norm": 0.7644602389120536, "kl": 0.0032806396484375, "learning_rate": 4.6797547924313673e-07, "loss": 0.0, "reward": 1.1250000298023224, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.1666666716337204, "step": 94 }, { "completion_length": 551.4583587646484, "epoch": 0.0807061790668348, "grad_norm": 0.773881603923625, "kl": 0.003635406494140625, "learning_rate": 4.6637127666478617e-07, "loss": 0.0, "reward": 1.1250000298023224, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.1666666716337204, "step": 96 }, { "completion_length": 494.4583511352539, "epoch": 0.08238755779739386, "grad_norm": 0.6252744428861863, "kl": 0.00386810302734375, "learning_rate": 4.647307604456674e-07, "loss": 0.0, "reward": 1.166666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.1666666716337204, "step": 98 }, { "completion_length": 540.9166870117188, "epoch": 0.08406893652795293, "grad_norm": 0.0005101931995823585, "kl": 0.003147125244140625, "learning_rate": 4.630542059139923e-07, "loss": 0.0, "reward": 1.0, "reward_std": 0.0, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.0, "step": 100 }, { "completion_length": 447.2916793823242, "epoch": 0.08575031525851198, "grad_norm": 0.7334515099745728, "kl": 0.00638580322265625, "learning_rate": 4.613418944462906e-07, "loss": 0.0, "reward": 1.2500000596046448, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000074505806, "step": 102 }, { "completion_length": 465.45835876464844, "epoch": 0.08743169398907104, "grad_norm": 0.2608326039080234, "kl": 0.004886627197265625, "learning_rate": 4.5959411342018704e-07, "loss": 0.0, "reward": 1.1250000298023224, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.1666666679084301, "step": 104 }, { "completion_length": 479.9166946411133, "epoch": 0.0891130727196301, "grad_norm": 0.2883550467956107, "kl": 0.00536346435546875, "learning_rate": 4.578111561661702e-07, "loss": 0.0, "reward": 1.0833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.1250000037252903, "step": 106 }, { "completion_length": 424.5416793823242, "epoch": 0.09079445145018916, "grad_norm": 0.6124946815591971, "kl": 0.00507354736328125, "learning_rate": 4.559933219183631e-07, "loss": 0.0, "reward": 1.0000000298023224, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.0416666679084301, "step": 108 }, { "completion_length": 374.3333435058594, "epoch": 0.09247583018074822, "grad_norm": 0.003755671104748954, "kl": 0.01043701171875, "learning_rate": 4.541409157643027e-07, "loss": 0.0, "reward": 1.1250000298023224, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.1250000037252903, "step": 110 }, { "completion_length": 527.3750228881836, "epoch": 0.09415720891130727, "grad_norm": 0.5338777369576395, "kl": 0.00551605224609375, "learning_rate": 4.5225424859373684e-07, "loss": 0.0, "reward": 1.166666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2083333395421505, "step": 112 }, { "completion_length": 381.62500762939453, "epoch": 0.09583858764186633, "grad_norm": 0.47889287389319934, "kl": 0.007801055908203125, "learning_rate": 4.503336370464475e-07, "loss": 0.0, "reward": 1.1250000298023224, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.1250000037252903, "step": 114 }, { "completion_length": 464.70835876464844, "epoch": 0.0975199663724254, "grad_norm": 0.5542831400022328, "kl": 0.005016326904296875, "learning_rate": 4.4837940345910917e-07, "loss": 0.0, "reward": 1.1250000298023224, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.1666666716337204, "step": 116 }, { "completion_length": 537.0000076293945, "epoch": 0.09920134510298445, "grad_norm": 0.32822474431275217, "kl": 0.005290985107421875, "learning_rate": 4.4639187581119116e-07, "loss": 0.0, "reward": 1.0, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.0416666679084301, "step": 118 }, { "completion_length": 449.8333435058594, "epoch": 0.1008827238335435, "grad_norm": 0.6078651118658578, "kl": 0.005950927734375, "learning_rate": 4.443713876699123e-07, "loss": 0.0, "reward": 1.1250000596046448, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.1666666716337204, "step": 120 }, { "completion_length": 487.08335876464844, "epoch": 0.10256410256410256, "grad_norm": 0.0015859284209239941, "kl": 0.00882720947265625, "learning_rate": 4.423182781342588e-07, "loss": 0.0, "reward": 1.0833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.0833333358168602, "step": 122 }, { "completion_length": 477.7916793823242, "epoch": 0.10424548129466163, "grad_norm": 0.8316436306146491, "kl": 0.00827789306640625, "learning_rate": 4.402328917780728e-07, "loss": 0.0, "reward": 1.041666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.0833333358168602, "step": 124 }, { "completion_length": 360.37500762939453, "epoch": 0.10592686002522068, "grad_norm": 0.5068283502332778, "kl": 0.0106048583984375, "learning_rate": 4.381155785922225e-07, "loss": 0.0, "reward": 1.166666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.1666666716337204, "step": 126 }, { "completion_length": 373.25000762939453, "epoch": 0.10760823875577974, "grad_norm": 0.0008593493041121279, "kl": 0.0091400146484375, "learning_rate": 4.3596669392586363e-07, "loss": 0.0, "reward": 1.0833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.0833333358168602, "step": 128 }, { "completion_length": 373.3333435058594, "epoch": 0.1092896174863388, "grad_norm": 0.7307467398455231, "kl": 0.0075836181640625, "learning_rate": 4.337865984268001e-07, "loss": 0.0, "reward": 1.1250000298023224, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.1250000037252903, "step": 130 }, { "completion_length": 394.6666717529297, "epoch": 0.11097099621689786, "grad_norm": 0.8811859841007285, "kl": 0.010345458984375, "learning_rate": 4.3157565798095746e-07, "loss": 0.0, "reward": 1.2500000596046448, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000074505806, "step": 132 }, { "completion_length": 341.2916793823242, "epoch": 0.11265237494745692, "grad_norm": 0.7652707329784861, "kl": 0.0125885009765625, "learning_rate": 4.293342436509756e-07, "loss": 0.0, "reward": 1.1666666865348816, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 0.9166666865348816, "rewards/solution_reward_func": 0.2500000037252903, "step": 134 }, { "completion_length": 478.3333435058594, "epoch": 0.11433375367801597, "grad_norm": 0.6268940088226042, "kl": 0.0119781494140625, "learning_rate": 4.2706273161393326e-07, "loss": 0.0, "reward": 1.1666667461395264, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.1666666716337204, "step": 136 }, { "completion_length": 357.8333435058594, "epoch": 0.11601513240857503, "grad_norm": 0.6940515666291348, "kl": 0.0140228271484375, "learning_rate": 4.2476150309821437e-07, "loss": 0.0, "reward": 1.2083333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2083333395421505, "step": 138 }, { "completion_length": 398.0833435058594, "epoch": 0.1176965111391341, "grad_norm": 0.4068021983578087, "kl": 0.0218048095703125, "learning_rate": 4.2243094431952607e-07, "loss": 0.0, "reward": 1.2916666865348816, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.3333333395421505, "step": 140 }, { "completion_length": 391.5416717529297, "epoch": 0.11937788986969315, "grad_norm": 1.054400750652676, "kl": 0.0133209228515625, "learning_rate": 4.2007144641608035e-07, "loss": 0.0, "reward": 1.2500000596046448, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000074505806, "step": 142 }, { "completion_length": 362.9166717529297, "epoch": 0.1210592686002522, "grad_norm": 0.7202533244508015, "kl": 0.0169525146484375, "learning_rate": 4.1768340538294914e-07, "loss": 0.0, "reward": 1.2083333730697632, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2500000037252903, "step": 144 }, { "completion_length": 392.75000762939453, "epoch": 0.12274064733081126, "grad_norm": 0.5668355112824374, "kl": 0.0153656005859375, "learning_rate": 4.1526722200560436e-07, "loss": 0.0, "reward": 1.2083333432674408, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2083333358168602, "step": 146 }, { "completion_length": 488.00001525878906, "epoch": 0.12442202606137033, "grad_norm": 0.4109654167954109, "kl": 0.0104522705078125, "learning_rate": 4.1282330179265377e-07, "loss": 0.0, "reward": 1.0833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.0833333358168602, "step": 148 }, { "completion_length": 379.00000762939453, "epoch": 0.12610340479192939, "grad_norm": 0.6511266004361977, "kl": 0.0153656005859375, "learning_rate": 4.1035205490778496e-07, "loss": 0.0, "reward": 1.291666716337204, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2916666716337204, "step": 150 }, { "completion_length": 477.4583511352539, "epoch": 0.12778478352248845, "grad_norm": 0.41009026844574226, "kl": 0.0298004150390625, "learning_rate": 4.078538961009268e-07, "loss": 0.0, "reward": 1.2500000596046448, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000037252903, "step": 152 }, { "completion_length": 380.6666793823242, "epoch": 0.1294661622530475, "grad_norm": 0.41679517817122624, "kl": 0.0112762451171875, "learning_rate": 4.0532924463864214e-07, "loss": 0.0, "reward": 1.2500000298023224, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000037252903, "step": 154 }, { "completion_length": 331.00001525878906, "epoch": 0.13114754098360656, "grad_norm": 0.002301312664670671, "kl": 0.0154876708984375, "learning_rate": 4.027785242337625e-07, "loss": 0.0, "reward": 1.291666716337204, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2916666753590107, "step": 156 }, { "completion_length": 337.1666717529297, "epoch": 0.1328289197141656, "grad_norm": 0.41037493061839153, "kl": 0.01825714111328125, "learning_rate": 4.002021629742759e-07, "loss": 0.0, "reward": 1.25, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.25, "step": 158 }, { "completion_length": 355.3333435058594, "epoch": 0.13451029844472467, "grad_norm": 0.4834324481747559, "kl": 0.0101470947265625, "learning_rate": 3.9760059325148063e-07, "loss": 0.0, "reward": 1.291666716337204, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2916666753590107, "step": 160 }, { "completion_length": 475.8333511352539, "epoch": 0.13619167717528374, "grad_norm": 0.0007131492797933627, "kl": 0.00870513916015625, "learning_rate": 3.949742516874175e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333432674408, "step": 162 }, { "completion_length": 400.0416793823242, "epoch": 0.13787305590584278, "grad_norm": 0.6704272888706837, "kl": 0.01324462890625, "learning_rate": 3.9232357906159065e-07, "loss": 0.0, "reward": 1.2500000596046448, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000074505806, "step": 164 }, { "completion_length": 391.62501525878906, "epoch": 0.13955443463640185, "grad_norm": 0.8927607188893651, "kl": 0.00870513916015625, "learning_rate": 3.8964902023699234e-07, "loss": 0.0, "reward": 1.2916666865348816, "reward_std": 0.4124789498746395, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2916666679084301, "step": 166 }, { "completion_length": 432.1666793823242, "epoch": 0.14123581336696092, "grad_norm": 0.5356539936839141, "kl": 0.00945281982421875, "learning_rate": 3.869510240854407e-07, "loss": 0.0, "reward": 1.2500000596046448, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000037252903, "step": 168 }, { "completion_length": 400.9583511352539, "epoch": 0.14291719209751996, "grad_norm": 0.6797066476960871, "kl": 0.0103912353515625, "learning_rate": 3.8423004341224595e-07, "loss": 0.0, "reward": 1.2083334028720856, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2083333395421505, "step": 170 }, { "completion_length": 465.2083511352539, "epoch": 0.14459857082807903, "grad_norm": 0.7805415268970916, "kl": 0.00925445556640625, "learning_rate": 3.8148653488021566e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3333333395421505, "step": 172 }, { "completion_length": 435.0833435058594, "epoch": 0.14627994955863807, "grad_norm": 0.4294043427967522, "kl": 0.006561279296875, "learning_rate": 3.787209589330134e-07, "loss": 0.0, "reward": 1.2083333730697632, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2500000037252903, "step": 174 }, { "completion_length": 474.00001525878906, "epoch": 0.14796132828919714, "grad_norm": 0.5780056567965537, "kl": 0.0077667236328125, "learning_rate": 3.759337797178816e-07, "loss": 0.0, "reward": 1.2916667461395264, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2916666753590107, "step": 176 }, { "completion_length": 483.50000762939453, "epoch": 0.1496427070197562, "grad_norm": 0.7711416518392432, "kl": 0.00598907470703125, "learning_rate": 3.7312546500774455e-07, "loss": 0.0, "reward": 1.4583334028720856, "reward_std": 0.4124789573252201, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333469927311, "step": 178 }, { "completion_length": 560.7916793823242, "epoch": 0.15132408575031525, "grad_norm": 0.6275316044622387, "kl": 0.007049560546875, "learning_rate": 3.7029648612270123e-07, "loss": 0.0, "reward": 1.1250000298023224, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.1666666679084301, "step": 180 }, { "completion_length": 493.2916793823242, "epoch": 0.15300546448087432, "grad_norm": 0.41941279841402085, "kl": 0.00750732421875, "learning_rate": 3.6744731785092393e-07, "loss": 0.0, "reward": 1.1666666865348816, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2083333395421505, "step": 182 }, { "completion_length": 431.25001525878906, "epoch": 0.1546868432114334, "grad_norm": 0.5134185107434652, "kl": 0.0072479248046875, "learning_rate": 3.6457843836897417e-07, "loss": 0.0, "reward": 1.416666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4166666753590107, "step": 184 }, { "completion_length": 427.2916717529297, "epoch": 0.15636822194199243, "grad_norm": 0.498752511394473, "kl": 0.01192474365234375, "learning_rate": 3.6169032916155055e-07, "loss": 0.0, "reward": 1.2083334028720856, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2083333395421505, "step": 186 }, { "completion_length": 529.5833435058594, "epoch": 0.1580496006725515, "grad_norm": 0.484964125720315, "kl": 0.0084381103515625, "learning_rate": 3.587834749406808e-07, "loss": 0.0, "reward": 1.2083333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2500000074505806, "step": 188 }, { "completion_length": 442.75000762939453, "epoch": 0.15973097940311054, "grad_norm": 0.29670849204677346, "kl": 0.00942230224609375, "learning_rate": 3.558583635643726e-07, "loss": 0.0, "reward": 1.2500000298023224, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000111758709, "step": 190 }, { "completion_length": 461.12501525878906, "epoch": 0.1614123581336696, "grad_norm": 0.8668624278803033, "kl": 0.010345458984375, "learning_rate": 3.52915485954736e-07, "loss": 0.0, "reward": 1.2500000596046448, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000074505806, "step": 192 }, { "completion_length": 552.1250076293945, "epoch": 0.16309373686422868, "grad_norm": 0.5686603271249641, "kl": 0.00679779052734375, "learning_rate": 3.4995533601559225e-07, "loss": 0.0, "reward": 1.2500000596046448, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2916666753590107, "step": 194 }, { "completion_length": 489.75000762939453, "epoch": 0.16477511559478772, "grad_norm": 0.9001750933871568, "kl": 0.01209259033203125, "learning_rate": 3.469784105495816e-07, "loss": 0.0, "reward": 1.3750000298023224, "reward_std": 0.4124789573252201, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.4166666716337204, "step": 196 }, { "completion_length": 504.29168701171875, "epoch": 0.1664564943253468, "grad_norm": 0.351872496477236, "kl": 0.01596832275390625, "learning_rate": 3.4398520917478476e-07, "loss": 0.0, "reward": 1.2500000298023224, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000037252903, "step": 198 }, { "completion_length": 551.7083511352539, "epoch": 0.16813787305590586, "grad_norm": 0.7904185495449961, "kl": 0.0072021484375, "learning_rate": 3.409762342408719e-07, "loss": 0.0, "reward": 1.2916666865348816, "reward_std": 0.4124789535999298, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2916666716337204, "step": 200 }, { "completion_length": 436.7916793823242, "epoch": 0.1698192517864649, "grad_norm": 0.30374732572574603, "kl": 0.01061248779296875, "learning_rate": 3.379519907447931e-07, "loss": 0.0, "reward": 1.2916666865348816, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2916666679084301, "step": 202 }, { "completion_length": 494.2083511352539, "epoch": 0.17150063051702397, "grad_norm": 0.4216305294562641, "kl": 0.00971221923828125, "learning_rate": 3.349129862460251e-07, "loss": 0.0, "reward": 1.2500000596046448, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2500000074505806, "step": 204 }, { "completion_length": 436.58333587646484, "epoch": 0.173182009247583, "grad_norm": 0.34898976819652716, "kl": 0.00868988037109375, "learning_rate": 3.318597307813866e-07, "loss": 0.0, "reward": 1.5416666865348816, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.541666679084301, "step": 206 }, { "completion_length": 552.3333511352539, "epoch": 0.17486338797814208, "grad_norm": 0.001036227801857544, "kl": 0.00815582275390625, "learning_rate": 3.287927367794397e-07, "loss": 0.0, "reward": 1.125, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.125, "step": 208 }, { "completion_length": 541.2500228881836, "epoch": 0.17654476670870115, "grad_norm": 0.6275771663735162, "kl": 0.11170196533203125, "learning_rate": 3.2571251897448763e-07, "loss": 0.0001, "reward": 1.3333333432674408, "reward_std": 0.2357022576034069, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3333333358168602, "step": 210 }, { "completion_length": 530.9166717529297, "epoch": 0.1782261454392602, "grad_norm": 0.5568701150128608, "kl": 0.011932373046875, "learning_rate": 3.226195943201883e-07, "loss": 0.0, "reward": 1.041666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.0833333358168602, "step": 212 }, { "completion_length": 503.0416717529297, "epoch": 0.17990752416981926, "grad_norm": 0.668445204981218, "kl": 0.01616668701171875, "learning_rate": 3.1951448190279253e-07, "loss": 0.0, "reward": 1.4583334028720856, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333469927311, "step": 214 }, { "completion_length": 493.5416717529297, "epoch": 0.18158890290037832, "grad_norm": 0.8478573951713925, "kl": 0.00855255126953125, "learning_rate": 3.163977028540263e-07, "loss": 0.0, "reward": 1.416666716337204, "reward_std": 0.4714045189321041, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4166666753590107, "step": 216 }, { "completion_length": 591.0416870117188, "epoch": 0.18327028163093737, "grad_norm": 0.7348765645347503, "kl": 0.0094451904296875, "learning_rate": 3.1326978026362905e-07, "loss": 0.0, "reward": 1.2083333730697632, "reward_std": 0.4124789535999298, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2500000074505806, "step": 218 }, { "completion_length": 406.12500762939453, "epoch": 0.18495166036149643, "grad_norm": 0.6283226728886679, "kl": 0.00975799560546875, "learning_rate": 3.101312390915634e-07, "loss": 0.0, "reward": 1.2916667461395264, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2916666753590107, "step": 220 }, { "completion_length": 434.6666793823242, "epoch": 0.18663303909205547, "grad_norm": 0.46740684202706906, "kl": 0.0135955810546875, "learning_rate": 3.069826060799109e-07, "loss": 0.0, "reward": 1.2083333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.2500000074505806, "step": 222 }, { "completion_length": 512.3750152587891, "epoch": 0.18831441782261454, "grad_norm": 0.6829987852089712, "kl": 0.0085601806640625, "learning_rate": 3.038244096644687e-07, "loss": 0.0, "reward": 1.3750000298023224, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3750000037252903, "step": 224 }, { "completion_length": 443.3333435058594, "epoch": 0.1899957965531736, "grad_norm": 0.7302277802124593, "kl": 0.0476226806640625, "learning_rate": 3.0065717988606256e-07, "loss": 0.0, "reward": 1.3750000298023224, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3750000149011612, "step": 226 }, { "completion_length": 450.37500762939453, "epoch": 0.19167717528373265, "grad_norm": 0.0033526514387488097, "kl": 0.0142059326171875, "learning_rate": 2.974814483015892e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333432674408, "step": 228 }, { "completion_length": 484.16668701171875, "epoch": 0.19335855401429172, "grad_norm": 0.4236067078997056, "kl": 0.01483154296875, "learning_rate": 2.942977478948057e-07, "loss": 0.0, "reward": 1.291666716337204, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.3333333395421505, "step": 230 }, { "completion_length": 470.75001525878906, "epoch": 0.1950399327448508, "grad_norm": 0.5734136074701052, "kl": 0.0141448974609375, "learning_rate": 2.911066129868782e-07, "loss": 0.0, "reward": 1.5000000298023224, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000074505806, "step": 232 }, { "completion_length": 539.8750152587891, "epoch": 0.19672131147540983, "grad_norm": 0.8539808804727996, "kl": 0.02154541015625, "learning_rate": 2.87908579146707e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_std": 0.3535533882677555, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.3750000074505806, "step": 234 }, { "completion_length": 491.7500228881836, "epoch": 0.1984026902059689, "grad_norm": 0.3028340455454943, "kl": 0.0213470458984375, "learning_rate": 2.847041831010417e-07, "loss": 0.0, "reward": 1.3750000596046448, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3750000074505806, "step": 236 }, { "completion_length": 376.50000762939453, "epoch": 0.20008406893652794, "grad_norm": 0.31222669099640876, "kl": 0.020355224609375, "learning_rate": 2.8149396264440227e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333432674408, "step": 238 }, { "completion_length": 343.5833435058594, "epoch": 0.201765447667087, "grad_norm": 0.7628977182906207, "kl": 0.0173187255859375, "learning_rate": 2.782784565488211e-07, "loss": 0.0, "reward": 1.4166666865348816, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4166666679084301, "step": 240 }, { "completion_length": 347.7916717529297, "epoch": 0.20344682639764608, "grad_norm": 0.9647796082832435, "kl": 0.033447265625, "learning_rate": 2.7505820447342024e-07, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5416666828095913, "step": 242 }, { "completion_length": 367.0833435058594, "epoch": 0.20512820512820512, "grad_norm": 0.5497011882389701, "kl": 0.01873779296875, "learning_rate": 2.7183374687384096e-07, "loss": 0.0, "reward": 1.3750000298023224, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3750000111758709, "step": 244 }, { "completion_length": 386.12500762939453, "epoch": 0.2068095838587642, "grad_norm": 0.7366647205721275, "kl": 0.028411865234375, "learning_rate": 2.686056249115385e-07, "loss": 0.0, "reward": 1.3333333432674408, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3333333358168602, "step": 246 }, { "completion_length": 490.3333435058594, "epoch": 0.20849096258932326, "grad_norm": 0.3640781084742551, "kl": 0.0124053955078125, "learning_rate": 2.653743803629587e-07, "loss": 0.0, "reward": 1.3750000298023224, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3750000037252903, "step": 248 }, { "completion_length": 496.6666793823242, "epoch": 0.2101723413198823, "grad_norm": 0.4955566369463096, "kl": 0.02813720703125, "learning_rate": 2.621405555286121e-07, "loss": 0.0, "reward": 1.3750000596046448, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3750000111758709, "step": 250 }, { "completion_length": 403.0000114440918, "epoch": 0.21185372005044137, "grad_norm": 0.4129623000055268, "kl": 0.02422332763671875, "learning_rate": 2.589046931420589e-07, "loss": 0.0, "reward": 1.5000000596046448, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000149011612, "step": 252 }, { "completion_length": 381.75000762939453, "epoch": 0.2135350987810004, "grad_norm": 0.3516450115328973, "kl": 0.0178985595703125, "learning_rate": 2.556673362788225e-07, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.541666679084301, "step": 254 }, { "completion_length": 458.0416793823242, "epoch": 0.21521647751155948, "grad_norm": 0.4125605646498622, "kl": 0.0144500732421875, "learning_rate": 2.524290282652443e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333395421505, "step": 256 }, { "completion_length": 514.2083511352539, "epoch": 0.21689785624211855, "grad_norm": 0.2564484003528315, "kl": 0.0143585205078125, "learning_rate": 2.4919031258729785e-07, "loss": 0.0, "reward": 1.291666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.2916666716337204, "step": 258 }, { "completion_length": 437.8333511352539, "epoch": 0.2185792349726776, "grad_norm": 1.0909769396928994, "kl": 0.015350341796875, "learning_rate": 2.459517327993746e-07, "loss": 0.0, "reward": 1.5000000298023224, "reward_std": 0.3535533882677555, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000074505806, "step": 260 }, { "completion_length": 408.3333435058594, "epoch": 0.22026061370323666, "grad_norm": 0.4944126035221868, "kl": 0.036376953125, "learning_rate": 2.427138324330601e-07, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666939854622, "step": 262 }, { "completion_length": 484.45835876464844, "epoch": 0.22194199243379573, "grad_norm": 0.5710232427079407, "kl": 0.0914764404296875, "learning_rate": 2.3947715490591203e-07, "loss": 0.0001, "reward": 1.541666716337204, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.541666679084301, "step": 264 }, { "completion_length": 381.6666793823242, "epoch": 0.22362337116435477, "grad_norm": 0.5414781111970816, "kl": 0.0176849365234375, "learning_rate": 2.3624224343025876e-07, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.2357022576034069, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333432674408, "step": 266 }, { "completion_length": 381.9166793823242, "epoch": 0.22530474989491384, "grad_norm": 0.2752143657107066, "kl": 0.015380859375, "learning_rate": 2.3300964092203203e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333432674408, "step": 268 }, { "completion_length": 409.4166717529297, "epoch": 0.22698612862547288, "grad_norm": 0.7197256320386043, "kl": 0.012451171875, "learning_rate": 2.2977988990964896e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333432674408, "step": 270 }, { "completion_length": 402.5416717529297, "epoch": 0.22866750735603195, "grad_norm": 1.0400541151794251, "kl": 0.0240020751953125, "learning_rate": 2.2655353244295927e-07, "loss": 0.0, "reward": 1.5000000298023224, "reward_std": 0.3535533882677555, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000074505806, "step": 272 }, { "completion_length": 459.62501525878906, "epoch": 0.23034888608659101, "grad_norm": 0.5084744997876609, "kl": 0.020263671875, "learning_rate": 2.233311100022734e-07, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5416666828095913, "step": 274 }, { "completion_length": 366.4583435058594, "epoch": 0.23203026481715006, "grad_norm": 0.7424502755105113, "kl": 0.1666107177734375, "learning_rate": 2.2011316340748528e-07, "loss": 0.0002, "reward": 1.6250000298023224, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6250000149011612, "step": 276 }, { "completion_length": 495.16668701171875, "epoch": 0.23371164354770912, "grad_norm": 0.8257211078506724, "kl": 0.0146484375, "learning_rate": 2.1690023272730678e-07, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.5303300879895687, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.5833333432674408, "step": 278 }, { "completion_length": 457.7916793823242, "epoch": 0.2353930222782682, "grad_norm": 0.42375918647040944, "kl": 0.0121307373046875, "learning_rate": 2.1369285718862748e-07, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.541666679084301, "step": 280 }, { "completion_length": 420.37500762939453, "epoch": 0.23707440100882723, "grad_norm": 0.5152043630269939, "kl": 0.0162506103515625, "learning_rate": 2.104915750860164e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333395421505, "step": 282 }, { "completion_length": 385.8333511352539, "epoch": 0.2387557797393863, "grad_norm": 0.49294668816422704, "kl": 0.0164794921875, "learning_rate": 2.072969236913799e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3333333358168602, "step": 284 }, { "completion_length": 392.62500762939453, "epoch": 0.24043715846994534, "grad_norm": 0.6512225875746797, "kl": 0.01849365234375, "learning_rate": 2.0410943916379097e-07, "loss": 0.0, "reward": 1.416666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4166666753590107, "step": 286 }, { "completion_length": 412.12501525878906, "epoch": 0.2421185372005044, "grad_norm": 0.3660817551390846, "kl": 0.010711669921875, "learning_rate": 2.0092965645950564e-07, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333395421505, "step": 288 }, { "completion_length": 430.12500762939453, "epoch": 0.24379991593106348, "grad_norm": 0.626448385607845, "kl": 0.0183258056640625, "learning_rate": 1.977581092421812e-07, "loss": 0.0, "reward": 1.416666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4166666753590107, "step": 290 }, { "completion_length": 396.7083435058594, "epoch": 0.24548129466162252, "grad_norm": 0.004913345703168958, "kl": 0.020263671875, "learning_rate": 1.9459532979331148e-07, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5416666865348816, "step": 292 }, { "completion_length": 493.9583511352539, "epoch": 0.2471626733921816, "grad_norm": 0.5565359495913534, "kl": 0.0181427001953125, "learning_rate": 1.9144184892289336e-07, "loss": 0.0, "reward": 1.4583333432674408, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.5000000074505806, "step": 294 }, { "completion_length": 422.45835876464844, "epoch": 0.24884405212274066, "grad_norm": 0.449063011244765, "kl": 0.0212249755859375, "learning_rate": 1.882981958803414e-07, "loss": 0.0, "reward": 1.4583333432674408, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333358168602, "step": 296 }, { "completion_length": 511.58335876464844, "epoch": 0.2505254308532997, "grad_norm": 0.5020099782167112, "kl": 0.011138916015625, "learning_rate": 1.8516489826566374e-07, "loss": 0.0, "reward": 1.4583333432674408, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333358168602, "step": 298 }, { "completion_length": 412.25000762939453, "epoch": 0.25220680958385877, "grad_norm": 0.0014012414260649606, "kl": 0.02069091796875, "learning_rate": 1.8204248194091425e-07, "loss": 0.0, "reward": 1.7083333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7083333507180214, "step": 300 }, { "completion_length": 450.8333435058594, "epoch": 0.25388818831441784, "grad_norm": 0.24821505940896457, "kl": 0.013641357421875, "learning_rate": 1.7893147094193784e-07, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.5833333432674408, "step": 302 }, { "completion_length": 407.37501525878906, "epoch": 0.2555695670449769, "grad_norm": 0.7504631135855148, "kl": 0.0157470703125, "learning_rate": 1.7583238739042084e-07, "loss": 0.0, "reward": 1.7083333432674408, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7083333432674408, "step": 304 }, { "completion_length": 454.37500762939453, "epoch": 0.2572509457755359, "grad_norm": 0.6337846991609941, "kl": 0.0143890380859375, "learning_rate": 1.7274575140626315e-07, "loss": 0.0, "reward": 1.7083334028720856, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.708333358168602, "step": 306 }, { "completion_length": 444.0833435058594, "epoch": 0.258932324506095, "grad_norm": 0.6680546776528479, "kl": 0.0210113525390625, "learning_rate": 1.6967208102028696e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333432674408, "step": 308 }, { "completion_length": 415.62500762939453, "epoch": 0.26061370323665406, "grad_norm": 0.390703161764533, "kl": 0.054595947265625, "learning_rate": 1.6661189208729489e-07, "loss": 0.0001, "reward": 1.5000000298023224, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000149011612, "step": 310 }, { "completion_length": 390.83333587646484, "epoch": 0.26229508196721313, "grad_norm": 0.42044370258908614, "kl": 0.016571044921875, "learning_rate": 1.6356569819949427e-07, "loss": 0.0, "reward": 1.5000000298023224, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000149011612, "step": 312 }, { "completion_length": 430.87501525878906, "epoch": 0.2639764606977722, "grad_norm": 0.30802749969680127, "kl": 0.020721435546875, "learning_rate": 1.6053401060030097e-07, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7500000298023224, "step": 314 }, { "completion_length": 381.2916793823242, "epoch": 0.2656578394283312, "grad_norm": 0.4549527318175164, "kl": 0.0202789306640625, "learning_rate": 1.57517338098537e-07, "loss": 0.0, "reward": 1.5833333432674408, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333358168602, "step": 316 }, { "completion_length": 365.08333587646484, "epoch": 0.2673392181588903, "grad_norm": 0.46488928219712783, "kl": 0.032012939453125, "learning_rate": 1.545161869830371e-07, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666939854622, "step": 318 }, { "completion_length": 353.62501525878906, "epoch": 0.26902059688944935, "grad_norm": 0.8439115565804695, "kl": 0.018096923828125, "learning_rate": 1.5153106093767825e-07, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7500000298023224, "step": 320 }, { "completion_length": 425.3333511352539, "epoch": 0.2707019756200084, "grad_norm": 0.0030299941880457203, "kl": 0.016815185546875, "learning_rate": 1.4856246095684622e-07, "loss": 0.0, "reward": 1.8750000298023224, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.8750000149011612, "step": 322 }, { "completion_length": 408.4583435058594, "epoch": 0.2723833543505675, "grad_norm": 0.45480636781617306, "kl": 0.024200439453125, "learning_rate": 1.4561088526135374e-07, "loss": 0.0, "reward": 1.4583333432674408, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333358168602, "step": 324 }, { "completion_length": 354.9583435058594, "epoch": 0.2740647330811265, "grad_norm": 0.3157474969297678, "kl": 0.019775390625, "learning_rate": 1.4267682921482356e-07, "loss": 0.0, "reward": 1.5000000298023224, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000074505806, "step": 326 }, { "completion_length": 461.16668701171875, "epoch": 0.27574611181168557, "grad_norm": 0.8364092517911256, "kl": 0.0169219970703125, "learning_rate": 1.3976078524055203e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3333333395421505, "step": 328 }, { "completion_length": 403.95833587646484, "epoch": 0.27742749054224464, "grad_norm": 0.8372601176868383, "kl": 0.0169219970703125, "learning_rate": 1.3686324273886528e-07, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333469927311, "step": 330 }, { "completion_length": 381.5833435058594, "epoch": 0.2791088692728037, "grad_norm": 0.7788446171406045, "kl": 0.026214599609375, "learning_rate": 1.339846880049829e-07, "loss": 0.0, "reward": 1.5000000298023224, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000111758709, "step": 332 }, { "completion_length": 487.7916793823242, "epoch": 0.2807902480033628, "grad_norm": 0.9953471506857543, "kl": 0.0202484130859375, "learning_rate": 1.3112560414740313e-07, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.541666679084301, "step": 334 }, { "completion_length": 499.62500762939453, "epoch": 0.28247162673392184, "grad_norm": 0.35616817874834467, "kl": 0.02581787109375, "learning_rate": 1.2828647100682261e-07, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.666666679084301, "step": 336 }, { "completion_length": 454.2083435058594, "epoch": 0.28415300546448086, "grad_norm": 0.5428149223244878, "kl": 0.014923095703125, "learning_rate": 1.2546776507560467e-07, "loss": 0.0, "reward": 1.5000000298023224, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000149011612, "step": 338 }, { "completion_length": 399.4166717529297, "epoch": 0.2858343841950399, "grad_norm": 0.9031786183823988, "kl": 0.0172576904296875, "learning_rate": 1.2266995941780933e-07, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.541666679084301, "step": 340 }, { "completion_length": 453.66667556762695, "epoch": 0.287515762925599, "grad_norm": 0.6316532508922427, "kl": 0.040191650390625, "learning_rate": 1.1989352358979888e-07, "loss": 0.0, "reward": 1.5000000298023224, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.541666679084301, "step": 342 }, { "completion_length": 392.25000762939453, "epoch": 0.28919714165615806, "grad_norm": 0.7866237262244063, "kl": 0.0247802734375, "learning_rate": 1.1713892356143238e-07, "loss": 0.0, "reward": 1.8333333730697632, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.8333333730697632, "step": 344 }, { "completion_length": 382.58333587646484, "epoch": 0.29087852038671713, "grad_norm": 0.8580927088394277, "kl": 0.0238189697265625, "learning_rate": 1.1440662163786166e-07, "loss": 0.0, "reward": 1.416666716337204, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.416666679084301, "step": 346 }, { "completion_length": 476.0416793823242, "epoch": 0.29255989911727615, "grad_norm": 0.3362942892798578, "kl": 0.0136871337890625, "learning_rate": 1.1169707638194237e-07, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7500000298023224, "step": 348 }, { "completion_length": 538.2083511352539, "epoch": 0.2942412778478352, "grad_norm": 0.2941167148030311, "kl": 0.01708984375, "learning_rate": 1.0901074253727336e-07, "loss": 0.0, "reward": 1.6666666865348816, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.666666679084301, "step": 350 }, { "completion_length": 380.75, "epoch": 0.2959226565783943, "grad_norm": 0.5138201935880581, "kl": 0.023223876953125, "learning_rate": 1.0634807095187737e-07, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333432674408, "step": 352 }, { "completion_length": 380.4583435058594, "epoch": 0.29760403530895335, "grad_norm": 0.6613574228632922, "kl": 0.14385986328125, "learning_rate": 1.0370950850253449e-07, "loss": 0.0001, "reward": 1.6666666865348816, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666865348816, "step": 354 }, { "completion_length": 424.87500762939453, "epoch": 0.2992854140395124, "grad_norm": 0.4785401178732256, "kl": 0.0345916748046875, "learning_rate": 1.0109549801978304e-07, "loss": 0.0, "reward": 1.7083333432674408, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7083333432674408, "step": 356 }, { "completion_length": 348.7916793823242, "epoch": 0.30096679277007143, "grad_norm": 0.0020426538152803417, "kl": 0.024261474609375, "learning_rate": 9.850647821359917e-08, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333432674408, "step": 358 }, { "completion_length": 475.79168701171875, "epoch": 0.3026481715006305, "grad_norm": 0.429195015613074, "kl": 0.0204010009765625, "learning_rate": 9.594288359976815e-08, "loss": 0.0, "reward": 1.6250000596046448, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6250000149011612, "step": 360 }, { "completion_length": 494.25001525878906, "epoch": 0.30432955023118957, "grad_norm": 0.8473545963282998, "kl": 0.0186004638671875, "learning_rate": 9.340514442695952e-08, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.4124789573252201, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5416666865348816, "step": 362 }, { "completion_length": 492.8750228881836, "epoch": 0.30601092896174864, "grad_norm": 0.3234727873687572, "kl": 0.01751708984375, "learning_rate": 9.089368660451798e-08, "loss": 0.0, "reward": 1.5833333432674408, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333358168602, "step": 364 }, { "completion_length": 452.2916793823242, "epoch": 0.3076923076923077, "grad_norm": 0.40422101797827664, "kl": 0.025787353515625, "learning_rate": 8.840893163098332e-08, "loss": 0.0, "reward": 1.6666666865348816, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666865348816, "step": 366 }, { "completion_length": 368.6666793823242, "epoch": 0.3093736864228668, "grad_norm": 0.7245593032700843, "kl": 0.0194549560546875, "learning_rate": 8.595129652335017e-08, "loss": 0.0, "reward": 1.7083334028720856, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.708333358168602, "step": 368 }, { "completion_length": 397.08334732055664, "epoch": 0.3110550651534258, "grad_norm": 0.8562497096589754, "kl": 0.0177154541015625, "learning_rate": 8.352119374707977e-08, "loss": 0.0, "reward": 1.7916666865348816, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7916666865348816, "step": 370 }, { "completion_length": 443.2083435058594, "epoch": 0.31273644388398486, "grad_norm": 0.43124200301124715, "kl": 0.020538330078125, "learning_rate": 8.11190311468759e-08, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666939854622, "step": 372 }, { "completion_length": 469.9583435058594, "epoch": 0.31441782261454393, "grad_norm": 0.6436306808211317, "kl": 0.076263427734375, "learning_rate": 7.87452118782363e-08, "loss": 0.0001, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333507180214, "step": 374 }, { "completion_length": 424.7083511352539, "epoch": 0.316099201345103, "grad_norm": 0.8624260825303681, "kl": 0.0174713134765625, "learning_rate": 7.640013433979093e-08, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666865348816, "step": 376 }, { "completion_length": 480.4583435058594, "epoch": 0.31778058007566207, "grad_norm": 0.4768870927728356, "kl": 0.019195556640625, "learning_rate": 7.408419210643846e-08, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.583333358168602, "step": 378 }, { "completion_length": 457.7083435058594, "epoch": 0.3194619588062211, "grad_norm": 0.7325139897043152, "kl": 0.0226287841796875, "learning_rate": 7.179777386329275e-08, "loss": 0.0, "reward": 1.7083333730697632, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.708333358168602, "step": 380 }, { "completion_length": 492.0416793823242, "epoch": 0.32114333753678015, "grad_norm": 0.6812039461038883, "kl": 0.0167388916015625, "learning_rate": 6.954126334044949e-08, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333432674408, "step": 382 }, { "completion_length": 427.75001525878906, "epoch": 0.3228247162673392, "grad_norm": 0.26789925678872634, "kl": 0.0200653076171875, "learning_rate": 6.731503924858516e-08, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333507180214, "step": 384 }, { "completion_length": 460.37500762939453, "epoch": 0.3245060949978983, "grad_norm": 0.4139795134217995, "kl": 0.0171051025390625, "learning_rate": 6.511947521539737e-08, "loss": 0.0, "reward": 1.8750000298023224, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.8750000298023224, "step": 386 }, { "completion_length": 401.37500762939453, "epoch": 0.32618747372845736, "grad_norm": 0.9550140447715619, "kl": 0.04052734375, "learning_rate": 6.295493972289903e-08, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5416666828095913, "step": 388 }, { "completion_length": 381.4583435058594, "epoch": 0.32786885245901637, "grad_norm": 0.8642430155063329, "kl": 0.018341064453125, "learning_rate": 6.082179604557616e-08, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333507180214, "step": 390 }, { "completion_length": 504.0000228881836, "epoch": 0.32955023118957544, "grad_norm": 0.45869091032068066, "kl": 0.0639801025390625, "learning_rate": 5.8720402189419286e-08, "loss": 0.0001, "reward": 1.5833333730697632, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333544433117, "step": 392 }, { "completion_length": 499.0833511352539, "epoch": 0.3312316099201345, "grad_norm": 0.001226330191092669, "kl": 0.0165252685546875, "learning_rate": 5.6651110831839046e-08, "loss": 0.0, "reward": 1.6666667461395264, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666865348816, "step": 394 }, { "completion_length": 416.79168701171875, "epoch": 0.3329129886506936, "grad_norm": 0.23769632812912322, "kl": 0.0198211669921875, "learning_rate": 5.461426926247639e-08, "loss": 0.0, "reward": 1.6250000298023224, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6250000149011612, "step": 396 }, { "completion_length": 510.0416793823242, "epoch": 0.33459436738125264, "grad_norm": 0.4301315282000898, "kl": 0.0159149169921875, "learning_rate": 5.261021932491713e-08, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5416666865348816, "step": 398 }, { "completion_length": 507.0000228881836, "epoch": 0.3362757461118117, "grad_norm": 0.2327539317664912, "kl": 0.016632080078125, "learning_rate": 5.0639297359319846e-08, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.583333358168602, "step": 400 }, { "completion_length": 398.75000762939453, "epoch": 0.3379571248423707, "grad_norm": 0.6603293560697683, "kl": 0.01715087890625, "learning_rate": 4.870183414596793e-08, "loss": 0.0, "reward": 1.7916666865348816, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7916666865348816, "step": 402 }, { "completion_length": 453.9166793823242, "epoch": 0.3396385035729298, "grad_norm": 0.357065731955349, "kl": 0.0204010009765625, "learning_rate": 4.679815484975505e-08, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5416666828095913, "step": 404 }, { "completion_length": 329.2916793823242, "epoch": 0.34131988230348886, "grad_norm": 0.6145494404346448, "kl": 0.208160400390625, "learning_rate": 4.492857896561203e-08, "loss": 0.0002, "reward": 1.5416666865348816, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.541666679084301, "step": 406 }, { "completion_length": 482.50001525878906, "epoch": 0.34300126103404793, "grad_norm": 0.58571987604954, "kl": 0.041656494140625, "learning_rate": 4.309342026488652e-08, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666865348816, "step": 408 }, { "completion_length": 441.2916717529297, "epoch": 0.344682639764607, "grad_norm": 1.06985487849842, "kl": 0.0160675048828125, "learning_rate": 4.1292986742682254e-08, "loss": 0.0, "reward": 1.6250000596046448, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6250000223517418, "step": 410 }, { "completion_length": 389.4166717529297, "epoch": 0.346364018495166, "grad_norm": 0.7270657915048854, "kl": 0.019683837890625, "learning_rate": 3.952758056616826e-08, "loss": 0.0, "reward": 1.5833334028720856, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333544433117, "step": 412 }, { "completion_length": 406.33333587646484, "epoch": 0.3480453972257251, "grad_norm": 0.27758659667364055, "kl": 0.0177001953125, "learning_rate": 3.7797498023866395e-08, "loss": 0.0, "reward": 1.8333333432674408, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.8333333432674408, "step": 414 }, { "completion_length": 488.58335876464844, "epoch": 0.34972677595628415, "grad_norm": 0.3438592498625851, "kl": 0.0125274658203125, "learning_rate": 3.6103029475924727e-08, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333507180214, "step": 416 }, { "completion_length": 467.9166717529297, "epoch": 0.3514081546868432, "grad_norm": 0.7068973819897363, "kl": 0.0189666748046875, "learning_rate": 3.4444459305386504e-08, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.6250000111758709, "step": 418 }, { "completion_length": 532.6666793823242, "epoch": 0.3530895334174023, "grad_norm": 0.30882726803388616, "kl": 0.013458251953125, "learning_rate": 3.2822065870462215e-08, "loss": 0.0, "reward": 1.7500000298023224, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7500000223517418, "step": 420 }, { "completion_length": 393.5416717529297, "epoch": 0.3547709121479613, "grad_norm": 0.5034685025489649, "kl": 0.020172119140625, "learning_rate": 3.1236121457812545e-08, "loss": 0.0, "reward": 1.7083333432674408, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7083333432674408, "step": 422 }, { "completion_length": 373.75000762939453, "epoch": 0.3564522908785204, "grad_norm": 0.7676592102825319, "kl": 0.021453857421875, "learning_rate": 2.9686892236850336e-08, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333432674408, "step": 424 }, { "completion_length": 386.0416793823242, "epoch": 0.35813366960907944, "grad_norm": 1.0126228541091091, "kl": 0.039794921875, "learning_rate": 2.817463821506949e-08, "loss": 0.0, "reward": 1.5416666865348816, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5416666716337204, "step": 426 }, { "completion_length": 414.1666793823242, "epoch": 0.3598150483396385, "grad_norm": 0.4701389090006604, "kl": 0.0238494873046875, "learning_rate": 2.6699613194407723e-08, "loss": 0.0, "reward": 1.6666666865348816, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666865348816, "step": 428 }, { "completion_length": 474.4166793823242, "epoch": 0.3614964270701976, "grad_norm": 0.5957470677297103, "kl": 0.019805908203125, "learning_rate": 2.5262064728651194e-08, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666865348816, "step": 430 }, { "completion_length": 438.8333435058594, "epoch": 0.36317780580075665, "grad_norm": 0.23260450142169511, "kl": 0.016265869140625, "learning_rate": 2.3862234081887033e-08, "loss": 0.0, "reward": 1.8333333432674408, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.8333333432674408, "step": 432 }, { "completion_length": 376.25000762939453, "epoch": 0.36485918453131566, "grad_norm": 0.4819133256066341, "kl": 0.02789306640625, "learning_rate": 2.250035618801241e-08, "loss": 0.0, "reward": 1.541666716337204, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5416666753590107, "step": 434 }, { "completion_length": 395.3333511352539, "epoch": 0.36654056326187473, "grad_norm": 0.4753927376230513, "kl": 0.0201263427734375, "learning_rate": 2.117665961130513e-08, "loss": 0.0, "reward": 1.791666716337204, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7916666865348816, "step": 436 }, { "completion_length": 480.2083511352539, "epoch": 0.3682219419924338, "grad_norm": 0.5069210021394791, "kl": 0.02008056640625, "learning_rate": 1.9891366508064e-08, "loss": 0.0, "reward": 1.6250000298023224, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.666666679084301, "step": 438 }, { "completion_length": 404.62500762939453, "epoch": 0.36990332072299287, "grad_norm": 0.5579175174593816, "kl": 0.0225830078125, "learning_rate": 1.8644692589323967e-08, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333544433117, "step": 440 }, { "completion_length": 450.7083435058594, "epoch": 0.37158469945355194, "grad_norm": 0.0011962781703181325, "kl": 0.022216796875, "learning_rate": 1.7436847084653456e-08, "loss": 0.0, "reward": 1.6666666865348816, "reward_std": 0.2357022576034069, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666716337204, "step": 442 }, { "completion_length": 403.62500762939453, "epoch": 0.37326607818411095, "grad_norm": 0.5038101762676807, "kl": 0.020782470703125, "learning_rate": 1.626803270703936e-08, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666865348816, "step": 444 }, { "completion_length": 305.6666717529297, "epoch": 0.37494745691467, "grad_norm": 1.1478452212927168, "kl": 0.02532958984375, "learning_rate": 1.513844561886554e-08, "loss": 0.0, "reward": 1.8333333730697632, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.8333333432674408, "step": 446 }, { "completion_length": 440.91668701171875, "epoch": 0.3766288356452291, "grad_norm": 0.5410906835431928, "kl": 0.025360107421875, "learning_rate": 1.4048275398990894e-08, "loss": 0.0, "reward": 1.5000000298023224, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5000000111758709, "step": 448 }, { "completion_length": 520.8333435058594, "epoch": 0.37831021437578816, "grad_norm": 0.6494890901957951, "kl": 0.0169219970703125, "learning_rate": 1.2997705010932391e-08, "loss": 0.0, "reward": 1.7083333730697632, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7083333432674408, "step": 450 }, { "completion_length": 504.0833435058594, "epoch": 0.3799915931063472, "grad_norm": 0.11376590313112087, "kl": 0.043548583984375, "learning_rate": 1.1986910772158105e-08, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.5000000074505806, "step": 452 }, { "completion_length": 436.7916717529297, "epoch": 0.38167297183690624, "grad_norm": 0.5652200449027903, "kl": 0.03338623046875, "learning_rate": 1.1016062324496007e-08, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333507180214, "step": 454 }, { "completion_length": 405.7083511352539, "epoch": 0.3833543505674653, "grad_norm": 0.43158182108263865, "kl": 0.0216064453125, "learning_rate": 1.0085322605662666e-08, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.3535533919930458, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7500000149011612, "step": 456 }, { "completion_length": 472.0833435058594, "epoch": 0.3850357292980244, "grad_norm": 0.717982397429151, "kl": 0.018798828125, "learning_rate": 9.194847821917623e-09, "loss": 0.0, "reward": 1.7083333730697632, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.708333358168602, "step": 458 }, { "completion_length": 433.37501525878906, "epoch": 0.38671710802858345, "grad_norm": 0.3929283618854557, "kl": 0.021942138671875, "learning_rate": 8.344787421847216e-09, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.5833333469927311, "step": 460 }, { "completion_length": 546.5833435058594, "epoch": 0.3883984867591425, "grad_norm": 0.36389040620504864, "kl": 0.018341064453125, "learning_rate": 7.535284071282455e-09, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666939854622, "step": 462 }, { "completion_length": 397.7916717529297, "epoch": 0.3900798654897016, "grad_norm": 0.4936781871754855, "kl": 0.0218505859375, "learning_rate": 6.766473629355452e-09, "loss": 0.0, "reward": 1.7916666865348816, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7916666865348816, "step": 464 }, { "completion_length": 449.37501525878906, "epoch": 0.3917612442202606, "grad_norm": 0.416937865685922, "kl": 0.0172119140625, "learning_rate": 6.038485125698295e-09, "loss": 0.0, "reward": 1.6250000298023224, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6250000074505806, "step": 466 }, { "completion_length": 445.41668701171875, "epoch": 0.39344262295081966, "grad_norm": 0.7420333816373434, "kl": 0.020751953125, "learning_rate": 5.3514407387877936e-09, "loss": 0.0, "reward": 1.6666666865348816, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666716337204, "step": 468 }, { "completion_length": 414.62501525878906, "epoch": 0.39512400168137873, "grad_norm": 0.0068766679961718095, "kl": 0.020599365234375, "learning_rate": 4.705455775440237e-09, "loss": 0.0, "reward": 1.7500000298023224, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7500000149011612, "step": 470 }, { "completion_length": 509.6666717529297, "epoch": 0.3968053804119378, "grad_norm": 0.5120985795788513, "kl": 0.049163818359375, "learning_rate": 4.100638651459542e-09, "loss": 0.0, "reward": 1.7083334028720856, "reward_std": 0.4124789573252201, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.708333358168602, "step": 472 }, { "completion_length": 416.00000762939453, "epoch": 0.39848675914249687, "grad_norm": 0.001027025835790879, "kl": 0.0158233642578125, "learning_rate": 3.5370908734417006e-09, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.0, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7500000149011612, "step": 474 }, { "completion_length": 397.25000762939453, "epoch": 0.4001681378730559, "grad_norm": 0.021320987422443205, "kl": 0.038848876953125, "learning_rate": 3.0149070217390106e-09, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.0589255653321743, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333395421505, "step": 476 }, { "completion_length": 429.4583511352539, "epoch": 0.40184951660361495, "grad_norm": 0.4002345142208856, "kl": 0.0300445556640625, "learning_rate": 2.5341747345865026e-09, "loss": 0.0, "reward": 1.7500000298023224, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7500000298023224, "step": 478 }, { "completion_length": 461.50001525878906, "epoch": 0.403530895334174, "grad_norm": 0.0029418687663408513, "kl": 0.021148681640625, "learning_rate": 2.094974693393731e-09, "loss": 0.0, "reward": 1.5833333730697632, "reward_std": 0.1178511306643486, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.583333358168602, "step": 480 }, { "completion_length": 464.3333435058594, "epoch": 0.4052122740647331, "grad_norm": 0.17330056984154224, "kl": 0.0167388916015625, "learning_rate": 1.6973806092038523e-09, "loss": 0.0, "reward": 1.7083333432674408, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7083333432674408, "step": 482 }, { "completion_length": 494.3333511352539, "epoch": 0.40689365279529216, "grad_norm": 0.6291866431405297, "kl": 0.01507568359375, "learning_rate": 1.3414592103228594e-09, "loss": 0.0, "reward": 1.7083333730697632, "reward_std": 0.1767766922712326, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7083333432674408, "step": 484 }, { "completion_length": 407.79168701171875, "epoch": 0.4085750315258512, "grad_norm": 0.7527510846206448, "kl": 0.02587890625, "learning_rate": 1.0272702311203695e-09, "loss": 0.0, "reward": 1.6250000298023224, "reward_std": 0.4124789535999298, "rewards/format_reward_func": 0.9583333432674408, "rewards/solution_reward_func": 0.6666666865348816, "step": 486 }, { "completion_length": 463.5000228881836, "epoch": 0.41025641025641024, "grad_norm": 0.5925315928631419, "kl": 0.020721435546875, "learning_rate": 7.548664020045059e-10, "loss": 0.0, "reward": 1.4583333730697632, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.4583333507180214, "step": 488 }, { "completion_length": 437.2083511352539, "epoch": 0.4119377889869693, "grad_norm": 0.7831672518645952, "kl": 0.034820556640625, "learning_rate": 5.242934405720878e-10, "loss": 0.0, "reward": 1.7500000596046448, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7500000298023224, "step": 490 }, { "completion_length": 444.3333435058594, "epoch": 0.4136191677175284, "grad_norm": 0.5783716082619003, "kl": 0.019439697265625, "learning_rate": 3.355900439359072e-10, "loss": 0.0, "reward": 1.7083333730697632, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7083333432674408, "step": 492 }, { "completion_length": 474.0833511352539, "epoch": 0.41530054644808745, "grad_norm": 0.5030728051783439, "kl": 0.02392578125, "learning_rate": 1.8878788223009035e-10, "loss": 0.0, "reward": 1.3750000298023224, "reward_std": 0.1767766959965229, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.3750000037252903, "step": 494 }, { "completion_length": 471.3333435058594, "epoch": 0.4169819251786465, "grad_norm": 0.40396130292565613, "kl": 0.02294921875, "learning_rate": 8.391159329496079e-11, "loss": 0.0, "reward": 1.791666716337204, "reward_std": 0.2946278266608715, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.7916667014360428, "step": 496 }, { "completion_length": 438.1666793823242, "epoch": 0.41866330390920553, "grad_norm": 0.4863347024264504, "kl": 0.01949310302734375, "learning_rate": 2.097877854204122e-11, "loss": 0.0, "reward": 1.666666716337204, "reward_std": 0.2357022613286972, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6666666939854622, "step": 498 }, { "completion_length": 391.7083435058594, "epoch": 0.4203446826397646, "grad_norm": 0.6480363403962828, "kl": 0.0219573974609375, "learning_rate": 0.0, "loss": 0.0, "reward": 1.6250000298023224, "reward_std": 0.2946278229355812, "rewards/format_reward_func": 1.0, "rewards/solution_reward_func": 0.6250000149011612, "step": 500 }, { "epoch": 0.4203446826397646, "step": 500, "total_flos": 0.0, "train_loss": 1.7704009043086445e-05, "train_runtime": 16459.3672, "train_samples_per_second": 0.182, "train_steps_per_second": 0.03 } ], "logging_steps": 2, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }