{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.747126436781609, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 267.875, "epoch": 0.022988505747126436, "grad_norm": 15489.8603515625, "kl": 15445.687591552734, "learning_rate": 3.3333333333333335e-07, "loss": 15.4457, "reward": 0.04048949736170471, "reward_std": 0.013678457878995687, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04048949759453535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 2 }, { "completion_length": 310.875, "epoch": 0.04597701149425287, "grad_norm": 20448.56640625, "kl": 22287.903568267822, "learning_rate": 1.0000000000000002e-06, "loss": 22.2879, "reward": 0.048436759621836245, "reward_std": 0.023734198708552867, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0484367590979673, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 4 }, { "completion_length": 454.625, "epoch": 0.06896551724137931, "grad_norm": 11425.4560546875, "kl": 8294.522334814072, "learning_rate": 1.6666666666666667e-06, "loss": 8.2945, "reward": 0.04298628534888849, "reward_std": 0.02207429221016355, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04298628534888849, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 6 }, { "completion_length": 243.0, "epoch": 0.09195402298850575, "grad_norm": 841.8639526367188, "kl": 1415.567174911499, "learning_rate": 2.3333333333333336e-06, "loss": 1.4156, "reward": 0.04227710422128439, "reward_std": 0.018735986901447177, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04227710410486907, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 8 }, { "completion_length": 514.75, "epoch": 0.11494252873563218, "grad_norm": 1855.7728271484375, "kl": 2456.106436252594, "learning_rate": 3e-06, "loss": 2.4561, "reward": 0.02142503301729448, "reward_std": 0.01620337264466798, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.02142503162031062, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 10 }, { "completion_length": 262.875, "epoch": 0.13793103448275862, "grad_norm": 567.3268432617188, "kl": 1198.1407985687256, "learning_rate": 3.6666666666666666e-06, "loss": 1.1981, "reward": 0.024914476030971855, "reward_std": 0.013680961128557101, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.024914475972764194, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 12 }, { "completion_length": 328.25, "epoch": 0.16091954022988506, "grad_norm": 136.10317993164062, "kl": 820.0500531196594, "learning_rate": 4.333333333333334e-06, "loss": 0.8201, "reward": 0.027978417929261923, "reward_std": 0.01527385163353756, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.02797841769643128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 14 }, { "completion_length": 295.25, "epoch": 0.1839080459770115, "grad_norm": 249.54904174804688, "kl": 157.7834677696228, "learning_rate": 5e-06, "loss": 0.1578, "reward": 0.024856400617863983, "reward_std": 0.014128750306554139, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.02095015096710995, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00390625, "step": 16 }, { "completion_length": 300.25, "epoch": 0.20689655172413793, "grad_norm": 123.20221710205078, "kl": 86.70643353462219, "learning_rate": 4.99979021221458e-06, "loss": 0.0867, "reward": 0.04340612230589613, "reward_std": 0.02084702106367331, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04340612271334976, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 18 }, { "completion_length": 312.75, "epoch": 0.22988505747126436, "grad_norm": 71.37357330322266, "kl": 121.41249084472656, "learning_rate": 4.999160884067051e-06, "loss": 0.1214, "reward": 0.041425400297157466, "reward_std": 0.030150338672683574, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.041425400937441736, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 20 }, { "completion_length": 209.75, "epoch": 0.25287356321839083, "grad_norm": 64.09546661376953, "kl": 57.147058844566345, "learning_rate": 4.9981121211777e-06, "loss": 0.0571, "reward": 0.05420591635629535, "reward_std": 0.02624344697687775, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.054205916239880025, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 22 }, { "completion_length": 173.125, "epoch": 0.27586206896551724, "grad_norm": 103.89892578125, "kl": 59.34820091724396, "learning_rate": 4.9966440995606415e-06, "loss": 0.0593, "reward": 0.03442433197051287, "reward_std": 0.012924832059070468, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03442433290183544, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 24 }, { "completion_length": 177.875, "epoch": 0.2988505747126437, "grad_norm": 67.56229400634766, "kl": 38.416051745414734, "learning_rate": 4.99475706559428e-06, "loss": 0.0384, "reward": 0.050014273379929364, "reward_std": 0.023354795324848965, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.050014272914268076, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 26 }, { "completion_length": 305.625, "epoch": 0.3218390804597701, "grad_norm": 74.08392333984375, "kl": 50.467414021492004, "learning_rate": 4.9924513359799555e-06, "loss": 0.0505, "reward": 0.037250664085149765, "reward_std": 0.02188962057698518, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.037250664783641696, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 28 }, { "completion_length": 266.5, "epoch": 0.3448275862068966, "grad_norm": 70.50927734375, "kl": 97.512868642807, "learning_rate": 4.989727297688797e-06, "loss": 0.0975, "reward": 0.03627986880019307, "reward_std": 0.017330561415292323, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03627986891660839, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 30 }, { "completion_length": 378.375, "epoch": 0.367816091954023, "grad_norm": 41.392112731933594, "kl": 71.54510736465454, "learning_rate": 4.9865854078967715e-06, "loss": 0.0715, "reward": 0.017296138452365994, "reward_std": 0.00997625885065645, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.01729613821953535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 32 }, { "completion_length": 280.375, "epoch": 0.39080459770114945, "grad_norm": 55.41687774658203, "kl": 60.01027548313141, "learning_rate": 4.983026193907962e-06, "loss": 0.06, "reward": 0.05314687779173255, "reward_std": 0.03268945781746879, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.053146878723055124, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 34 }, { "completion_length": 134.0, "epoch": 0.41379310344827586, "grad_norm": 101.97993469238281, "kl": 141.00524878501892, "learning_rate": 4.979050253066064e-06, "loss": 0.141, "reward": 0.025667070760391653, "reward_std": 0.010980784136336297, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.02566707052756101, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 36 }, { "completion_length": 170.375, "epoch": 0.4367816091954023, "grad_norm": 34.432926177978516, "kl": 45.302165031433105, "learning_rate": 4.974658252654135e-06, "loss": 0.0453, "reward": 0.044665039982646704, "reward_std": 0.020652918959967792, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.044665040099062026, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 38 }, { "completion_length": 140.5, "epoch": 0.45977011494252873, "grad_norm": 48.98166275024414, "kl": 44.7465535402298, "learning_rate": 4.96985092978261e-06, "loss": 0.0447, "reward": 0.05151920788921416, "reward_std": 0.02609842922538519, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05151920719072223, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 40 }, { "completion_length": 264.75, "epoch": 0.4827586206896552, "grad_norm": 50.67466735839844, "kl": 37.88811469078064, "learning_rate": 4.964629091265583e-06, "loss": 0.0379, "reward": 0.06031158403493464, "reward_std": 0.027984489977825433, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06031158100813627, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 42 }, { "completion_length": 310.25, "epoch": 0.5057471264367817, "grad_norm": 87.53923034667969, "kl": 73.59773737192154, "learning_rate": 4.958993613485406e-06, "loss": 0.0736, "reward": 0.036572819808498025, "reward_std": 0.020597186085069552, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03657282004132867, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 44 }, { "completion_length": 326.875, "epoch": 0.5287356321839081, "grad_norm": 67.48158264160156, "kl": 91.61651086807251, "learning_rate": 4.952945442245598e-06, "loss": 0.0916, "reward": 0.04992801428306848, "reward_std": 0.031167739973170683, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04602176556363702, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00390625, "step": 46 }, { "completion_length": 305.125, "epoch": 0.5517241379310345, "grad_norm": 22.411195755004883, "kl": 82.91102123260498, "learning_rate": 4.946485592612122e-06, "loss": 0.0829, "reward": 0.08468491910025477, "reward_std": 0.050579800736159086, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0846849181689322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 48 }, { "completion_length": 93.875, "epoch": 0.5747126436781609, "grad_norm": 65.9449234008789, "kl": 27.778719305992126, "learning_rate": 4.939615148743017e-06, "loss": 0.0278, "reward": 0.04651718633249402, "reward_std": 0.021823747898451984, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04651718633249402, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 50 }, { "completion_length": 352.25, "epoch": 0.5977011494252874, "grad_norm": 49.707332611083984, "kl": 106.14258408546448, "learning_rate": 4.932335263706446e-06, "loss": 0.1061, "reward": 0.06676553084980696, "reward_std": 0.03281888831406832, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06676553084980696, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 52 }, { "completion_length": 270.75, "epoch": 0.6206896551724138, "grad_norm": 48.479469299316406, "kl": 121.85716557502747, "learning_rate": 4.924647159287176e-06, "loss": 0.1219, "reward": 0.039502770407125354, "reward_std": 0.020723832567455247, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03950276947580278, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 54 }, { "completion_length": 453.0, "epoch": 0.6436781609195402, "grad_norm": 57.90752029418945, "kl": 75.638800740242, "learning_rate": 4.916552125781529e-06, "loss": 0.0756, "reward": 0.04748519998975098, "reward_std": 0.020864710793830454, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04748520057182759, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 56 }, { "completion_length": 135.25, "epoch": 0.6666666666666666, "grad_norm": 60.72783660888672, "kl": 97.29351049661636, "learning_rate": 4.908051521780824e-06, "loss": 0.0973, "reward": 0.03725634841248393, "reward_std": 0.018743265536613762, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.037256348645314574, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 58 }, { "completion_length": 172.75, "epoch": 0.6896551724137931, "grad_norm": 46.48042678833008, "kl": 38.82199025154114, "learning_rate": 4.899146773943374e-06, "loss": 0.0388, "reward": 0.03978487430140376, "reward_std": 0.0119002779974835, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.039784873370081186, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 60 }, { "completion_length": 191.625, "epoch": 0.7126436781609196, "grad_norm": 246.42416381835938, "kl": 249.49076426029205, "learning_rate": 4.889839376755041e-06, "loss": 0.2495, "reward": 0.01685239444486797, "reward_std": 0.007526621964643709, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.016852394095622003, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 62 }, { "completion_length": 299.25, "epoch": 0.735632183908046, "grad_norm": 31.09661865234375, "kl": 148.89759922027588, "learning_rate": 4.88013089227842e-06, "loss": 0.1489, "reward": 0.04315828834660351, "reward_std": 0.02518067165510729, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.043158287298865616, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 64 }, { "completion_length": 294.625, "epoch": 0.7586206896551724, "grad_norm": 205.9741973876953, "kl": 187.19354581832886, "learning_rate": 4.870022949890676e-06, "loss": 0.1872, "reward": 0.05923265521414578, "reward_std": 0.02364629483781755, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05923265707679093, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 66 }, { "completion_length": 134.25, "epoch": 0.7816091954022989, "grad_norm": 105.90323638916016, "kl": 51.5447211265564, "learning_rate": 4.8595172460100914e-06, "loss": 0.0515, "reward": 0.05770075123291463, "reward_std": 0.023957947007147595, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05770075030159205, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 68 }, { "completion_length": 374.25, "epoch": 0.8045977011494253, "grad_norm": 48.21788787841797, "kl": 66.45522928237915, "learning_rate": 4.8486155438113455e-06, "loss": 0.0665, "reward": 0.05807957309298217, "reward_std": 0.04090029839426279, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05417332355864346, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00390625, "step": 70 }, { "completion_length": 179.125, "epoch": 0.8275862068965517, "grad_norm": 964.2046508789062, "kl": 478.07892072200775, "learning_rate": 4.837319672929606e-06, "loss": 0.4781, "reward": 0.047723546042107046, "reward_std": 0.01890685805119574, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04772354627493769, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 72 }, { "completion_length": 98.375, "epoch": 0.8505747126436781, "grad_norm": 59.981346130371094, "kl": 38.217466592788696, "learning_rate": 4.825631529153466e-06, "loss": 0.0382, "reward": 0.0730273281224072, "reward_std": 0.0362726898456458, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07302732882089913, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 74 }, { "completion_length": 87.375, "epoch": 0.8735632183908046, "grad_norm": 34.10300827026367, "kl": 23.951123476028442, "learning_rate": 4.813553074106761e-06, "loss": 0.024, "reward": 0.04558887903112918, "reward_std": 0.02155381056945771, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04558887809980661, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 76 }, { "completion_length": 198.125, "epoch": 0.896551724137931, "grad_norm": 35.774898529052734, "kl": 83.67917680740356, "learning_rate": 4.8010863349193605e-06, "loss": 0.0837, "reward": 0.05536620563361794, "reward_std": 0.0264898365130648, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.055366206099279225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 78 }, { "completion_length": 78.0, "epoch": 0.9195402298850575, "grad_norm": 105.502197265625, "kl": 15.211400270462036, "learning_rate": 4.78823340388695e-06, "loss": 0.0152, "reward": 0.09725192002952099, "reward_std": 0.04003778542391956, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.09725191909819841, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 80 }, { "completion_length": 130.25, "epoch": 0.9425287356321839, "grad_norm": 86.3935317993164, "kl": 21.690183252096176, "learning_rate": 4.774996438119876e-06, "loss": 0.0217, "reward": 0.054088836535811424, "reward_std": 0.026792482065502554, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05408883560448885, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 82 }, { "completion_length": 185.5, "epoch": 0.9655172413793104, "grad_norm": 96.50428771972656, "kl": 33.65036225318909, "learning_rate": 4.76137765918113e-06, "loss": 0.0337, "reward": 0.0608408038970083, "reward_std": 0.026070831139804795, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06084080250002444, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 84 }, { "completion_length": 136.625, "epoch": 0.9885057471264368, "grad_norm": 219.9915008544922, "kl": 32.739950597286224, "learning_rate": 4.747379352713489e-06, "loss": 0.0327, "reward": 0.04385182715486735, "reward_std": 0.02458828000817448, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04385182727128267, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 86 }, { "completion_length": 139.375, "epoch": 1.0114942528735633, "grad_norm": 73.7523193359375, "kl": 22.321685910224915, "learning_rate": 4.733003868055923e-06, "loss": 0.0223, "reward": 0.060238878009840846, "reward_std": 0.022416275285650045, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.060238879290409386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 88 }, { "completion_length": 107.5, "epoch": 1.0344827586206897, "grad_norm": 121.1543960571289, "kl": 22.722941994667053, "learning_rate": 4.718253617849306e-06, "loss": 0.0227, "reward": 0.03226850915234536, "reward_std": 0.0161996342940256, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03226850915234536, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 90 }, { "completion_length": 97.0, "epoch": 1.0574712643678161, "grad_norm": 131.43910217285156, "kl": 27.77715367078781, "learning_rate": 4.703131077631498e-06, "loss": 0.0278, "reward": 0.03938274132087827, "reward_std": 0.026403514784760773, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03938274132087827, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 92 }, { "completion_length": 121.375, "epoch": 1.0804597701149425, "grad_norm": 198.93453979492188, "kl": 16.508545219898224, "learning_rate": 4.687638785421875e-06, "loss": 0.0165, "reward": 0.04298875690437853, "reward_std": 0.01768132916186005, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.042988755740225315, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 94 }, { "completion_length": 110.5, "epoch": 1.103448275862069, "grad_norm": 580.1483154296875, "kl": 69.74777483940125, "learning_rate": 4.671779341295378e-06, "loss": 0.0697, "reward": 0.04085242818109691, "reward_std": 0.022953280131332576, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04085242818109691, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 96 }, { "completion_length": 155.375, "epoch": 1.1264367816091954, "grad_norm": 35050408.0, "kl": 195909.09659838676, "learning_rate": 4.655555406946135e-06, "loss": 195.9091, "reward": 0.062153353122994304, "reward_std": 0.026913663954474032, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06215334951411933, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 98 }, { "completion_length": 95.375, "epoch": 1.1494252873563218, "grad_norm": 71.4871826171875, "kl": 14.925088226795197, "learning_rate": 4.6389697052407535e-06, "loss": 0.0149, "reward": 0.036937737138941884, "reward_std": 0.013131760118994862, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03693773760460317, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 100 }, { "completion_length": 130.5, "epoch": 1.1724137931034484, "grad_norm": 147012736.0, "kl": 2484037.983032584, "learning_rate": 4.622025019761336e-06, "loss": 2484.0376, "reward": 0.061055471654981375, "reward_std": 0.01597878709435463, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06105547118932009, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 102 }, { "completion_length": 111.25, "epoch": 1.1954022988505748, "grad_norm": 13565832.0, "kl": 160021.34017765522, "learning_rate": 4.604724194338318e-06, "loss": 160.0213, "reward": 0.06643283332232386, "reward_std": 0.03456435844418593, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06643283332232386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 104 }, { "completion_length": 118.875, "epoch": 1.2183908045977012, "grad_norm": 138.9129180908203, "kl": 1.8646669669807634e+23, "learning_rate": 4.587070132573178e-06, "loss": 1.8646670472011317e+20, "reward": 0.06637307826895267, "reward_std": 0.03601981734391302, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06637307826895267, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 106 }, { "completion_length": 96.625, "epoch": 1.2413793103448276, "grad_norm": 107.29772186279297, "kl": 30.86630117893219, "learning_rate": 4.569065797351135e-06, "loss": 0.0309, "reward": 0.06503278238233179, "reward_std": 0.018168910057283938, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06503278145100921, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 108 }, { "completion_length": 222.75, "epoch": 1.264367816091954, "grad_norm": 67.1069564819336, "kl": 23.613763451576233, "learning_rate": 4.550714210343879e-06, "loss": 0.0236, "reward": 0.0439565951237455, "reward_std": 0.024852770380675793, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04395659524016082, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 110 }, { "completion_length": 133.125, "epoch": 1.2873563218390804, "grad_norm": 60.596893310546875, "kl": 31.36600613594055, "learning_rate": 4.53201845150245e-06, "loss": 0.0314, "reward": 0.05209910683333874, "reward_std": 0.017306379333604127, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05209910683333874, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 112 }, { "completion_length": 92.375, "epoch": 1.3103448275862069, "grad_norm": 94.82451629638672, "kl": 20.518282294273376, "learning_rate": 4.512981658540321e-06, "loss": 0.0205, "reward": 0.07281980104744434, "reward_std": 0.032699283969122916, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07281980174593627, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 114 }, { "completion_length": 105.875, "epoch": 1.3333333333333333, "grad_norm": 85.29702758789062, "kl": 14.601945519447327, "learning_rate": 4.493607026406802e-06, "loss": 0.0146, "reward": 0.0672471143770963, "reward_std": 0.03380749194184318, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06724711460992694, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 116 }, { "completion_length": 112.875, "epoch": 1.3563218390804597, "grad_norm": 90.2031478881836, "kl": 51.02677869796753, "learning_rate": 4.473897806750829e-06, "loss": 0.051, "reward": 0.1283908288460225, "reward_std": 0.14795588632114232, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.0637658298946917, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0021250001154839993, "step": 118 }, { "completion_length": 84.125, "epoch": 1.3793103448275863, "grad_norm": 237.06651306152344, "kl": 13.766358494758606, "learning_rate": 4.4538573073752365e-06, "loss": 0.0138, "reward": 0.05376248573884368, "reward_std": 0.025461382640060037, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05376248504035175, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 120 }, { "completion_length": 212.875, "epoch": 1.4022988505747127, "grad_norm": 166.9183349609375, "kl": 33.45983535051346, "learning_rate": 4.4334888916816096e-06, "loss": 0.0335, "reward": 0.07159767742268741, "reward_std": 0.034634619660209864, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07159767649136484, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 122 }, { "completion_length": 81.875, "epoch": 1.4252873563218391, "grad_norm": 134.45152282714844, "kl": 6.959662973880768, "learning_rate": 4.412795978105807e-06, "loss": 0.007, "reward": 0.06391030387021601, "reward_std": 0.027350948890671134, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06391030142549425, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 124 }, { "completion_length": 197.875, "epoch": 1.4482758620689655, "grad_norm": 55.555442810058594, "kl": 19.926442325115204, "learning_rate": 4.391782039544239e-06, "loss": 0.0199, "reward": 0.035775873460806906, "reward_std": 0.010830878862179816, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.035775874042883515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 126 }, { "completion_length": 112.25, "epoch": 1.471264367816092, "grad_norm": 117.75221252441406, "kl": 17.477223455905914, "learning_rate": 4.37045060277101e-06, "loss": 0.0175, "reward": 0.06078207748942077, "reward_std": 0.021332335250917822, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06078207783866674, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 128 }, { "completion_length": 108.625, "epoch": 1.4942528735632183, "grad_norm": 85.71215057373047, "kl": 10.972188860177994, "learning_rate": 4.348805247846027e-06, "loss": 0.011, "reward": 0.07303046528249979, "reward_std": 0.0242302012629807, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07303046435117722, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 130 }, { "completion_length": 112.125, "epoch": 1.5172413793103448, "grad_norm": 170.75022888183594, "kl": 15.530431240797043, "learning_rate": 4.326849607514149e-06, "loss": 0.0155, "reward": 0.05910016072448343, "reward_std": 0.031024507101392373, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05910016049165279, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 132 }, { "completion_length": 132.5, "epoch": 1.5402298850574714, "grad_norm": 197.68955993652344, "kl": 12.310802519321442, "learning_rate": 4.304587366595506e-06, "loss": 0.0123, "reward": 0.030486367526464164, "reward_std": 0.010410786984721199, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03048636729363352, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 134 }, { "completion_length": 110.875, "epoch": 1.5632183908045976, "grad_norm": 192.08761596679688, "kl": 13.497075021266937, "learning_rate": 4.282022261367074e-06, "loss": 0.0135, "reward": 0.03929068963043392, "reward_std": 0.02567336882930249, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03929068963043392, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 136 }, { "completion_length": 152.0, "epoch": 1.5862068965517242, "grad_norm": 39.80803298950195, "kl": 20.505420178174973, "learning_rate": 4.259158078935616e-06, "loss": 0.0205, "reward": 0.08570016594603658, "reward_std": 0.032164638047106564, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08570016699377447, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 138 }, { "completion_length": 131.375, "epoch": 1.6091954022988506, "grad_norm": 231.2183837890625, "kl": 20.783460319042206, "learning_rate": 4.235998656602091e-06, "loss": 0.0208, "reward": 0.055525890085846186, "reward_std": 0.027581465081311762, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05552589148283005, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 140 }, { "completion_length": 105.75, "epoch": 1.632183908045977, "grad_norm": 95.99288940429688, "kl": 17.77563899755478, "learning_rate": 4.212547881217637e-06, "loss": 0.0178, "reward": 0.05321404663845897, "reward_std": 0.02031989657552913, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.053214047802612185, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 142 }, { "completion_length": 119.625, "epoch": 1.6551724137931034, "grad_norm": 169.1436004638672, "kl": 34.60399383306503, "learning_rate": 4.188809688531241e-06, "loss": 0.0346, "reward": 0.07038785656914115, "reward_std": 0.030746486561838537, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0703878568019718, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 144 }, { "completion_length": 116.625, "epoch": 1.6781609195402298, "grad_norm": 192.1599884033203, "kl": 16.762088894844055, "learning_rate": 4.164788062529203e-06, "loss": 0.0168, "reward": 0.04594091698527336, "reward_std": 0.01978331687860191, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.045940916636027396, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 146 }, { "completion_length": 180.375, "epoch": 1.7011494252873565, "grad_norm": 102.7906494140625, "kl": 30.857768535614014, "learning_rate": 4.140487034766499e-06, "loss": 0.0309, "reward": 0.09366934420540929, "reward_std": 0.036019391380250454, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.09366934606805444, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 148 }, { "completion_length": 176.5, "epoch": 1.7241379310344827, "grad_norm": 97.75148010253906, "kl": 53.549252450466156, "learning_rate": 4.115910683690167e-06, "loss": 0.0535, "reward": 0.05935717490501702, "reward_std": 0.026303998078219593, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05935717490501702, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 150 }, { "completion_length": 125.0, "epoch": 1.7471264367816093, "grad_norm": 202.91168212890625, "kl": 40.01495683193207, "learning_rate": 4.091063133954821e-06, "loss": 0.04, "reward": 0.08804582839366049, "reward_std": 0.024869057990144938, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08804582722950727, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 152 }, { "completion_length": 148.0, "epoch": 1.7701149425287355, "grad_norm": 155.28884887695312, "kl": 5.3842560165046596e+20, "learning_rate": 4.065948555730405e-06, "loss": 5.384256558014136e+17, "reward": 0.055865267117042094, "reward_std": 0.021044594555860385, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05586526804836467, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 154 }, { "completion_length": 230.75, "epoch": 1.793103448275862, "grad_norm": 109.9100570678711, "kl": 17.425525724887848, "learning_rate": 4.040571164002319e-06, "loss": 0.0174, "reward": 0.06862572557292879, "reward_std": 0.0240573805058375, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06862572743557394, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 156 }, { "completion_length": 163.375, "epoch": 1.8160919540229885, "grad_norm": 79.25603485107422, "kl": 156.42008650302887, "learning_rate": 4.014935217864009e-06, "loss": 0.1564, "reward": 0.059505124343559146, "reward_std": 0.03177501109894365, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05950512480922043, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 158 }, { "completion_length": 73.5, "epoch": 1.839080459770115, "grad_norm": 200.8464813232422, "kl": 6.349012166261673, "learning_rate": 3.989045019802171e-06, "loss": 0.0063, "reward": 0.09267536830157042, "reward_std": 0.024046986014582217, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.09267536457628012, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 160 }, { "completion_length": 109.875, "epoch": 1.8620689655172413, "grad_norm": 127.49809265136719, "kl": 18.7496437728405, "learning_rate": 3.962904914974656e-06, "loss": 0.0187, "reward": 0.07295416854321957, "reward_std": 0.029908461146987975, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07295416947454214, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 162 }, { "completion_length": 117.5, "epoch": 1.8850574712643677, "grad_norm": 201.69329833984375, "kl": 30.973371386528015, "learning_rate": 3.936519290481226e-06, "loss": 0.031, "reward": 0.08170855045318604, "reward_std": 0.03801235364517197, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08170854859054089, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 164 }, { "completion_length": 89.75, "epoch": 1.9080459770114944, "grad_norm": 174.39805603027344, "kl": 13.89745756983757, "learning_rate": 3.909892574627267e-06, "loss": 0.0139, "reward": 0.03322727954946458, "reward_std": 0.015495281608309597, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.033227279083803296, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 166 }, { "completion_length": 168.375, "epoch": 1.9310344827586206, "grad_norm": 224.63058471679688, "kl": 16.976139187812805, "learning_rate": 3.883029236180577e-06, "loss": 0.017, "reward": 0.049951824359595776, "reward_std": 0.018409932090435177, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.049951824359595776, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 168 }, { "completion_length": 107.875, "epoch": 1.9540229885057472, "grad_norm": 64.73926544189453, "kl": 10.746672213077545, "learning_rate": 3.855933783621384e-06, "loss": 0.0107, "reward": 0.08689862536266446, "reward_std": 0.028454654850065708, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08689862687606364, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 170 }, { "completion_length": 118.625, "epoch": 1.9770114942528736, "grad_norm": 192.43215942382812, "kl": 16.44640439748764, "learning_rate": 3.828610764385676e-06, "loss": 0.0164, "reward": 0.052876614150591195, "reward_std": 0.027988504065433517, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.052876610425300896, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 172 }, { "completion_length": 169.375, "epoch": 2.0, "grad_norm": 46.39739990234375, "kl": 37.05168032646179, "learning_rate": 3.8010647641020116e-06, "loss": 0.0371, "reward": 0.07344013964757323, "reward_std": 0.03448783012572676, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07344014011323452, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 174 }, { "completion_length": 132.125, "epoch": 2.0229885057471266, "grad_norm": 201.951904296875, "kl": 8447.514083087444, "learning_rate": 3.773300405821908e-06, "loss": 8.4475, "reward": 0.10326070059090853, "reward_std": 0.03630137036088854, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.10326070059090853, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 176 }, { "completion_length": 146.375, "epoch": 2.045977011494253, "grad_norm": 2640.992919921875, "kl": 787.9442739486694, "learning_rate": 3.7453223492439544e-06, "loss": 0.7879, "reward": 0.07708172407001257, "reward_std": 0.02127069642301649, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07708172407001257, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 178 }, { "completion_length": 99.0, "epoch": 2.0689655172413794, "grad_norm": 21345.66796875, "kl": 4659.93187391758, "learning_rate": 3.7171352899317743e-06, "loss": 4.6599, "reward": 0.08051675953902304, "reward_std": 0.03442754491697997, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08051676000468433, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 180 }, { "completion_length": 98.0, "epoch": 2.0919540229885056, "grad_norm": 245.73484802246094, "kl": 1256.7416229844093, "learning_rate": 3.6887439585259693e-06, "loss": 1.2567, "reward": 0.03673222055658698, "reward_std": 0.019385109655559063, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03673222032375634, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 182 }, { "completion_length": 81.0, "epoch": 2.1149425287356323, "grad_norm": 207.97874450683594, "kl": 95.9972143471241, "learning_rate": 3.6601531199501715e-06, "loss": 0.096, "reward": 0.056939898524433374, "reward_std": 0.021644275984726846, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05693989899009466, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 184 }, { "completion_length": 120.75, "epoch": 2.1379310344827585, "grad_norm": 133.63221740722656, "kl": 16.09234130382538, "learning_rate": 3.631367572611348e-06, "loss": 0.0161, "reward": 0.03902807948179543, "reward_std": 0.02079846995184198, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.039028079714626074, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 186 }, { "completion_length": 84.125, "epoch": 2.160919540229885, "grad_norm": 115.07598876953125, "kl": 186.13299444317818, "learning_rate": 3.6023921475944795e-06, "loss": 0.1861, "reward": 0.07693583145737648, "reward_std": 0.03298727044602856, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07693583308719099, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 188 }, { "completion_length": 107.75, "epoch": 2.1839080459770113, "grad_norm": 279.7792663574219, "kl": 71.62854248285294, "learning_rate": 3.573231707851765e-06, "loss": 0.0716, "reward": 0.048047779011540115, "reward_std": 0.015656847856007516, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04804777691606432, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 190 }, { "completion_length": 142.875, "epoch": 2.206896551724138, "grad_norm": 206.19859313964844, "kl": 72.4559788107872, "learning_rate": 3.5438911473864633e-06, "loss": 0.0725, "reward": 0.07531816582195461, "reward_std": 0.025376251578563824, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07531816302798688, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 192 }, { "completion_length": 148.0, "epoch": 2.2298850574712645, "grad_norm": 223.21945190429688, "kl": 40.88407605886459, "learning_rate": 3.514375390431539e-06, "loss": 0.0409, "reward": 0.04652288625948131, "reward_std": 0.016751730465330184, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.046522887190803885, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 194 }, { "completion_length": 94.125, "epoch": 2.2528735632183907, "grad_norm": 267.50927734375, "kl": 12.346384435892105, "learning_rate": 3.484689390623218e-06, "loss": 0.0123, "reward": 0.039031822117976844, "reward_std": 0.0252868261304684, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03903182235080749, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 196 }, { "completion_length": 92.125, "epoch": 2.2758620689655173, "grad_norm": 333.7870178222656, "kl": 157.60900411009789, "learning_rate": 3.4548381301696298e-06, "loss": 0.1576, "reward": 0.0320946826832369, "reward_std": 0.016103634203318506, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03209468245040625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 198 }, { "completion_length": 112.25, "epoch": 2.2988505747126435, "grad_norm": 151.12559509277344, "kl": 47.55213111639023, "learning_rate": 3.4248266190146307e-06, "loss": 0.0476, "reward": 0.0635837217560038, "reward_std": 0.02406485439860262, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06358372524846345, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 200 }, { "completion_length": 157.25, "epoch": 2.32183908045977, "grad_norm": 140.63108825683594, "kl": 47.00842010974884, "learning_rate": 3.39465989399699e-06, "loss": 0.047, "reward": 0.04995789797976613, "reward_std": 0.01773734952439554, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04995789751410484, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 202 }, { "completion_length": 112.25, "epoch": 2.344827586206897, "grad_norm": 206.70672607421875, "kl": 12.584306180477142, "learning_rate": 3.3643430180050573e-06, "loss": 0.0126, "reward": 0.05040284153074026, "reward_std": 0.019176787842297927, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05040284153074026, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 204 }, { "completion_length": 103.5, "epoch": 2.367816091954023, "grad_norm": 310.7558898925781, "kl": 6.195760190486908, "learning_rate": 3.333881079127052e-06, "loss": 0.0062, "reward": 0.06571804382838309, "reward_std": 0.02076091495109722, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06571804569102824, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 206 }, { "completion_length": 115.625, "epoch": 2.3908045977011496, "grad_norm": 89.87789916992188, "kl": 36.98254406452179, "learning_rate": 3.3032791897971313e-06, "loss": 0.037, "reward": 0.09712414210662246, "reward_std": 0.03503445046953857, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.09712414362002164, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 208 }, { "completion_length": 90.5, "epoch": 2.413793103448276, "grad_norm": 338.2508544921875, "kl": 12.830681920051575, "learning_rate": 3.272542485937369e-06, "loss": 0.0128, "reward": 0.0636495256330818, "reward_std": 0.02789511193986982, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06364952633157372, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 210 }, { "completion_length": 137.625, "epoch": 2.4367816091954024, "grad_norm": 227.99618530273438, "kl": 9.094072937965393, "learning_rate": 3.2416761260957925e-06, "loss": 0.0091, "reward": 0.047978554968722165, "reward_std": 0.023690548143349588, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04797855520155281, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 212 }, { "completion_length": 102.25, "epoch": 2.4597701149425286, "grad_norm": 154.36444091796875, "kl": 8.042297959327698, "learning_rate": 3.210685290580622e-06, "loss": 0.008, "reward": 0.0656727987807244, "reward_std": 0.024344642588403076, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06567279691807926, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 214 }, { "completion_length": 132.375, "epoch": 2.4827586206896552, "grad_norm": 153.6240234375, "kl": 10.046676218509674, "learning_rate": 3.1795751805908578e-06, "loss": 0.01, "reward": 0.044585506431758404, "reward_std": 0.01837193698156625, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.044585505966097116, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 216 }, { "completion_length": 97.5, "epoch": 2.5057471264367814, "grad_norm": 386.7044372558594, "kl": 72.63909751176834, "learning_rate": 3.148351017343363e-06, "loss": 0.0726, "reward": 0.06864605797454715, "reward_std": 0.027401213883422315, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.068646056111902, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 218 }, { "completion_length": 92.375, "epoch": 2.528735632183908, "grad_norm": 143.78675842285156, "kl": 23.726705104112625, "learning_rate": 3.1170180411965854e-06, "loss": 0.0237, "reward": 0.07314756175037473, "reward_std": 0.028583430394064635, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07314756221603602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 220 }, { "completion_length": 106.125, "epoch": 2.5517241379310347, "grad_norm": 116.3728256225586, "kl": 33.270900279283524, "learning_rate": 3.085581510771067e-06, "loss": 0.0333, "reward": 0.08058315946254879, "reward_std": 0.03271555982064456, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08058315794914961, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 222 }, { "completion_length": 150.375, "epoch": 2.574712643678161, "grad_norm": 245.00050354003906, "kl": 115.33485245704651, "learning_rate": 3.054046702066886e-06, "loss": 0.1153, "reward": 0.05190867045894265, "reward_std": 0.027783217956312, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0519086685962975, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 224 }, { "completion_length": 136.25, "epoch": 2.5977011494252875, "grad_norm": 185.83450317382812, "kl": 55.681486785411835, "learning_rate": 3.0224189075781886e-06, "loss": 0.0557, "reward": 0.07028337975498289, "reward_std": 0.03400374006014317, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07028337789233774, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 226 }, { "completion_length": 112.0, "epoch": 2.6206896551724137, "grad_norm": 442.36517333984375, "kl": 19.83175164461136, "learning_rate": 2.9907034354049443e-06, "loss": 0.0198, "reward": 0.062314021633937955, "reward_std": 0.02705212461296469, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06231402256526053, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 228 }, { "completion_length": 125.125, "epoch": 2.6436781609195403, "grad_norm": 143.57456970214844, "kl": 10.839111626148224, "learning_rate": 2.9589056083620902e-06, "loss": 0.0108, "reward": 0.09795380663126707, "reward_std": 0.030974397202953696, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.09795380244031549, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 230 }, { "completion_length": 107.125, "epoch": 2.6666666666666665, "grad_norm": 40835.8125, "kl": 7232.723676383495, "learning_rate": 2.927030763086201e-06, "loss": 7.2327, "reward": 0.07253995933569968, "reward_std": 0.02931177685968578, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07253995933569968, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 232 }, { "completion_length": 71.125, "epoch": 2.689655172413793, "grad_norm": 203.29135131835938, "kl": 254.6603969335556, "learning_rate": 2.8950842491398358e-06, "loss": 0.2547, "reward": 0.05340645229443908, "reward_std": 0.028615490067750216, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.053406450897455215, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 234 }, { "completion_length": 142.375, "epoch": 2.7126436781609193, "grad_norm": 234.6038818359375, "kl": 55.68269056081772, "learning_rate": 2.8630714281137263e-06, "loss": 0.0557, "reward": 0.06278015091083944, "reward_std": 0.02509829483460635, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06278014997951686, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 236 }, { "completion_length": 174.0, "epoch": 2.735632183908046, "grad_norm": 190.62289428710938, "kl": 11.476280808448792, "learning_rate": 2.8309976727269335e-06, "loss": 0.0115, "reward": 0.07279695780016482, "reward_std": 0.01803179772105068, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0727969582658261, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 238 }, { "completion_length": 134.875, "epoch": 2.7586206896551726, "grad_norm": 194.29698181152344, "kl": 18.73045128583908, "learning_rate": 2.7988683659251475e-06, "loss": 0.0187, "reward": 0.10095465055201203, "reward_std": 0.03450235276250169, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.1009546504355967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 240 }, { "completion_length": 160.5, "epoch": 2.781609195402299, "grad_norm": 159.08277893066406, "kl": 17.671808183193207, "learning_rate": 2.766688899977266e-06, "loss": 0.0177, "reward": 0.07221356220543385, "reward_std": 0.02001258631935343, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07221355871297419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 242 }, { "completion_length": 184.0, "epoch": 2.8045977011494254, "grad_norm": 4865.28662109375, "kl": 1058.6769651770592, "learning_rate": 2.7344646755704078e-06, "loss": 1.0587, "reward": 0.07434326456859708, "reward_std": 0.032324450148735195, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07434326503425837, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 244 }, { "completion_length": 96.375, "epoch": 2.8275862068965516, "grad_norm": 520.932373046875, "kl": 126.8150297999382, "learning_rate": 2.702201100903511e-06, "loss": 0.1268, "reward": 0.08822247432544827, "reward_std": 0.0427072363672778, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08822247222997248, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 246 }, { "completion_length": 94.625, "epoch": 2.8505747126436782, "grad_norm": 215.631103515625, "kl": 73.34332245588303, "learning_rate": 2.6699035907796796e-06, "loss": 0.0733, "reward": 0.08415804628748447, "reward_std": 0.032036138960393146, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08415804768446833, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 248 }, { "completion_length": 130.25, "epoch": 2.873563218390805, "grad_norm": 732.016845703125, "kl": 367.30382680892944, "learning_rate": 2.6375775656974124e-06, "loss": 0.3673, "reward": 0.03703907027374953, "reward_std": 0.017379111668560654, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03703907015733421, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 250 }, { "completion_length": 94.5, "epoch": 2.896551724137931, "grad_norm": 180.62460327148438, "kl": 27.182391583919525, "learning_rate": 2.6052284509408805e-06, "loss": 0.0272, "reward": 0.05479187308810651, "reward_std": 0.015454134321771562, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05479187145829201, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 252 }, { "completion_length": 112.25, "epoch": 2.9195402298850572, "grad_norm": 277.4781799316406, "kl": 110.91784703731537, "learning_rate": 2.5728616756693995e-06, "loss": 0.1109, "reward": 0.051236634608358145, "reward_std": 0.01320132811088115, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.051236633444204926, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 254 }, { "completion_length": 118.125, "epoch": 2.942528735632184, "grad_norm": 146.3416748046875, "kl": 195.90486681461334, "learning_rate": 2.5404826720062544e-06, "loss": 0.1959, "reward": 0.08834501937963068, "reward_std": 0.03774441118002869, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08834501937963068, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 256 }, { "completion_length": 113.125, "epoch": 2.9655172413793105, "grad_norm": 66.8388671875, "kl": 7.33636736869812, "learning_rate": 2.5080968741270224e-06, "loss": 0.0073, "reward": 0.08513955981470644, "reward_std": 0.02878198877442628, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08123330981470644, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00390625, "step": 258 }, { "completion_length": 70.75, "epoch": 2.9885057471264367, "grad_norm": 170.7029571533203, "kl": 47.25964882969856, "learning_rate": 2.4757097173475574e-06, "loss": 0.0473, "reward": 0.07585499668493867, "reward_std": 0.02281574090011418, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07585499645210803, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 260 }, { "completion_length": 162.25, "epoch": 3.0114942528735633, "grad_norm": 145.7571258544922, "kl": 22.939729869365692, "learning_rate": 2.4433266372117755e-06, "loss": 0.0229, "reward": 0.06590291450265795, "reward_std": 0.030166592623572797, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0659029142698273, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 262 }, { "completion_length": 106.75, "epoch": 3.0344827586206895, "grad_norm": 249.9431610107422, "kl": 10.407011151313782, "learning_rate": 2.410953068579411e-06, "loss": 0.0104, "reward": 0.01894011115655303, "reward_std": 0.009890818473650143, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.018940110923722386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 264 }, { "completion_length": 135.25, "epoch": 3.057471264367816, "grad_norm": 312.3399353027344, "kl": 14.255246341228485, "learning_rate": 2.3785944447138804e-06, "loss": 0.0143, "reward": 0.06780989549588412, "reward_std": 0.02473576454212889, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06780989456456155, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 266 }, { "completion_length": 92.125, "epoch": 3.0804597701149423, "grad_norm": 218.4748077392578, "kl": 9.486905694007874, "learning_rate": 2.3462561963704132e-06, "loss": 0.0095, "reward": 0.035537787596695125, "reward_std": 0.013097296468913555, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.035537788295187056, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 268 }, { "completion_length": 120.25, "epoch": 3.103448275862069, "grad_norm": 214.8455047607422, "kl": 35.78847563266754, "learning_rate": 2.3139437508846155e-06, "loss": 0.0358, "reward": 0.08255981374531984, "reward_std": 0.03346848307410255, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0825598118826747, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 270 }, { "completion_length": 87.125, "epoch": 3.1264367816091956, "grad_norm": 358.0038757324219, "kl": 58.21384912729263, "learning_rate": 2.2816625312615903e-06, "loss": 0.0582, "reward": 0.039858372998423874, "reward_std": 0.0139598089735955, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.039858372998423874, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 272 }, { "completion_length": 108.0, "epoch": 3.1494252873563218, "grad_norm": 2689.757568359375, "kl": 561.3155448436737, "learning_rate": 2.2494179552657977e-06, "loss": 0.5613, "reward": 0.08695052471011877, "reward_std": 0.030587425106205046, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0869505216833204, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 274 }, { "completion_length": 119.25, "epoch": 3.1724137931034484, "grad_norm": 156.3776092529297, "kl": 13.577105820178986, "learning_rate": 2.2172154345117896e-06, "loss": 0.0136, "reward": 0.05204900773242116, "reward_std": 0.02707206120248884, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.052049006801098585, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 276 }, { "completion_length": 186.5, "epoch": 3.1954022988505746, "grad_norm": 253.58987426757812, "kl": 60.839145839214325, "learning_rate": 2.185060373555978e-06, "loss": 0.0608, "reward": 0.06129825604148209, "reward_std": 0.018924910807982087, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06129825720563531, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 278 }, { "completion_length": 118.75, "epoch": 3.218390804597701, "grad_norm": 174.3742218017578, "kl": 59.040690660476685, "learning_rate": 2.1529581689895838e-06, "loss": 0.059, "reward": 0.055515452404506505, "reward_std": 0.02473576396005228, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.055515452404506505, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 280 }, { "completion_length": 114.125, "epoch": 3.2413793103448274, "grad_norm": 699.6782836914062, "kl": 130.81276640295982, "learning_rate": 2.12091420853293e-06, "loss": 0.1308, "reward": 0.060602283803746104, "reward_std": 0.02057587681338191, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06060228275600821, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 282 }, { "completion_length": 241.25, "epoch": 3.264367816091954, "grad_norm": 645.268798828125, "kl": 369.306822180748, "learning_rate": 2.0889338701312184e-06, "loss": 0.3693, "reward": 0.08422980736941099, "reward_std": 0.04090468987124041, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08422980806790292, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 284 }, { "completion_length": 135.625, "epoch": 3.2873563218390807, "grad_norm": 214.42164611816406, "kl": 49.46466279029846, "learning_rate": 2.0570225210519433e-06, "loss": 0.0495, "reward": 0.06836231634952128, "reward_std": 0.022511750808916986, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06836231704801321, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 286 }, { "completion_length": 114.875, "epoch": 3.310344827586207, "grad_norm": 143.37962341308594, "kl": 13.709496915340424, "learning_rate": 2.025185516984108e-06, "loss": 0.0137, "reward": 0.07179693877696991, "reward_std": 0.029636676074005663, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0717969371471554, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 288 }, { "completion_length": 104.625, "epoch": 3.3333333333333335, "grad_norm": 213.12794494628906, "kl": 33.390194058418274, "learning_rate": 1.993428201139375e-06, "loss": 0.0334, "reward": 0.07383693801239133, "reward_std": 0.032142720418050885, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07383694080635905, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 290 }, { "completion_length": 126.125, "epoch": 3.3563218390804597, "grad_norm": 362.9014587402344, "kl": 124.35960441827774, "learning_rate": 1.9617559033553128e-06, "loss": 0.1244, "reward": 0.04353773477487266, "reward_std": 0.017961862351512536, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04353773477487266, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 292 }, { "completion_length": 149.5, "epoch": 3.3793103448275863, "grad_norm": 74.9876937866211, "kl": 107.57100534439087, "learning_rate": 1.9301739392008923e-06, "loss": 0.1076, "reward": 0.06804983934853226, "reward_std": 0.024156989762559533, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.068049838533625, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 294 }, { "completion_length": 109.5, "epoch": 3.4022988505747125, "grad_norm": 235.3528594970703, "kl": 8.414616346359253, "learning_rate": 1.8986876090843668e-06, "loss": 0.0084, "reward": 0.06046756589785218, "reward_std": 0.01879029450356029, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06046756776049733, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 296 }, { "completion_length": 128.25, "epoch": 3.425287356321839, "grad_norm": 107.53013610839844, "kl": 56.530424654483795, "learning_rate": 1.8673021973637095e-06, "loss": 0.0565, "reward": 0.07842924515716732, "reward_std": 0.02797157474560663, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07842924515716732, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 298 }, { "completion_length": 132.0, "epoch": 3.4482758620689653, "grad_norm": 473.3606872558594, "kl": 135.01850801706314, "learning_rate": 1.8360229714597372e-06, "loss": 0.135, "reward": 0.1066941770259291, "reward_std": 0.04587837855797261, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.10669417725875974, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 300 }, { "completion_length": 128.875, "epoch": 3.471264367816092, "grad_norm": 192.39108276367188, "kl": 46.490090012550354, "learning_rate": 1.8048551809720752e-06, "loss": 0.0465, "reward": 0.043134392937645316, "reward_std": 0.0180649196554441, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04313439084216952, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 302 }, { "completion_length": 96.875, "epoch": 3.4942528735632186, "grad_norm": 334.9054870605469, "kl": 11.145173788070679, "learning_rate": 1.7738040567981168e-06, "loss": 0.0111, "reward": 0.07762327045202255, "reward_std": 0.026855482487007976, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0776232706848532, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 304 }, { "completion_length": 146.625, "epoch": 3.5172413793103448, "grad_norm": 9220.5947265625, "kl": 1398.0779232382774, "learning_rate": 1.7428748102551237e-06, "loss": 1.3981, "reward": 0.06737202336080372, "reward_std": 0.020121814828598872, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06737202499061823, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 306 }, { "completion_length": 110.375, "epoch": 3.5402298850574714, "grad_norm": 163.687744140625, "kl": 65.13996613025665, "learning_rate": 1.7120726322056042e-06, "loss": 0.0651, "reward": 0.041452419478446245, "reward_std": 0.020625172619475052, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04145242040976882, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 308 }, { "completion_length": 125.625, "epoch": 3.5632183908045976, "grad_norm": 123.7718505859375, "kl": 367.19069772958755, "learning_rate": 1.6814026921861337e-06, "loss": 0.3672, "reward": 0.044517830247059464, "reward_std": 0.015566983609460294, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04451783129479736, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 310 }, { "completion_length": 104.875, "epoch": 3.586206896551724, "grad_norm": 6899.99755859375, "kl": 814.7017561793327, "learning_rate": 1.6508701375397488e-06, "loss": 0.8147, "reward": 0.06519478128757328, "reward_std": 0.02635832253145054, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06519478117115796, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 312 }, { "completion_length": 80.125, "epoch": 3.609195402298851, "grad_norm": 206556.53125, "kl": 30569.942191660404, "learning_rate": 1.6204800925520685e-06, "loss": 30.5699, "reward": 0.06730342446826398, "reward_std": 0.01795649208361283, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06730342155788094, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 314 }, { "completion_length": 156.25, "epoch": 3.632183908045977, "grad_norm": 84.7294692993164, "kl": 208.38328206539154, "learning_rate": 1.5902376575912815e-06, "loss": 0.2084, "reward": 0.03716867312323302, "reward_std": 0.01707278488902375, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.037168671493418515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 316 }, { "completion_length": 117.25, "epoch": 3.655172413793103, "grad_norm": 296.3588562011719, "kl": 130.7706337571144, "learning_rate": 1.5601479082521526e-06, "loss": 0.1308, "reward": 0.0922857525292784, "reward_std": 0.030893770221155137, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.09228575078304857, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 318 }, { "completion_length": 138.375, "epoch": 3.67816091954023, "grad_norm": 93.64472961425781, "kl": 473.6780469417572, "learning_rate": 1.530215894504184e-06, "loss": 0.4737, "reward": 0.060322433011606336, "reward_std": 0.022636316833086312, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06032243208028376, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 320 }, { "completion_length": 110.375, "epoch": 3.7011494252873565, "grad_norm": 182.2461395263672, "kl": 17.96676391363144, "learning_rate": 1.5004466398440776e-06, "loss": 0.018, "reward": 0.08078207075595856, "reward_std": 0.02943520125700161, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08078207029029727, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 322 }, { "completion_length": 128.625, "epoch": 3.7241379310344827, "grad_norm": 134.72080993652344, "kl": 443.86798733472824, "learning_rate": 1.4708451404526409e-06, "loss": 0.4439, "reward": 0.05561051343102008, "reward_std": 0.02243916515726596, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.055610513663850725, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 324 }, { "completion_length": 124.0, "epoch": 3.7471264367816093, "grad_norm": 146.9352264404297, "kl": 73.07299399375916, "learning_rate": 1.4414163643562755e-06, "loss": 0.0731, "reward": 0.09513574035372585, "reward_std": 0.02849674120079726, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.09513574140146375, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 326 }, { "completion_length": 178.125, "epoch": 3.7701149425287355, "grad_norm": 277.24066162109375, "kl": 49.67561674118042, "learning_rate": 1.4121652505931922e-06, "loss": 0.0497, "reward": 0.06086232408415526, "reward_std": 0.028575259959325194, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06086232396773994, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 328 }, { "completion_length": 129.625, "epoch": 3.793103448275862, "grad_norm": 226.15628051757812, "kl": 6.218357235193253, "learning_rate": 1.3830967083844944e-06, "loss": 0.0062, "reward": 0.056405802723020315, "reward_std": 0.016661703935824335, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05640580435283482, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 330 }, { "completion_length": 95.5, "epoch": 3.8160919540229887, "grad_norm": 197.53431701660156, "kl": 7.846427142620087, "learning_rate": 1.3542156163102582e-06, "loss": 0.0078, "reward": 0.0798245519399643, "reward_std": 0.029496680363081396, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07982455100864172, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 332 }, { "completion_length": 90.375, "epoch": 3.839080459770115, "grad_norm": 214.06617736816406, "kl": 5.131113618612289, "learning_rate": 1.3255268214907612e-06, "loss": 0.0051, "reward": 0.11855790950357914, "reward_std": 0.036440703843254596, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.11855790857225657, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 334 }, { "completion_length": 96.875, "epoch": 3.862068965517241, "grad_norm": 236.0201416015625, "kl": 20.320499658584595, "learning_rate": 1.2970351387729875e-06, "loss": 0.0203, "reward": 0.048535656183958054, "reward_std": 0.014753967989236116, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04853565595112741, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 336 }, { "completion_length": 126.125, "epoch": 3.8850574712643677, "grad_norm": 203.66561889648438, "kl": 8.298109591007233, "learning_rate": 1.2687453499225547e-06, "loss": 0.0083, "reward": 0.06008766824379563, "reward_std": 0.026022593956440687, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06008766917511821, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 338 }, { "completion_length": 129.125, "epoch": 3.9080459770114944, "grad_norm": 147.83627319335938, "kl": 43.150528728961945, "learning_rate": 1.2406622028211846e-06, "loss": 0.0432, "reward": 0.0663972299080342, "reward_std": 0.02249395562103018, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06639722804538906, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 340 }, { "completion_length": 103.375, "epoch": 3.9310344827586206, "grad_norm": 142.52777099609375, "kl": 11.885593444108963, "learning_rate": 1.2127904106698665e-06, "loss": 0.0119, "reward": 0.06890858267433941, "reward_std": 0.02730451332172379, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06890858220867813, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 342 }, { "completion_length": 123.375, "epoch": 3.954022988505747, "grad_norm": 304.3057556152344, "kl": 24.875199258327484, "learning_rate": 1.1851346511978427e-06, "loss": 0.0249, "reward": 0.07832225598394871, "reward_std": 0.036686282022856176, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07832225388847291, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 344 }, { "completion_length": 120.0, "epoch": 3.9770114942528734, "grad_norm": 256.60498046875, "kl": 48.080951035022736, "learning_rate": 1.1576995658775405e-06, "loss": 0.0481, "reward": 0.06934695993550122, "reward_std": 0.019299608509754762, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06934695795644075, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 346 }, { "completion_length": 140.625, "epoch": 4.0, "grad_norm": 135.9924774169922, "kl": 28.16669899225235, "learning_rate": 1.130489759145593e-06, "loss": 0.0282, "reward": 0.0693545863032341, "reward_std": 0.03148585665621795, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06935458816587925, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 348 }, { "completion_length": 119.75, "epoch": 4.022988505747127, "grad_norm": 160.60543823242188, "kl": 53.947971284389496, "learning_rate": 1.103509797630077e-06, "loss": 0.0539, "reward": 0.09043259941972792, "reward_std": 0.03596131969243288, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.09043260151520371, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 350 }, { "completion_length": 125.75, "epoch": 4.045977011494253, "grad_norm": 234.570556640625, "kl": 16.265253007411957, "learning_rate": 1.0767642093840933e-06, "loss": 0.0163, "reward": 0.07576032588258386, "reward_std": 0.03264770592795685, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07576032564975321, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 352 }, { "completion_length": 112.375, "epoch": 4.068965517241379, "grad_norm": 303.1659851074219, "kl": 21.291000366210938, "learning_rate": 1.0502574831258259e-06, "loss": 0.0213, "reward": 0.0404164819046855, "reward_std": 0.016381950030336156, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04041648283600807, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 354 }, { "completion_length": 94.375, "epoch": 4.091954022988506, "grad_norm": 100.47554016113281, "kl": 38.7614666223526, "learning_rate": 1.0239940674851943e-06, "loss": 0.0388, "reward": 0.06777264247648418, "reward_std": 0.020705469651147723, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06777264038100839, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 356 }, { "completion_length": 223.25, "epoch": 4.114942528735632, "grad_norm": 268.31207275390625, "kl": 72.22108465433121, "learning_rate": 9.979783702572413e-07, "loss": 0.0722, "reward": 0.04999426531139761, "reward_std": 0.01990462909452617, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04999426717404276, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 358 }, { "completion_length": 89.5, "epoch": 4.137931034482759, "grad_norm": 263.82476806640625, "kl": 27.76571488380432, "learning_rate": 9.722147576623745e-07, "loss": 0.0278, "reward": 0.07178233447484672, "reward_std": 0.02303942432627082, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07178233447484672, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 360 }, { "completion_length": 121.125, "epoch": 4.160919540229885, "grad_norm": 329.9454345703125, "kl": 66.46213746070862, "learning_rate": 9.467075536135787e-07, "loss": 0.0665, "reward": 0.05593870091252029, "reward_std": 0.020594695408362895, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05593870137818158, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 362 }, { "completion_length": 97.25, "epoch": 4.183908045977011, "grad_norm": 232.8856201171875, "kl": 19.42772740125656, "learning_rate": 9.214610389907327e-07, "loss": 0.0194, "reward": 0.05885206162929535, "reward_std": 0.020634907472413033, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05885206186212599, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 364 }, { "completion_length": 125.75, "epoch": 4.206896551724138, "grad_norm": 601.11474609375, "kl": 389.543055832386, "learning_rate": 8.964794509221508e-07, "loss": 0.3895, "reward": 0.08361869282089174, "reward_std": 0.036958525772206485, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08361869491636753, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 366 }, { "completion_length": 85.0, "epoch": 4.2298850574712645, "grad_norm": 197.8262176513672, "kl": 8.75059586763382, "learning_rate": 8.71766982073462e-07, "loss": 0.0088, "reward": 0.09783951175631955, "reward_std": 0.026028962631244212, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0978395098936744, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 368 }, { "completion_length": 155.5, "epoch": 4.252873563218391, "grad_norm": 166.87765502929688, "kl": 27.580621302127838, "learning_rate": 8.473277799439569e-07, "loss": 0.0276, "reward": 0.04894532961770892, "reward_std": 0.026340805983636528, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.048945329152047634, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 370 }, { "completion_length": 104.125, "epoch": 4.275862068965517, "grad_norm": 245.80418395996094, "kl": 16.77532798051834, "learning_rate": 8.231659461705092e-07, "loss": 0.0168, "reward": 0.055821192217990756, "reward_std": 0.02552032028324902, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05582119361497462, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 372 }, { "completion_length": 196.875, "epoch": 4.2988505747126435, "grad_norm": 122.50881958007812, "kl": 131.06816470623016, "learning_rate": 7.992855358391968e-07, "loss": 0.1311, "reward": 0.06472561252303421, "reward_std": 0.024280548794195056, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06472561252303421, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 374 }, { "completion_length": 121.0, "epoch": 4.32183908045977, "grad_norm": 1531.7666015625, "kl": 273.0549658536911, "learning_rate": 7.756905568047393e-07, "loss": 0.2731, "reward": 0.07750574871897697, "reward_std": 0.027207322302274406, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07750575034879148, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 376 }, { "completion_length": 107.625, "epoch": 4.344827586206897, "grad_norm": 270.01812744140625, "kl": 27.606529653072357, "learning_rate": 7.523849690178567e-07, "loss": 0.0276, "reward": 0.05477215372957289, "reward_std": 0.014078312320634723, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05477215419523418, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 378 }, { "completion_length": 110.5, "epoch": 4.3678160919540225, "grad_norm": 378.3558044433594, "kl": 8.560508668422699, "learning_rate": 7.293726838606674e-07, "loss": 0.0086, "reward": 0.08128956309519708, "reward_std": 0.030680945317726582, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08128956542350352, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 380 }, { "completion_length": 128.75, "epoch": 4.390804597701149, "grad_norm": 204.55946350097656, "kl": 9.611089289188385, "learning_rate": 7.066575634902437e-07, "loss": 0.0096, "reward": 0.07787647051736712, "reward_std": 0.02802361361682415, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07787647121585906, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 382 }, { "completion_length": 126.125, "epoch": 4.413793103448276, "grad_norm": 339.25579833984375, "kl": 30.150237411260605, "learning_rate": 6.842434201904255e-07, "loss": 0.0302, "reward": 0.06910064723342657, "reward_std": 0.02399286429863423, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06910064851399511, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 384 }, { "completion_length": 96.25, "epoch": 4.436781609195402, "grad_norm": 99.44467163085938, "kl": 5.461384296417236, "learning_rate": 6.621340157319998e-07, "loss": 0.0055, "reward": 0.09473570249974728, "reward_std": 0.021121544879861176, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0947356999386102, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 386 }, { "completion_length": 112.125, "epoch": 4.459770114942529, "grad_norm": 232.5363006591797, "kl": 26.678832948207855, "learning_rate": 6.403330607413643e-07, "loss": 0.0267, "reward": 0.06848772964440286, "reward_std": 0.019976324576418847, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06848773104138672, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 388 }, { "completion_length": 89.25, "epoch": 4.482758620689655, "grad_norm": 200.0279083251953, "kl": 28.019516110420227, "learning_rate": 6.188442140777742e-07, "loss": 0.028, "reward": 0.10207226336933672, "reward_std": 0.038132603047415614, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.1020722643006593, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 390 }, { "completion_length": 91.375, "epoch": 4.505747126436781, "grad_norm": 176.74429321289062, "kl": 23.99394378066063, "learning_rate": 5.976710822192722e-07, "loss": 0.024, "reward": 0.07150419475510716, "reward_std": 0.024175929196644574, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07150419522076845, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 392 }, { "completion_length": 224.25, "epoch": 4.528735632183908, "grad_norm": 165.60427856445312, "kl": 92.11241781711578, "learning_rate": 5.768172186574123e-07, "loss": 0.0921, "reward": 0.057717035757377744, "reward_std": 0.023936723941005766, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0577170355245471, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 394 }, { "completion_length": 105.375, "epoch": 4.551724137931035, "grad_norm": 162.9764862060547, "kl": 18.045094400644302, "learning_rate": 5.562861233008774e-07, "loss": 0.018, "reward": 0.04579072282649577, "reward_std": 0.02352536143735051, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04579072189517319, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 396 }, { "completion_length": 176.0, "epoch": 4.574712643678161, "grad_norm": 212.1143341064453, "kl": 17.342518150806427, "learning_rate": 5.360812418880884e-07, "loss": 0.0173, "reward": 0.08125179202761501, "reward_std": 0.03418627165956423, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08125179109629244, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 398 }, { "completion_length": 138.375, "epoch": 4.597701149425287, "grad_norm": 162.72988891601562, "kl": 14.42275521159172, "learning_rate": 5.162059654089083e-07, "loss": 0.0144, "reward": 0.06432117149233818, "reward_std": 0.018252032954478636, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0643211716087535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 400 }, { "completion_length": 142.875, "epoch": 4.620689655172414, "grad_norm": 246.48211669921875, "kl": 66.99489599466324, "learning_rate": 4.966636295355254e-07, "loss": 0.067, "reward": 0.044884443981572986, "reward_std": 0.012307498574955389, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04488444374874234, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 402 }, { "completion_length": 81.5, "epoch": 4.64367816091954, "grad_norm": 296.2477722167969, "kl": 18.9678857922554, "learning_rate": 4.774575140626317e-07, "loss": 0.019, "reward": 0.07781441207043827, "reward_std": 0.02122665592469275, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07781441207043827, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 404 }, { "completion_length": 105.375, "epoch": 4.666666666666667, "grad_norm": 185.84320068359375, "kl": 18.622023940086365, "learning_rate": 4.5859084235697236e-07, "loss": 0.0186, "reward": 0.06583084259182215, "reward_std": 0.025323806214146316, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06583084259182215, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 406 }, { "completion_length": 102.0, "epoch": 4.689655172413794, "grad_norm": 203.86257934570312, "kl": 22.635424494743347, "learning_rate": 4.400667808163689e-07, "loss": 0.0226, "reward": 0.06861662562005222, "reward_std": 0.01788102719001472, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06861662562005222, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 408 }, { "completion_length": 112.25, "epoch": 4.712643678160919, "grad_norm": 84.01768493652344, "kl": 338.1599786877632, "learning_rate": 4.2188843833829874e-07, "loss": 0.3382, "reward": 0.05166293424554169, "reward_std": 0.028636299713980407, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05166293203365058, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 410 }, { "completion_length": 146.375, "epoch": 4.735632183908046, "grad_norm": 354.5594787597656, "kl": 26.927797377109528, "learning_rate": 4.040588657981301e-07, "loss": 0.0269, "reward": 0.05980089143849909, "reward_std": 0.020547856984194368, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05980089050717652, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 412 }, { "completion_length": 126.0, "epoch": 4.758620689655173, "grad_norm": 358.0815734863281, "kl": 166.8691514134407, "learning_rate": 3.8658105553709356e-07, "loss": 0.1669, "reward": 0.06559245544485748, "reward_std": 0.02324709319509566, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06559245544485748, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 414 }, { "completion_length": 89.375, "epoch": 4.781609195402299, "grad_norm": 16868.626953125, "kl": 3085.8103035092354, "learning_rate": 3.6945794086007706e-07, "loss": 3.0858, "reward": 0.06225014664232731, "reward_std": 0.02900221128948033, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.062250145711004734, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 416 }, { "completion_length": 190.375, "epoch": 4.804597701149425, "grad_norm": 185.25111389160156, "kl": 34.844902604818344, "learning_rate": 3.5269239554332565e-07, "loss": 0.0348, "reward": 0.03164132940582931, "reward_std": 0.019219688256271183, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03164133080281317, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 418 }, { "completion_length": 109.875, "epoch": 4.827586206896552, "grad_norm": 113.00900268554688, "kl": 17.416384041309357, "learning_rate": 3.362872333521389e-07, "loss": 0.0174, "reward": 0.04076780390460044, "reward_std": 0.015567160677164793, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0407678036717698, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 420 }, { "completion_length": 131.125, "epoch": 4.850574712643678, "grad_norm": 444.0363464355469, "kl": 160.99863409996033, "learning_rate": 3.2024520756863244e-07, "loss": 0.161, "reward": 0.07695711497217417, "reward_std": 0.028765658324118704, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07695711380802095, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 422 }, { "completion_length": 165.125, "epoch": 4.873563218390805, "grad_norm": 561.9520263671875, "kl": 81.70425218343735, "learning_rate": 3.0456901052965726e-07, "loss": 0.0817, "reward": 0.05970188160426915, "reward_std": 0.032429520215373486, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05970188160426915, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 424 }, { "completion_length": 94.375, "epoch": 4.896551724137931, "grad_norm": 209.0235595703125, "kl": 57.1011888384819, "learning_rate": 2.892612731749414e-07, "loss": 0.0571, "reward": 0.05344467982649803, "reward_std": 0.020119196677114815, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05344467982649803, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 426 }, { "completion_length": 151.875, "epoch": 4.919540229885057, "grad_norm": 119.91405487060547, "kl": 74.81910461187363, "learning_rate": 2.743245646055398e-07, "loss": 0.0748, "reward": 0.06681666756048799, "reward_std": 0.02402009002980776, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06681666662916541, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 428 }, { "completion_length": 90.625, "epoch": 4.942528735632184, "grad_norm": 151.1865997314453, "kl": 17.41374397277832, "learning_rate": 2.5976139165266367e-07, "loss": 0.0174, "reward": 0.062349649146199226, "reward_std": 0.018249723158078268, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06234965054318309, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 430 }, { "completion_length": 94.125, "epoch": 4.9655172413793105, "grad_norm": 245.86019897460938, "kl": 34.20732420682907, "learning_rate": 2.455741984569543e-07, "loss": 0.0342, "reward": 0.10106607899069786, "reward_std": 0.02299020008649677, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.10106607712805271, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 432 }, { "completion_length": 61.625, "epoch": 4.988505747126437, "grad_norm": 179.89816284179688, "kl": 12.610459923744202, "learning_rate": 2.3176536605828443e-07, "loss": 0.0126, "reward": 0.040777527960017323, "reward_std": 0.013355905626667663, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.040777527843602, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 434 }, { "completion_length": 69.5, "epoch": 5.011494252873563, "grad_norm": 103.5875015258789, "kl": 42.14557844400406, "learning_rate": 2.1833721199614992e-07, "loss": 0.0421, "reward": 0.050128996605053544, "reward_std": 0.01959619121043943, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.050128996605053544, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 436 }, { "completion_length": 117.5, "epoch": 5.0344827586206895, "grad_norm": 269.74884033203125, "kl": 98.92739188671112, "learning_rate": 2.0529198992071202e-07, "loss": 0.0989, "reward": 0.048841290175914764, "reward_std": 0.013170149497454986, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04884129052516073, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 438 }, { "completion_length": 102.25, "epoch": 5.057471264367816, "grad_norm": 143.8467254638672, "kl": 71.28720360994339, "learning_rate": 1.926318892145712e-07, "loss": 0.0713, "reward": 0.07902667589951307, "reward_std": 0.024230862269178033, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0790266771800816, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 440 }, { "completion_length": 119.75, "epoch": 5.080459770114943, "grad_norm": 242.18421936035156, "kl": 81.87208154797554, "learning_rate": 1.803590346253195e-07, "loss": 0.0819, "reward": 0.07706521404907107, "reward_std": 0.02488556585740298, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07706521404907107, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 442 }, { "completion_length": 102.25, "epoch": 5.103448275862069, "grad_norm": 527.6226806640625, "kl": 14.882905751466751, "learning_rate": 1.6847548590894435e-07, "loss": 0.0149, "reward": 0.06148105091415346, "reward_std": 0.026427923352457583, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.061481051379814744, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 444 }, { "completion_length": 119.375, "epoch": 5.126436781609195, "grad_norm": 624.638916015625, "kl": 86.14807039499283, "learning_rate": 1.5698323748414123e-07, "loss": 0.0861, "reward": 0.07114387373439968, "reward_std": 0.025059880048502237, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07114387326873839, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 446 }, { "completion_length": 119.0, "epoch": 5.149425287356322, "grad_norm": 127.02527618408203, "kl": 26.64391726255417, "learning_rate": 1.458842180975864e-07, "loss": 0.0266, "reward": 0.07868425198830664, "reward_std": 0.021157007955480367, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07868425187189132, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 448 }, { "completion_length": 118.125, "epoch": 5.172413793103448, "grad_norm": 206.26356506347656, "kl": 48.104073613882065, "learning_rate": 1.3518029050023862e-07, "loss": 0.0481, "reward": 0.06190641736611724, "reward_std": 0.03233522875234485, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.061906418297439814, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 450 }, { "completion_length": 108.125, "epoch": 5.195402298850575, "grad_norm": 213.14529418945312, "kl": 65.89005625247955, "learning_rate": 1.2487325113471034e-07, "loss": 0.0659, "reward": 0.05019039229955524, "reward_std": 0.023320534935919568, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05019039299804717, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 452 }, { "completion_length": 163.125, "epoch": 5.218390804597701, "grad_norm": 245.68336486816406, "kl": 39.29124653339386, "learning_rate": 1.1496482983377189e-07, "loss": 0.0393, "reward": 0.07361394097097218, "reward_std": 0.02997289298218675, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07361394050531089, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 454 }, { "completion_length": 89.125, "epoch": 5.241379310344827, "grad_norm": 247.30177307128906, "kl": 42.33103388547897, "learning_rate": 1.054566895300324e-07, "loss": 0.0423, "reward": 0.08200049586594105, "reward_std": 0.031577705522067845, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0820004977285862, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 456 }, { "completion_length": 146.125, "epoch": 5.264367816091954, "grad_norm": 332.1698303222656, "kl": 28.958667635917664, "learning_rate": 9.635042597685024e-08, "loss": 0.029, "reward": 0.10437152965459973, "reward_std": 0.02384127100231126, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.10437152779195458, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 458 }, { "completion_length": 76.25, "epoch": 5.287356321839081, "grad_norm": 347.81439208984375, "kl": 30.358800321817398, "learning_rate": 8.764756748051661e-08, "loss": 0.0304, "reward": 0.05820860539097339, "reward_std": 0.029113344906363636, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05820860655512661, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 460 }, { "completion_length": 108.75, "epoch": 5.310344827586207, "grad_norm": 232.17935180664062, "kl": 58.13675355911255, "learning_rate": 7.934957464376059e-08, "loss": 0.0581, "reward": 0.03255829040426761, "reward_std": 0.011912564310478047, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03255829063709825, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 462 }, { "completion_length": 101.125, "epoch": 5.333333333333333, "grad_norm": 204.68075561523438, "kl": 74.83390206098557, "learning_rate": 7.145784012061424e-08, "loss": 0.0748, "reward": 0.06471558753401041, "reward_std": 0.02323675691150129, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06471558753401041, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 464 }, { "completion_length": 110.5, "epoch": 5.35632183908046, "grad_norm": 403.2699279785156, "kl": 13.097857117652893, "learning_rate": 6.397368838268497e-08, "loss": 0.0131, "reward": 0.0381945117842406, "reward_std": 0.01582346693612635, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03819451155140996, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 466 }, { "completion_length": 112.25, "epoch": 5.379310344827586, "grad_norm": 158.2527313232422, "kl": 170.7386868596077, "learning_rate": 5.6898375496867444e-08, "loss": 0.1707, "reward": 0.05373863037675619, "reward_std": 0.01907505358394701, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.053738630609586835, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 468 }, { "completion_length": 162.25, "epoch": 5.402298850574713, "grad_norm": 85.8644790649414, "kl": 24.265853881835938, "learning_rate": 5.023308891453915e-08, "loss": 0.0243, "reward": 0.060619605937972665, "reward_std": 0.01981559843989089, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.060619607800617814, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 470 }, { "completion_length": 109.75, "epoch": 5.425287356321839, "grad_norm": 182.0062713623047, "kl": 7.7864866852760315, "learning_rate": 4.397894727226931e-08, "loss": 0.0078, "reward": 0.11819641152396798, "reward_std": 0.04493911180179566, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.11819641152396798, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 472 }, { "completion_length": 166.875, "epoch": 5.448275862068965, "grad_norm": 134.50875854492188, "kl": 56.89212989807129, "learning_rate": 3.813700020407707e-08, "loss": 0.0569, "reward": 0.05759430106263608, "reward_std": 0.02498369975364767, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05759430245961994, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 474 }, { "completion_length": 140.75, "epoch": 5.471264367816092, "grad_norm": 273.6034851074219, "kl": 114.64517286419868, "learning_rate": 3.270822816527325e-08, "loss": 0.1146, "reward": 0.05116581660695374, "reward_std": 0.017275791411520913, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05116581846959889, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 476 }, { "completion_length": 89.625, "epoch": 5.494252873563219, "grad_norm": 39.552711486816406, "kl": 14.781741857528687, "learning_rate": 2.7693542267908934e-08, "loss": 0.0148, "reward": 0.04086445295251906, "reward_std": 0.013047619431745261, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.040864453418180346, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 478 }, { "completion_length": 87.875, "epoch": 5.517241379310345, "grad_norm": 116.31592559814453, "kl": 251.68036818504333, "learning_rate": 2.309378412786306e-08, "loss": 0.2517, "reward": 0.06248646741732955, "reward_std": 0.021653928386513144, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.06248646741732955, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 480 }, { "completion_length": 100.5, "epoch": 5.540229885057471, "grad_norm": 318.4939270019531, "kl": 41.53044927120209, "learning_rate": 1.890972572359456e-08, "loss": 0.0415, "reward": 0.07314828992821276, "reward_std": 0.030538745340891182, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0731482901610434, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 482 }, { "completion_length": 92.625, "epoch": 5.563218390804598, "grad_norm": 209.56138610839844, "kl": 85.39921009540558, "learning_rate": 1.5142069266580462e-08, "loss": 0.0854, "reward": 0.07255859789438546, "reward_std": 0.03463851008564234, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.07255859626457095, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 484 }, { "completion_length": 100.5, "epoch": 5.586206896551724, "grad_norm": 353.20306396484375, "kl": 9.140353620052338, "learning_rate": 1.1791447083465136e-08, "loss": 0.0091, "reward": 0.049784947419539094, "reward_std": 0.014763840037630871, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04978494835086167, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 486 }, { "completion_length": 181.375, "epoch": 5.609195402298851, "grad_norm": 206.03250122070312, "kl": 19.34032052755356, "learning_rate": 8.858421509933823e-09, "loss": 0.0193, "reward": 0.08371848054230213, "reward_std": 0.03288691467605531, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.08371848124079406, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 488 }, { "completion_length": 103.25, "epoch": 5.6321839080459775, "grad_norm": 150.69528198242188, "kl": 17.797045648097992, "learning_rate": 6.343484796338395e-09, "loss": 0.0178, "reward": 0.05300784157589078, "reward_std": 0.017884997942019254, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.05300783971324563, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 490 }, { "completion_length": 129.875, "epoch": 5.655172413793103, "grad_norm": 96.60308837890625, "kl": 20.069464325904846, "learning_rate": 4.247059025082323e-09, "loss": 0.0201, "reward": 0.05194263160228729, "reward_std": 0.021622385072987527, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.051942631835117936, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 492 }, { "completion_length": 80.25, "epoch": 5.67816091954023, "grad_norm": 172.4818878173828, "kl": 22.12061709165573, "learning_rate": 2.5694960397806834e-09, "loss": 0.0221, "reward": 0.046301966765895486, "reward_std": 0.014548589737387374, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.04630196746438742, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 494 }, { "completion_length": 105.75, "epoch": 5.7011494252873565, "grad_norm": 88.81124114990234, "kl": 302.9540114104748, "learning_rate": 1.3110773862126669e-09, "loss": 0.303, "reward": 0.06522750947624445, "reward_std": 0.01896664861124009, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.0652275097090751, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 496 }, { "completion_length": 126.25, "epoch": 5.724137931034483, "grad_norm": 158.61248779296875, "kl": 63.442298233509064, "learning_rate": 4.720142650685433e-10, "loss": 0.0634, "reward": 0.03797460882924497, "reward_std": 0.017652916838414967, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.03797460859641433, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 498 }, { "completion_length": 116.5, "epoch": 5.747126436781609, "grad_norm": 184.5111541748047, "kl": 7.218226313591003, "learning_rate": 5.2447496503016395e-11, "loss": 0.0072, "reward": 0.09307991340756416, "reward_std": 0.03332131908973679, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.09307991620153189, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0, "step": 500 }, { "epoch": 5.747126436781609, "step": 500, "total_flos": 0.0, "train_loss": 7.480205215036584e+17, "train_runtime": 3503.0797, "train_samples_per_second": 2.284, "train_steps_per_second": 0.143 } ], "logging_steps": 2, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }