{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.714285714285714, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 531.0625, "epoch": 0.14285714285714285, "grad_norm": 0.43849122524261475, "kl": 0.0007592401634610724, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.9380363449454308, "reward_std": 1.002307377755642, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5975051186978817, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27803126722574234, "step": 2 }, { "completion_length": 397.3125, "epoch": 0.2857142857142857, "grad_norm": 1.577577829360962, "kl": 0.0008190772859961726, "learning_rate": 3e-07, "loss": 0.0, "reward": 1.7442463338375092, "reward_std": 1.2219942659139633, "rewards/concensus_correctness_reward_func": 0.08231249824166298, "rewards/consensus_reward_func": 0.6875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6789963319897652, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.26418749848380685, "step": 4 }, { "completion_length": 566.875, "epoch": 0.42857142857142855, "grad_norm": 0.8244675993919373, "kl": 0.0007077432819642127, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.6187557838857174, "reward_std": 0.9308283319696784, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.31600580271333456, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08787499507889152, "step": 6 }, { "completion_length": 393.03125, "epoch": 0.5714285714285714, "grad_norm": 0.4791373610496521, "kl": 0.0010389809758635238, "learning_rate": 4.997653255609941e-07, "loss": 0.0, "reward": 1.142044559121132, "reward_std": 0.993548296391964, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5628570690751076, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.1104374947026372, "step": 8 }, { "completion_length": 451.96875, "epoch": 0.7142857142857143, "grad_norm": 0.24888773262500763, "kl": 0.0007037188952381257, "learning_rate": 4.990617428207153e-07, "loss": 0.0, "reward": 0.9834587760269642, "reward_std": 1.4619794301688671, "rewards/concensus_correctness_reward_func": 0.05999999865889549, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.529052471742034, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.06628124509006739, "step": 10 }, { "completion_length": 546.625, "epoch": 0.8571428571428571, "grad_norm": 0.4139728546142578, "kl": 0.0007554452349722851, "learning_rate": 4.978905726822423e-07, "loss": 0.0, "reward": 0.7695160396397114, "reward_std": 0.9300474151968956, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4751097960397601, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.09128125011920929, "step": 12 }, { "completion_length": 484.9375, "epoch": 1.0, "grad_norm": 0.35962751507759094, "kl": 0.0007782203174429014, "learning_rate": 4.962540138951371e-07, "loss": 0.0, "reward": 1.0523580554872751, "reward_std": 1.1958911046385765, "rewards/concensus_correctness_reward_func": 0.04731250088661909, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5425143092870712, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.16565624624490738, "step": 14 }, { "completion_length": 400.78125, "epoch": 1.1428571428571428, "grad_norm": 0.6109427809715271, "kl": 0.0008876436986611225, "learning_rate": 4.941551389275217e-07, "loss": 0.0, "reward": 1.3130817487835884, "reward_std": 1.2536730356514454, "rewards/concensus_correctness_reward_func": 0.05999999865889549, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5473629906773567, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.40884375921450555, "step": 16 }, { "completion_length": 486.75, "epoch": 1.2857142857142856, "grad_norm": 0.6664641499519348, "kl": 0.0007863550781621598, "learning_rate": 4.915978881978406e-07, "loss": 0.0, "reward": 0.7770229317247868, "reward_std": 0.8553876895457506, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.40283544175326824, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.09293749369680882, "step": 18 }, { "completion_length": 448.0625, "epoch": 1.4285714285714286, "grad_norm": 2.3377184867858887, "kl": 0.0008212350694520865, "learning_rate": 4.88587062677137e-07, "loss": 0.0, "reward": 1.2851424254477024, "reward_std": 1.0251347795128822, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.42982995929196477, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.33968749921768904, "step": 20 }, { "completion_length": 493.0625, "epoch": 1.5714285714285714, "grad_norm": 0.49713313579559326, "kl": 0.0007119231995602604, "learning_rate": 4.85128314875731e-07, "loss": 0.0, "reward": 0.9178848676383495, "reward_std": 1.0165484435856342, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4934161137789488, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11196874734014273, "step": 22 }, { "completion_length": 461.03125, "epoch": 1.7142857142857144, "grad_norm": 0.4986618757247925, "kl": 0.001024968114506919, "learning_rate": 4.812281382312223e-07, "loss": 0.0, "reward": 0.7276457324624062, "reward_std": 1.1532482020556927, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.410145727917552, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.05187500640749931, "step": 24 }, { "completion_length": 527.25, "epoch": 1.8571428571428572, "grad_norm": 0.5186403393745422, "kl": 0.0006880087239551358, "learning_rate": 4.768938549177392e-07, "loss": 0.0, "reward": 1.18468963727355, "reward_std": 0.9039346128702164, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6515021324157715, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.15818749461323023, "step": 26 }, { "completion_length": 427.1875, "epoch": 2.0, "grad_norm": 0.8767024874687195, "kl": 0.0007596770446980372, "learning_rate": 4.721336020993228e-07, "loss": 0.0, "reward": 1.2328671924769878, "reward_std": 1.0404776483774185, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.5037421826273203, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.322875015437603, "step": 28 }, { "completion_length": 500.75, "epoch": 2.142857142857143, "grad_norm": 0.47846952080726624, "kl": 0.0008058039384195581, "learning_rate": 4.669563166532503e-07, "loss": 0.0, "reward": 1.2811774164438248, "reward_std": 1.28513915091753, "rewards/concensus_correctness_reward_func": 0.022874999791383743, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.46248994395136833, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2958124950528145, "step": 30 }, { "completion_length": 505.625, "epoch": 2.2857142857142856, "grad_norm": 0.7064347863197327, "kl": 0.0008112043797154911, "learning_rate": 4.6137171839198297e-07, "loss": 0.0, "reward": 0.8990373089909554, "reward_std": 1.036856360733509, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5230998322367668, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.17281250469386578, "step": 32 }, { "completion_length": 496.8125, "epoch": 2.4285714285714284, "grad_norm": 0.5539802312850952, "kl": 0.0008243580268754158, "learning_rate": 4.5539029181523284e-07, "loss": 0.0, "reward": 1.531732201576233, "reward_std": 1.292894572019577, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5942946895956993, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.4061875008046627, "step": 34 }, { "completion_length": 481.375, "epoch": 2.571428571428571, "grad_norm": 2.632323980331421, "kl": 0.0007293780872714706, "learning_rate": 4.490232664264109e-07, "loss": 0.0, "reward": 0.9573331773281097, "reward_std": 1.0130096804350615, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4835206978023052, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.19256250374019146, "step": 36 }, { "completion_length": 489.46875, "epoch": 2.7142857142857144, "grad_norm": 0.5014302730560303, "kl": 0.0008036747931328136, "learning_rate": 4.422825956504072e-07, "loss": 0.0, "reward": 0.9848394468426704, "reward_std": 1.0464553572237492, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.41177696315571666, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26056248880922794, "step": 38 }, { "completion_length": 354.40625, "epoch": 2.857142857142857, "grad_norm": 0.8983840942382812, "kl": 0.0010199547614320181, "learning_rate": 4.3518093439228474e-07, "loss": 0.0, "reward": 1.3354799263179302, "reward_std": 0.7889060713350773, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4569174461066723, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.28481248766183853, "step": 40 }, { "completion_length": 491.84375, "epoch": 3.0, "grad_norm": 0.3673498034477234, "kl": 0.0007946314362925477, "learning_rate": 4.277316152790177e-07, "loss": 0.0, "reward": 1.2819406390190125, "reward_std": 1.23009411431849, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.5464093834161758, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2824062593281269, "step": 42 }, { "completion_length": 424.96875, "epoch": 3.142857142857143, "grad_norm": 0.6144971251487732, "kl": 0.0008468980086036026, "learning_rate": 4.1994862362887694e-07, "loss": 0.0, "reward": 1.2973965927958488, "reward_std": 0.8762462437152863, "rewards/concensus_correctness_reward_func": 0.022874999791383743, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4806466265581548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.4032499957829714, "step": 44 }, { "completion_length": 384.0625, "epoch": 3.2857142857142856, "grad_norm": 0.45620861649513245, "kl": 0.002571742355939932, "learning_rate": 4.118465711954569e-07, "loss": 0.0, "reward": 1.4320471212267876, "reward_std": 1.0780956260859966, "rewards/concensus_correctness_reward_func": 0.0078125, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5111721362918615, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.27243749238550663, "step": 46 }, { "completion_length": 504.71875, "epoch": 3.4285714285714284, "grad_norm": 0.5377627015113831, "kl": 0.0012243477249285206, "learning_rate": 4.0344066873563436e-07, "loss": 0.0, "reward": 0.736795149743557, "reward_std": 0.9947699457406998, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.569857656955719, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.02631249837577343, "step": 48 }, { "completion_length": 477.75, "epoch": 3.571428571428571, "grad_norm": 0.49835580587387085, "kl": 0.0008050548494793475, "learning_rate": 3.947466974529622e-07, "loss": 0.0, "reward": 1.3773181941360235, "reward_std": 1.2256085090339184, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.6182556860148907, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.24343750812113285, "step": 50 }, { "completion_length": 394.90625, "epoch": 3.7142857142857144, "grad_norm": 0.9342589378356934, "kl": 0.001351733117189724, "learning_rate": 3.857809793701082e-07, "loss": 0.0, "reward": 1.1242612563073635, "reward_std": 1.042309246957302, "rewards/concensus_correctness_reward_func": 0.1587500013411045, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4321050215512514, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.25215625343844295, "step": 52 }, { "completion_length": 524.65625, "epoch": 3.857142857142857, "grad_norm": 0.40101158618927, "kl": 0.0008420911181019619, "learning_rate": 3.765603466859635e-07, "loss": 0.0, "reward": 0.8740292452275753, "reward_std": 1.0554649233818054, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.497060501947999, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.11134374886751175, "step": 54 }, { "completion_length": 584.53125, "epoch": 4.0, "grad_norm": 0.5451942682266235, "kl": 0.0007025906161288731, "learning_rate": 3.6710211017494754e-07, "loss": 0.0, "reward": 1.0994054786860943, "reward_std": 1.2268578335642815, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.75, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.37571799475699663, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.026312503032386303, "step": 56 }, { "completion_length": 494.34375, "epoch": 4.142857142857143, "grad_norm": 0.4098193943500519, "kl": 0.0008971344286692329, "learning_rate": 3.5742402668783795e-07, "loss": 0.0, "reward": 1.0998137965798378, "reward_std": 0.9589141272008419, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4196263402700424, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.10206249542534351, "step": 58 }, { "completion_length": 398.375, "epoch": 4.285714285714286, "grad_norm": 2.3923497200012207, "kl": 0.0009595184528734535, "learning_rate": 3.475442658151386e-07, "loss": 0.0, "reward": 1.0170318558812141, "reward_std": 0.8580264896154404, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4561881124973297, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.23271876480430365, "step": 60 }, { "completion_length": 383.25, "epoch": 4.428571428571429, "grad_norm": 0.5244536399841309, "kl": 0.000809692959592212, "learning_rate": 3.374813757755721e-07, "loss": 0.0, "reward": 1.160951979458332, "reward_std": 0.967152014374733, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.559733223170042, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.28871875163167715, "step": 62 }, { "completion_length": 456.28125, "epoch": 4.571428571428571, "grad_norm": 0.4779525697231293, "kl": 0.0007659446309844498, "learning_rate": 3.272542485937368e-07, "loss": 0.0, "reward": 1.1288444176316261, "reward_std": 0.9435681980103254, "rewards/concensus_correctness_reward_func": 0.03968749940395355, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.4585944190621376, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.4274374954402447, "step": 64 }, { "completion_length": 450.5625, "epoch": 4.714285714285714, "grad_norm": 6.9962158203125, "kl": 0.0009121940056502353, "learning_rate": 3.168820846323053e-07, "loss": 0.0, "reward": 0.9754399657249451, "reward_std": 1.2612947151064873, "rewards/concensus_correctness_reward_func": 0.04937500134110451, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5312837082892656, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.19165627285838127, "step": 66 }, { "completion_length": 572.5625, "epoch": 4.857142857142857, "grad_norm": 25.707015991210938, "kl": 0.001060292182955891, "learning_rate": 3.0638435654534855e-07, "loss": 0.0, "reward": 1.2489292174577713, "reward_std": 1.0243084505200386, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6242104470729828, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.18721875082701445, "step": 68 }, { "completion_length": 400.75, "epoch": 5.0, "grad_norm": 0.5181100964546204, "kl": 0.0017470509847044013, "learning_rate": 2.9578077272046406e-07, "loss": 0.0, "reward": 0.8818678706884384, "reward_std": 1.1266341097652912, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3456803672015667, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.14556251280009747, "step": 70 }, { "completion_length": 520.90625, "epoch": 5.142857142857143, "grad_norm": 0.4338609576225281, "kl": 0.0008475819049635902, "learning_rate": 2.850912402783361e-07, "loss": 0.0, "reward": 0.9694934040307999, "reward_std": 1.0477705840021372, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4826496662572026, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11184373684227467, "step": 72 }, { "completion_length": 466.84375, "epoch": 5.285714285714286, "grad_norm": 0.7591902017593384, "kl": 0.0008839051115501206, "learning_rate": 2.743358276991975e-07, "loss": 0.0, "reward": 1.1550805270671844, "reward_std": 1.1475523337721825, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.49870552588254213, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.31262501119636, "step": 74 }, { "completion_length": 509.1875, "epoch": 5.428571428571429, "grad_norm": 0.43733879923820496, "kl": 0.0007559881923953071, "learning_rate": 2.635347271463544e-07, "loss": 0.0, "reward": 0.7065972574055195, "reward_std": 1.2975607588887215, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.39328478649258614, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": -0.03043750301003456, "step": 76 }, { "completion_length": 474.40625, "epoch": 5.571428571428571, "grad_norm": 4.258476257324219, "kl": 0.0012869478377979249, "learning_rate": 2.5270821655750997e-07, "loss": 0.0, "reward": 1.3390182089060545, "reward_std": 1.1490016989409924, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.49158067628741264, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2693125046789646, "step": 78 }, { "completion_length": 461.90625, "epoch": 5.714285714285714, "grad_norm": 18.467025756835938, "kl": 0.0017964004873647355, "learning_rate": 2.418766215750549e-07, "loss": 0.0, "reward": 1.166889525949955, "reward_std": 1.3167821913957596, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.47351452335715294, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.16212500724941492, "step": 80 }, { "completion_length": 491.1875, "epoch": 5.857142857142857, "grad_norm": 0.4540603458881378, "kl": 0.0007902593570179306, "learning_rate": 2.310602773867974e-07, "loss": 0.0, "reward": 1.0019405260682106, "reward_std": 1.3042295798659325, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.4987842608243227, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": -0.07496875338256359, "step": 82 }, { "completion_length": 398.0, "epoch": 6.0, "grad_norm": 0.5137256979942322, "kl": 0.0009611124041839503, "learning_rate": 2.202794905487734e-07, "loss": 0.0, "reward": 1.0205133110284805, "reward_std": 0.8475228548049927, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5592633187770844, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.17999999597668648, "step": 84 }, { "completion_length": 469.65625, "epoch": 6.142857142857143, "grad_norm": 0.4792466163635254, "kl": 0.0008061980261118151, "learning_rate": 2.0955450086180881e-07, "loss": 0.0, "reward": 0.8682211935520172, "reward_std": 1.0378656350076199, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4103149529546499, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27040624991059303, "step": 86 }, { "completion_length": 400.40625, "epoch": 6.285714285714286, "grad_norm": 7.85969352722168, "kl": 0.0010710225178627297, "learning_rate": 1.9890544337340882e-07, "loss": 0.0, "reward": 0.8357085809111595, "reward_std": 0.856484754011035, "rewards/concensus_correctness_reward_func": 0.023624999448657036, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.3641773536801338, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2447812515310943, "step": 88 }, { "completion_length": 426.71875, "epoch": 6.428571428571429, "grad_norm": 0.47061946988105774, "kl": 0.0008341599168488756, "learning_rate": 1.8835231057630952e-07, "loss": 0.0, "reward": 1.5294715389609337, "reward_std": 1.2328666970133781, "rewards/concensus_correctness_reward_func": 0.11943749710917473, "rewards/consensus_reward_func": 0.4375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5701277758926153, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.3711562566459179, "step": 90 }, { "completion_length": 523.21875, "epoch": 6.571428571428571, "grad_norm": 0.5155832767486572, "kl": 0.0008018054249987472, "learning_rate": 1.779149148746623e-07, "loss": 0.0, "reward": 0.8209393434226513, "reward_std": 1.3677421361207962, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4111268315464258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": -0.10581249743700027, "step": 92 }, { "completion_length": 430.0625, "epoch": 6.714285714285714, "grad_norm": 0.5447288751602173, "kl": 0.0008923190762288868, "learning_rate": 1.6761285138831492e-07, "loss": 0.0, "reward": 1.4743354730308056, "reward_std": 1.434298001229763, "rewards/concensus_correctness_reward_func": 0.03968749940395355, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.5945542603731155, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.34009375236928463, "step": 94 }, { "completion_length": 446.25, "epoch": 6.857142857142857, "grad_norm": 0.5138098001480103, "kl": 0.0012626612806343473, "learning_rate": 1.5746546116502139e-07, "loss": 0.0, "reward": 1.05492047034204, "reward_std": 0.9639037847518921, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.526826735585928, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.34059375477954745, "step": 96 }, { "completion_length": 454.5625, "epoch": 7.0, "grad_norm": 0.40146228671073914, "kl": 0.0007050501299090683, "learning_rate": 1.4749179486964598e-07, "loss": 0.0, "reward": 1.0958987846970558, "reward_std": 0.8821211084723473, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.4891800582408905, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.13796875812113285, "step": 98 }, { "completion_length": 379.71875, "epoch": 7.142857142857143, "grad_norm": 0.3652368187904358, "kl": 0.0009633691079216078, "learning_rate": 1.377105770185303e-07, "loss": 0.0, "reward": 0.7689098361879587, "reward_std": 1.277234248816967, "rewards/concensus_correctness_reward_func": 0.03700000047683716, "rewards/consensus_reward_func": 0.25, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.31775358971208334, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.11728124879300594, "step": 100 }, { "completion_length": 440.40625, "epoch": 7.285714285714286, "grad_norm": 1.0428880453109741, "kl": 0.0008292151760542765, "learning_rate": 1.2814017082617022e-07, "loss": 0.0, "reward": 1.671914242208004, "reward_std": 1.0861946307122707, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.8125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6100080162286758, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.23378124739974737, "step": 102 }, { "completion_length": 560.84375, "epoch": 7.428571428571429, "grad_norm": 0.5331642627716064, "kl": 0.0006519956768897828, "learning_rate": 1.1879854373019988e-07, "loss": 0.0, "reward": 1.0102687766775489, "reward_std": 1.1615657843649387, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4500812734477222, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0601874990388751, "step": 104 }, { "completion_length": 368.3125, "epoch": 7.571428571428571, "grad_norm": 0.5822815895080566, "kl": 0.00092067445802968, "learning_rate": 1.0970323365940443e-07, "loss": 0.0, "reward": 1.556887112557888, "reward_std": 1.1068004593253136, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5599808432161808, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.4656562441959977, "step": 106 }, { "completion_length": 590.40625, "epoch": 7.714285714285714, "grad_norm": 0.4019765257835388, "kl": 0.000690833454427775, "learning_rate": 1.0087131610809151e-07, "loss": 0.0, "reward": 1.0686058551073074, "reward_std": 0.9091645516455173, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.5719808377325535, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05912499502301216, "step": 108 }, { "completion_length": 550.34375, "epoch": 7.857142857142857, "grad_norm": 0.46632856130599976, "kl": 0.0007421129339491017, "learning_rate": 9.231937207863458e-08, "loss": 0.0, "reward": 1.272650208324194, "reward_std": 0.9663579203188419, "rewards/concensus_correctness_reward_func": 0.03968749940395355, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.6062439531087875, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.23609375767409801, "step": 110 }, { "completion_length": 454.0625, "epoch": 8.0, "grad_norm": 0.3237430453300476, "kl": 0.0006917461178090889, "learning_rate": 8.406345695237394e-08, "loss": 0.0, "reward": 0.9366877265274525, "reward_std": 0.823640601709485, "rewards/concensus_correctness_reward_func": 0.04574999958276749, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.5623127091675997, "rewards/soft_format_reward_func": 0.015625, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0005000047385692596, "step": 112 }, { "completion_length": 502.53125, "epoch": 8.142857142857142, "grad_norm": 0.484937846660614, "kl": 0.0007385101416730322, "learning_rate": 7.611907034731538e-08, "loss": 0.0, "reward": 1.1455121785402298, "reward_std": 1.2154901176691055, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.6119183450937271, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.14296876452863216, "step": 114 }, { "completion_length": 510.9375, "epoch": 8.285714285714286, "grad_norm": 0.78264981508255, "kl": 0.000916360630071722, "learning_rate": 6.850112701921735e-08, "loss": 0.0, "reward": 0.9761499464511871, "reward_std": 1.0737531632184982, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4395561721175909, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.4584687575697899, "step": 116 }, { "completion_length": 585.09375, "epoch": 8.428571428571429, "grad_norm": 0.3693084120750427, "kl": 0.00056322867385461, "learning_rate": 6.122392886069486e-08, "loss": 0.0, "reward": 0.893231701105833, "reward_std": 1.0969277396798134, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.48766916897147894, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.03125, "rewards/xmlcount_reward_func": 0.18681250559166074, "step": 118 }, { "completion_length": 462.625, "epoch": 8.571428571428571, "grad_norm": 0.30537551641464233, "kl": 0.0007632317865500227, "learning_rate": 5.43011380509111e-08, "loss": 0.0, "reward": 0.97243014536798, "reward_std": 0.6945619508624077, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5610863734036684, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.2707187496125698, "step": 120 }, { "completion_length": 547.4375, "epoch": 8.714285714285714, "grad_norm": 1.2487338781356812, "kl": 0.0006829651501902845, "learning_rate": 4.774575140626316e-08, "loss": 0.0, "reward": 1.1616501435637474, "reward_std": 0.7140795886516571, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5262126475572586, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.26043749740347266, "step": 122 }, { "completion_length": 379.96875, "epoch": 8.857142857142858, "grad_norm": 0.6411337852478027, "kl": 0.0008765048332861625, "learning_rate": 4.15700759802175e-08, "loss": 0.0, "reward": 1.4295315220952034, "reward_std": 0.840527132153511, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.39525024220347404, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.5186562407761812, "step": 124 }, { "completion_length": 414.625, "epoch": 9.0, "grad_norm": 0.5082442164421082, "kl": 0.0010801785756484605, "learning_rate": 3.578570595810274e-08, "loss": 0.0, "reward": 1.3468186743557453, "reward_std": 1.312661036849022, "rewards/concensus_correctness_reward_func": 0.07400000095367432, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4449436501599848, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.2809999957680702, "step": 126 }, { "completion_length": 363.59375, "epoch": 9.142857142857142, "grad_norm": 0.7290022969245911, "kl": 0.0009098628361243755, "learning_rate": 3.0403500890238435e-08, "loss": 0.0, "reward": 1.7384984195232391, "reward_std": 1.120932698249817, "rewards/concensus_correctness_reward_func": 0.03968749940395355, "rewards/consensus_reward_func": 0.875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5292484303936362, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.24768750555813313, "step": 128 }, { "completion_length": 475.59375, "epoch": 9.285714285714286, "grad_norm": 0.8706356883049011, "kl": 0.0007130142839741893, "learning_rate": 2.5433565304263937e-08, "loss": 0.0, "reward": 0.9139937087893486, "reward_std": 0.9278598949313164, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5153999701142311, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.27359374053776264, "step": 130 }, { "completion_length": 488.90625, "epoch": 9.428571428571429, "grad_norm": 0.4597799479961395, "kl": 0.0007840716061764397, "learning_rate": 2.08852297349435e-08, "loss": 0.0, "reward": 1.1477353498339653, "reward_std": 1.0849581249058247, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.375, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.47573534958064556, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0625, "rewards/xmlcount_reward_func": 0.23450001329183578, "step": 132 }, { "completion_length": 466.46875, "epoch": 9.571428571428571, "grad_norm": 0.4358411133289337, "kl": 0.0007847334636608139, "learning_rate": 1.6767033207062297e-08, "loss": 0.0, "reward": 1.436149962246418, "reward_std": 1.238723523914814, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.5826500160619617, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.27537500858306885, "step": 134 }, { "completion_length": 620.40625, "epoch": 9.714285714285714, "grad_norm": 0.24498514831066132, "kl": 0.0006698338329442777, "learning_rate": 1.3086707204299413e-08, "loss": 0.0, "reward": 0.4200167916715145, "reward_std": 1.446401447057724, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5266417786478996, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": -0.3097500205039978, "step": 136 }, { "completion_length": 459.78125, "epoch": 9.857142857142858, "grad_norm": 0.6039451956748962, "kl": 0.0008627947681816295, "learning_rate": 9.851161154175336e-09, "loss": 0.0, "reward": 1.2975695468485355, "reward_std": 1.2158415243029594, "rewards/concensus_correctness_reward_func": 0.04937500134110451, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5543820522725582, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.1781875118613243, "step": 138 }, { "completion_length": 461.65625, "epoch": 10.0, "grad_norm": 0.6108608245849609, "kl": 0.0011360995413269848, "learning_rate": 7.066469456323609e-09, "loss": 0.0, "reward": 0.9892156459391117, "reward_std": 0.8619969859719276, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4117156434804201, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.06187499314546585, "step": 140 }, { "completion_length": 454.15625, "epoch": 10.142857142857142, "grad_norm": 0.9019815325737, "kl": 0.0009011247602757066, "learning_rate": 4.737860078440209e-09, "loss": 0.0, "reward": 0.803930751979351, "reward_std": 0.998000368475914, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.4643057584762573, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.046875, "rewards/xmlcount_reward_func": 0.16774999350309372, "step": 142 }, { "completion_length": 474.40625, "epoch": 10.285714285714286, "grad_norm": 1.3242416381835938, "kl": 0.0008034449347178452, "learning_rate": 2.8697047413204778e-09, "loss": 0.0, "reward": 1.2339159920811653, "reward_std": 0.9663043133914471, "rewards/concensus_correctness_reward_func": 0.07400000095367432, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.4057597480714321, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.426031258655712, "step": 144 }, { "completion_length": 396.125, "epoch": 10.428571428571429, "grad_norm": 0.47938087582588196, "kl": 0.0007421185255225282, "learning_rate": 1.4655107114101007e-09, "loss": 0.0, "reward": 1.4692494682967663, "reward_std": 1.2490764074027538, "rewards/concensus_correctness_reward_func": 0.03968749940395355, "rewards/consensus_reward_func": 0.625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.5105619505047798, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.27837499789893627, "step": 146 }, { "completion_length": 566.46875, "epoch": 10.571428571428571, "grad_norm": 0.32477688789367676, "kl": 0.000911211685888702, "learning_rate": 5.279142162789018e-10, "loss": 0.0, "reward": 1.5270253717899323, "reward_std": 1.2376420386135578, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.5625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.7494628727436066, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21506249997764826, "step": 148 }, { "completion_length": 392.53125, "epoch": 10.714285714285714, "grad_norm": 0.8822419047355652, "kl": 0.0008073000899457838, "learning_rate": 5.86754953789681e-11, "loss": 0.0, "reward": 1.194279432296753, "reward_std": 0.8677986171096563, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.3125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.49518570490181446, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.015625, "rewards/xmlcount_reward_func": 0.3709687450900674, "step": 150 }, { "epoch": 10.714285714285714, "step": 150, "total_flos": 0.0, "train_loss": 8.959674520762444e-07, "train_runtime": 7092.4557, "train_samples_per_second": 0.338, "train_steps_per_second": 0.021 } ], "logging_steps": 2, "max_steps": 150, "num_input_tokens_seen": 0, "num_train_epochs": 11, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }