{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1459, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006856359273225917, "grad_norm": 1.033403298669248, "kl": 0.0, "learning_rate": 0.0, "logits/chosen": -214718931.0877193, "logits/rejected": -281195592.1126761, "logps/chosen": -219.08771929824562, "logps/rejected": -329.01408450704224, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0013712718546451835, "grad_norm": 0.9679499981949258, "kl": 0.0, "learning_rate": 2.0000000000000002e-07, "logits/chosen": -251658240.0, "logits/rejected": -213789666.74285713, "logps/chosen": -220.82758620689654, "logps/rejected": -298.0571428571429, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.002056907781967775, "grad_norm": 1.0611905337544343, "kl": 0.01953125, "learning_rate": 4.0000000000000003e-07, "logits/chosen": -289739857.26984125, "logits/rejected": -258627237.41538462, "logps/chosen": -351.4920634920635, "logps/rejected": -309.4153846153846, "loss": 0.4966, "rewards/chosen": 0.025731646825396824, "rewards/margins": 0.021476087931166055, "rewards/rejected": 0.004255558894230769, "step": 3 }, { "epoch": 0.002742543709290367, "grad_norm": 0.814155850009405, "kl": 0.0234375, "learning_rate": 6.000000000000001e-07, "logits/chosen": -235526301.53846154, "logits/rejected": -270490664.96, "logps/chosen": -226.05128205128204, "logps/rejected": -259.84, "loss": 0.4995, "rewards/chosen": -0.0018607897636217948, "rewards/margins": 0.0023579602363782053, "rewards/rejected": -0.00421875, "step": 4 }, { "epoch": 0.0034281796366129585, "grad_norm": 0.8856891656980501, "kl": 0.0078125, "learning_rate": 8.000000000000001e-07, "logits/chosen": -290624677.16129035, "logits/rejected": -273773661.09090906, "logps/chosen": -302.96774193548384, "logps/rejected": -306.90909090909093, "loss": 0.4986, "rewards/chosen": 0.00012600806451612903, "rewards/margins": 0.011489644428152493, "rewards/rejected": -0.011363636363636364, "step": 5 }, { "epoch": 0.00411381556393555, "grad_norm": 0.8263821241933255, "kl": 0.111328125, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -199193895.05084747, "logits/rejected": -204001220.63768116, "logps/chosen": -294.23728813559325, "logps/rejected": -262.95652173913044, "loss": 0.5046, "rewards/chosen": -0.03216035487288135, "rewards/margins": -0.04416216646708425, "rewards/rejected": 0.012001811594202898, "step": 6 }, { "epoch": 0.0047994514912581415, "grad_norm": 1.0320808207185073, "kl": 0.0634765625, "learning_rate": 1.2000000000000002e-06, "logits/chosen": -233257422.4516129, "logits/rejected": -194463185.45454547, "logps/chosen": -243.09677419354838, "logps/rejected": -264.969696969697, "loss": 0.5016, "rewards/chosen": -0.00394562752016129, "rewards/margins": -0.011092289451979473, "rewards/rejected": 0.007146661931818182, "step": 7 }, { "epoch": 0.005485087418580734, "grad_norm": 0.9585792678692967, "kl": 0.0703125, "learning_rate": 1.4000000000000001e-06, "logits/chosen": -209577681.83606556, "logits/rejected": -219856651.46268657, "logps/chosen": -246.55737704918033, "logps/rejected": -325.97014925373134, "loss": 0.4982, "rewards/chosen": 0.0014628425973360656, "rewards/margins": 0.01211211090423905, "rewards/rejected": -0.010649268306902986, "step": 8 }, { "epoch": 0.0061707233459033254, "grad_norm": 1.0402922956697396, "kl": 0.1875, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -271141458.58064514, "logits/rejected": -204376994.9090909, "logps/chosen": -305.80645161290323, "logps/rejected": -331.8787878787879, "loss": 0.4968, "rewards/chosen": 0.009507702242943549, "rewards/margins": 0.02356428368233749, "rewards/rejected": -0.01405658143939394, "step": 9 }, { "epoch": 0.006856359273225917, "grad_norm": 0.9013694403779456, "kl": 0.0, "learning_rate": 1.8000000000000001e-06, "logits/chosen": -244947353.6, "logits/rejected": -246723764.70588234, "logps/chosen": -249.33333333333334, "logps/rejected": -258.3529411764706, "loss": 0.4987, "rewards/chosen": -0.010009765625, "rewards/margins": 0.009097828584558824, "rewards/rejected": -0.019107594209558824, "step": 10 }, { "epoch": 0.0075419952005485085, "grad_norm": 1.0147372822846756, "kl": 0.01171875, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -392201803.5409836, "logits/rejected": -220107057.6716418, "logps/chosen": -281.7049180327869, "logps/rejected": -293.97014925373134, "loss": 0.4978, "rewards/chosen": 0.007660412397540984, "rewards/margins": 0.018869017807988744, "rewards/rejected": -0.01120860541044776, "step": 11 }, { "epoch": 0.0082276311278711, "grad_norm": 0.9468880789713233, "kl": 0.01171875, "learning_rate": 2.2e-06, "logits/chosen": -266401854.06060606, "logits/rejected": -261941049.80645162, "logps/chosen": -264.0, "logps/rejected": -233.29032258064515, "loss": 0.4973, "rewards/chosen": 0.0037943522135416665, "rewards/margins": 0.02359336935063844, "rewards/rejected": -0.019799017137096774, "step": 12 }, { "epoch": 0.008913267055193692, "grad_norm": 0.9092609168378706, "kl": 0.05859375, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -247397359.74603173, "logits/rejected": -220168696.12307692, "logps/chosen": -194.28571428571428, "logps/rejected": -276.9230769230769, "loss": 0.4979, "rewards/chosen": 0.013307601686507936, "rewards/margins": 0.015478575244200245, "rewards/rejected": -0.002170973557692308, "step": 13 }, { "epoch": 0.009598902982516283, "grad_norm": 1.0290018624350254, "kl": 0.072265625, "learning_rate": 2.6e-06, "logits/chosen": -203459288.94915253, "logits/rejected": -297856371.01449275, "logps/chosen": -292.06779661016947, "logps/rejected": -282.4347826086956, "loss": 0.4988, "rewards/chosen": -0.024364406779661018, "rewards/margins": 0.00332963194316507, "rewards/rejected": -0.027694038722826088, "step": 14 }, { "epoch": 0.010284538909838875, "grad_norm": 0.9711939016857876, "kl": 0.048828125, "learning_rate": 2.8000000000000003e-06, "logits/chosen": -209126525.75438598, "logits/rejected": -231336541.7464789, "logps/chosen": -276.2105263157895, "logps/rejected": -283.2676056338028, "loss": 0.4923, "rewards/chosen": 0.015436540570175438, "rewards/margins": 0.05460907578144305, "rewards/rejected": -0.03917253521126761, "step": 15 }, { "epoch": 0.010970174837161468, "grad_norm": 0.9252812286757661, "kl": 0.0, "learning_rate": 3e-06, "logits/chosen": -230051219.3939394, "logits/rejected": -264647052.38709676, "logps/chosen": -336.969696969697, "logps/rejected": -234.83870967741936, "loss": 0.494, "rewards/chosen": -0.00146484375, "rewards/margins": 0.04911164314516129, "rewards/rejected": -0.05057648689516129, "step": 16 }, { "epoch": 0.011655810764484058, "grad_norm": 0.9041067372714432, "kl": 0.01953125, "learning_rate": 3.2000000000000003e-06, "logits/chosen": -268959744.0, "logits/rejected": -270270464.0, "logps/chosen": -351.5, "logps/rejected": -297.0, "loss": 0.4948, "rewards/chosen": 0.00177001953125, "rewards/margins": 0.0440673828125, "rewards/rejected": -0.04229736328125, "step": 17 }, { "epoch": 0.012341446691806651, "grad_norm": 0.8682952091545603, "kl": 0.0517578125, "learning_rate": 3.4000000000000005e-06, "logits/chosen": -401127982.54545456, "logits/rejected": -240563629.41935483, "logps/chosen": -242.1818181818182, "logps/rejected": -277.6774193548387, "loss": 0.4862, "rewards/chosen": 0.05308948863636364, "rewards/margins": 27601226.375670135, "rewards/rejected": -27601226.322580647, "step": 18 }, { "epoch": 0.013027082619129242, "grad_norm": 0.8793645895833742, "kl": 0.03125, "learning_rate": 3.6000000000000003e-06, "logits/chosen": -268689656.24242425, "logits/rejected": -247599236.12903225, "logps/chosen": -297.2121212121212, "logps/rejected": -223.48387096774192, "loss": 0.4928, "rewards/chosen": -0.006747159090909091, "rewards/margins": 0.043404050586510264, "rewards/rejected": -0.05015120967741935, "step": 19 }, { "epoch": 0.013712718546451834, "grad_norm": 0.9497087602574376, "kl": 0.0, "learning_rate": 3.8000000000000005e-06, "logits/chosen": -161119126.06896552, "logits/rejected": -202764639.08571428, "logps/chosen": -232.55172413793105, "logps/rejected": -251.42857142857142, "loss": 0.4829, "rewards/chosen": 0.06381330818965517, "rewards/margins": 0.13969221443965518, "rewards/rejected": -0.07587890625, "step": 20 }, { "epoch": 0.014398354473774426, "grad_norm": 0.883166229775906, "kl": 0.0, "learning_rate": 4.000000000000001e-06, "logits/chosen": -213715920.73846152, "logits/rejected": -237011464.12698412, "logps/chosen": -304.73846153846154, "logps/rejected": -239.74603174603175, "loss": 0.4861, "rewards/chosen": 0.02241962139423077, "rewards/margins": 0.1053809309180403, "rewards/rejected": -0.08296130952380952, "step": 21 }, { "epoch": 0.015083990401097017, "grad_norm": 1.0055954092853805, "kl": 0.0, "learning_rate": 4.2000000000000004e-06, "logits/chosen": -250713044.73239437, "logits/rejected": -271084490.1052632, "logps/chosen": -296.3380281690141, "logps/rejected": -267.7894736842105, "loss": 0.4776, "rewards/chosen": 0.056007922535211266, "rewards/margins": 0.19060222078082528, "rewards/rejected": -0.13459429824561403, "step": 22 }, { "epoch": 0.01576962632841961, "grad_norm": 0.8909231986471797, "kl": 0.0, "learning_rate": 4.4e-06, "logits/chosen": -197205872.28070176, "logits/rejected": -181004893.7464789, "logps/chosen": -204.0701754385965, "logps/rejected": -239.54929577464787, "loss": 0.4771, "rewards/chosen": 0.02050352933114035, "rewards/margins": 0.16321919834522486, "rewards/rejected": -0.1427156690140845, "step": 23 }, { "epoch": 0.0164552622557422, "grad_norm": 0.9228328146763347, "kl": 0.0, "learning_rate": 4.600000000000001e-06, "logits/chosen": -269535601.3114754, "logits/rejected": -251908646.20895523, "logps/chosen": -286.42622950819674, "logps/rejected": -309.4925373134328, "loss": 0.4744, "rewards/chosen": 0.0469390368852459, "rewards/margins": 0.20423847718375335, "rewards/rejected": -0.15729944029850745, "step": 24 }, { "epoch": 0.017140898183064794, "grad_norm": 1.002687075332263, "kl": 0.0, "learning_rate": 4.800000000000001e-06, "logits/chosen": -311044258.53968257, "logits/rejected": -260175903.5076923, "logps/chosen": -302.22222222222223, "logps/rejected": -283.32307692307694, "loss": 0.4601, "rewards/chosen": 0.009486607142857142, "rewards/margins": 0.3224673763736264, "rewards/rejected": -0.31298076923076923, "step": 25 }, { "epoch": 0.017826534110387385, "grad_norm": 1.078300649850198, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -185700753.56862745, "logits/rejected": -279547638.02597404, "logps/chosen": -307.7647058823529, "logps/rejected": -247.6883116883117, "loss": 0.4536, "rewards/chosen": 0.04015395220588235, "rewards/margins": 0.19843317298510313, "rewards/rejected": -0.15827922077922077, "step": 26 }, { "epoch": 0.018512170037709975, "grad_norm": 1.0197527067339498, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239638741.97014925, "logits/rejected": -263759838.4262295, "logps/chosen": -246.44776119402985, "logps/rejected": -251.01639344262296, "loss": 0.4422, "rewards/chosen": 0.1117070895522388, "rewards/margins": 0.5061743026669929, "rewards/rejected": -0.3944672131147541, "step": 27 }, { "epoch": 0.019197805965032566, "grad_norm": 0.9487953690765935, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256575699.86206895, "logits/rejected": -255972381.25714287, "logps/chosen": -229.79310344827587, "logps/rejected": -293.7142857142857, "loss": 0.4504, "rewards/chosen": 0.04475350215517242, "rewards/margins": 0.39386064501231527, "rewards/rejected": -0.34910714285714284, "step": 28 }, { "epoch": 0.01988344189235516, "grad_norm": 0.7944472885358466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239914188.8, "logits/rejected": -205027448.47058824, "logps/chosen": -273.3333333333333, "logps/rejected": -252.0, "loss": 0.4505, "rewards/chosen": 0.05325520833333333, "rewards/margins": 0.3908011642156862, "rewards/rejected": -0.3375459558823529, "step": 29 }, { "epoch": 0.02056907781967775, "grad_norm": 0.7604314557767836, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -328636054.5882353, "logits/rejected": -293042039.46666664, "logps/chosen": -254.58823529411765, "logps/rejected": -269.8666666666667, "loss": 0.4419, "rewards/chosen": 0.05572150735294118, "rewards/margins": 0.5447840073529412, "rewards/rejected": -0.4890625, "step": 30 }, { "epoch": 0.02125471374700034, "grad_norm": 0.7754105754031038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -329049913.8064516, "logits/rejected": -289279875.8787879, "logps/chosen": -327.2258064516129, "logps/rejected": -286.54545454545456, "loss": 0.438, "rewards/chosen": -0.004032258064516129, "rewards/margins": 0.5447366813294233, "rewards/rejected": -0.5487689393939394, "step": 31 }, { "epoch": 0.021940349674322936, "grad_norm": 0.8285295606620912, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283814570.6666667, "logits/rejected": -279291301.64705884, "logps/chosen": -290.6666666666667, "logps/rejected": -275.29411764705884, "loss": 0.4272, "rewards/chosen": 0.0212890625, "rewards/margins": 0.6334214154411765, "rewards/rejected": -0.6121323529411765, "step": 32 }, { "epoch": 0.022625985601645526, "grad_norm": 0.8105399173931278, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241921462.85714287, "logits/rejected": -295232398.2222222, "logps/chosen": -260.57142857142856, "logps/rejected": -273.3333333333333, "loss": 0.4023, "rewards/chosen": 0.06105259486607143, "rewards/margins": 0.8622678726438492, "rewards/rejected": -0.8012152777777778, "step": 33 }, { "epoch": 0.023311621528968117, "grad_norm": 0.7886002173106584, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227137693.53846154, "logits/rejected": -269500676.06349206, "logps/chosen": -287.5076923076923, "logps/rejected": -299.6825396825397, "loss": 0.4109, "rewards/chosen": 0.04140625, "rewards/margins": 0.7279141865079365, "rewards/rejected": -0.6865079365079365, "step": 34 }, { "epoch": 0.02399725745629071, "grad_norm": 0.6631288720556409, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -213782403.87878788, "logits/rejected": -219456809.29032257, "logps/chosen": -279.030303030303, "logps/rejected": -292.1290322580645, "loss": 0.4078, "rewards/chosen": -0.043708570075757576, "rewards/margins": 1.0429849783113392, "rewards/rejected": -1.0866935483870968, "step": 35 }, { "epoch": 0.024682893383613302, "grad_norm": 0.5365964151900361, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276488519.68, "logits/rejected": -219528288.6037736, "logps/chosen": -257.06666666666666, "logps/rejected": -298.7169811320755, "loss": 0.3936, "rewards/chosen": 0.039114583333333335, "rewards/margins": 0.9447749606918239, "rewards/rejected": -0.9056603773584906, "step": 36 }, { "epoch": 0.025368529310935892, "grad_norm": 0.6708690141408862, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261141014.26086956, "logits/rejected": -248530284.47457626, "logps/chosen": -274.7826086956522, "logps/rejected": -321.89830508474574, "loss": 0.3775, "rewards/chosen": 0.03283514492753623, "rewards/margins": 1.5402503991648242, "rewards/rejected": -1.507415254237288, "step": 37 }, { "epoch": 0.026054165238258483, "grad_norm": 0.5183394160211979, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -230886448.76190478, "logits/rejected": -201326592.0, "logps/chosen": -240.0, "logps/rejected": -257.7230769230769, "loss": 0.3826, "rewards/chosen": -0.10689484126984126, "rewards/margins": 1.3882974664224665, "rewards/rejected": -1.4951923076923077, "step": 38 }, { "epoch": 0.026739801165581077, "grad_norm": 0.578122974926956, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -324534272.0, "logits/rejected": -329515008.0, "logps/chosen": -241.0, "logps/rejected": -309.5, "loss": 0.3847, "rewards/chosen": -0.1383056640625, "rewards/margins": 1.4046630859375, "rewards/rejected": -1.54296875, "step": 39 }, { "epoch": 0.027425437092903668, "grad_norm": 0.5537082288282981, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253755392.0, "logits/rejected": -264503296.0, "logps/chosen": -216.75, "logps/rejected": -307.0, "loss": 0.3759, "rewards/chosen": -0.029689788818359375, "rewards/margins": 1.5464820861816406, "rewards/rejected": -1.576171875, "step": 40 }, { "epoch": 0.02811107302022626, "grad_norm": 0.515406831830795, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -330231534.93333334, "logits/rejected": -427819008.0, "logps/chosen": -365.6, "logps/rejected": -301.6470588235294, "loss": 0.3445, "rewards/chosen": -0.10816243489583334, "rewards/margins": 2.0719846239276962, "rewards/rejected": -2.1801470588235294, "step": 41 }, { "epoch": 0.028796708947548853, "grad_norm": 0.4972257913832469, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254683639.60655737, "logits/rejected": -236884273.6716418, "logps/chosen": -244.45901639344262, "logps/rejected": -313.7910447761194, "loss": 0.3862, "rewards/chosen": -0.265625, "rewards/margins": 1.5739272388059702, "rewards/rejected": -1.8395522388059702, "step": 42 }, { "epoch": 0.029482344874871443, "grad_norm": 0.5540738735220132, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -212318561.10344827, "logits/rejected": -202524964.57142857, "logps/chosen": -284.13793103448273, "logps/rejected": -287.77142857142854, "loss": 0.3664, "rewards/chosen": -0.18661604256465517, "rewards/margins": 1.1919553860067733, "rewards/rejected": -1.3785714285714286, "step": 43 }, { "epoch": 0.030167980802194034, "grad_norm": 0.46930990597687866, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -332026516.6451613, "logits/rejected": -338340522.6666667, "logps/chosen": -264.258064516129, "logps/rejected": -298.42424242424244, "loss": 0.3755, "rewards/chosen": -0.3611391129032258, "rewards/margins": 2.176739674975562, "rewards/rejected": -2.537878787878788, "step": 44 }, { "epoch": 0.030853616729516628, "grad_norm": 0.7236450745203775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -302884672.85333335, "logits/rejected": -316234694.0377358, "logps/chosen": -254.08, "logps/rejected": -269.2830188679245, "loss": 0.4286, "rewards/chosen": -0.5183333333333333, "rewards/margins": 1.0972327044025156, "rewards/rejected": -1.615566037735849, "step": 45 }, { "epoch": 0.03153925265683922, "grad_norm": 0.6431044508840758, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -302764221.04615384, "logits/rejected": -256984340.31746033, "logps/chosen": -242.46153846153845, "logps/rejected": -378.92063492063494, "loss": 0.3548, "rewards/chosen": -0.2971153846153846, "rewards/margins": 2.651297313797314, "rewards/rejected": -2.9484126984126986, "step": 46 }, { "epoch": 0.03222488858416181, "grad_norm": 0.550277165842801, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243402784.5079365, "logits/rejected": -268177344.9846154, "logps/chosen": -263.1111111111111, "logps/rejected": -258.7076923076923, "loss": 0.3609, "rewards/chosen": -0.20647321428571427, "rewards/margins": 2.156988324175824, "rewards/rejected": -2.3634615384615385, "step": 47 }, { "epoch": 0.0329105245114844, "grad_norm": 0.5263697574413075, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245736869.6470588, "logits/rejected": -273748241.06666666, "logps/chosen": -192.94117647058823, "logps/rejected": -265.3333333333333, "loss": 0.4165, "rewards/chosen": -0.4025735294117647, "rewards/margins": 1.9474264705882354, "rewards/rejected": -2.35, "step": 48 }, { "epoch": 0.03359616043880699, "grad_norm": 0.5084833914633539, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -211812352.0, "logits/rejected": -227278848.0, "logps/chosen": -220.75, "logps/rejected": -278.75, "loss": 0.3745, "rewards/chosen": -0.327880859375, "rewards/margins": 1.963134765625, "rewards/rejected": -2.291015625, "step": 49 }, { "epoch": 0.03428179636612959, "grad_norm": 0.6359722951750122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228620868.7761194, "logits/rejected": -211090381.63934427, "logps/chosen": -338.6268656716418, "logps/rejected": -317.11475409836066, "loss": 0.3924, "rewards/chosen": -0.5237873134328358, "rewards/margins": 2.2098192439442137, "rewards/rejected": -2.7336065573770494, "step": 50 }, { "epoch": 0.03496743229345218, "grad_norm": 0.45273117649550604, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260317448.2580645, "logits/rejected": -273265260.6060606, "logps/chosen": -227.3548387096774, "logps/rejected": -322.90909090909093, "loss": 0.3758, "rewards/chosen": -0.34274193548387094, "rewards/margins": 2.536045943304008, "rewards/rejected": -2.878787878787879, "step": 51 }, { "epoch": 0.03565306822077477, "grad_norm": 0.5488559440711003, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268435456.0, "logits/rejected": -247929969.7777778, "logps/chosen": -323.42857142857144, "logps/rejected": -288.44444444444446, "loss": 0.3347, "rewards/chosen": -0.24637276785714285, "rewards/margins": 2.5904327876984126, "rewards/rejected": -2.8368055555555554, "step": 52 }, { "epoch": 0.03633870414809736, "grad_norm": 0.5361675226060483, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244794833.45454547, "logits/rejected": -251117039.48387095, "logps/chosen": -280.72727272727275, "logps/rejected": -270.4516129032258, "loss": 0.3698, "rewards/chosen": -0.20336174242424243, "rewards/margins": 1.901476967253177, "rewards/rejected": -2.1048387096774195, "step": 53 }, { "epoch": 0.03702434007541995, "grad_norm": 0.6178554997826525, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -306446336.0, "logits/rejected": -221249536.0, "logps/chosen": -249.25, "logps/rejected": -288.0, "loss": 0.3582, "rewards/chosen": -0.221435546875, "rewards/margins": 2.532470703125, "rewards/rejected": -2.75390625, "step": 54 }, { "epoch": 0.03770997600274254, "grad_norm": 0.5737791285900101, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288116420.9230769, "logits/rejected": -244814901.89473686, "logps/chosen": -265.84615384615387, "logps/rejected": -381.4736842105263, "loss": 0.3119, "rewards/chosen": -0.0694110576923077, "rewards/margins": 2.8483521002024292, "rewards/rejected": -2.9177631578947367, "step": 55 }, { "epoch": 0.03839561193006513, "grad_norm": 0.5804933768441619, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234055915.01639345, "logits/rejected": -232376961.91044775, "logps/chosen": -300.0655737704918, "logps/rejected": -312.8358208955224, "loss": 0.3665, "rewards/chosen": -0.3724385245901639, "rewards/margins": 2.217113714215806, "rewards/rejected": -2.58955223880597, "step": 56 }, { "epoch": 0.03908124785738773, "grad_norm": 0.6768037022976536, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280592858.89855075, "logits/rejected": -282653435.66101694, "logps/chosen": -282.6666666666667, "logps/rejected": -325.6949152542373, "loss": 0.3555, "rewards/chosen": -0.2689085144927536, "rewards/margins": 2.8709219939818227, "rewards/rejected": -3.139830508474576, "step": 57 }, { "epoch": 0.03976688378471032, "grad_norm": 0.5873214566160008, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240981829.8181818, "logits/rejected": -257882045.93548387, "logps/chosen": -310.54545454545456, "logps/rejected": -308.9032258064516, "loss": 0.3772, "rewards/chosen": -0.19688091856060605, "rewards/margins": 2.0107803717619745, "rewards/rejected": -2.2076612903225805, "step": 58 }, { "epoch": 0.04045251971203291, "grad_norm": 0.5644540366529803, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -192683783.75757575, "logits/rejected": -326614511.483871, "logps/chosen": -274.6666666666667, "logps/rejected": -355.61290322580646, "loss": 0.3763, "rewards/chosen": -0.22774621212121213, "rewards/margins": 2.518221529814272, "rewards/rejected": -2.745967741935484, "step": 59 }, { "epoch": 0.0411381556393555, "grad_norm": 0.4979063397065147, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259425469.62962964, "logits/rejected": -266168264.64864865, "logps/chosen": -338.22222222222223, "logps/rejected": -293.18918918918916, "loss": 0.331, "rewards/chosen": 0.034577546296296294, "rewards/margins": 2.162955924674675, "rewards/rejected": -2.1283783783783785, "step": 60 }, { "epoch": 0.04182379156667809, "grad_norm": 0.49892129500995896, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264840338.2857143, "logits/rejected": -214375537.7777778, "logps/chosen": -211.71428571428572, "logps/rejected": -313.3333333333333, "loss": 0.3281, "rewards/chosen": -0.023716517857142856, "rewards/margins": 1.771422371031746, "rewards/rejected": -1.7951388888888888, "step": 61 }, { "epoch": 0.04250942749400068, "grad_norm": 0.5449166438822667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248152553.07462686, "logits/rejected": -317391922.3606557, "logps/chosen": -336.7164179104478, "logps/rejected": -264.91803278688525, "loss": 0.3801, "rewards/chosen": -0.10657649253731344, "rewards/margins": 2.1249808845118667, "rewards/rejected": -2.2315573770491803, "step": 62 }, { "epoch": 0.04319506342132328, "grad_norm": 0.5454336220573117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246149303.40298507, "logits/rejected": -269810637.6393443, "logps/chosen": -284.8955223880597, "logps/rejected": -300.59016393442624, "loss": 0.3495, "rewards/chosen": -0.07596781716417911, "rewards/margins": 2.4978026746390993, "rewards/rejected": -2.5737704918032787, "step": 63 }, { "epoch": 0.04388069934864587, "grad_norm": 0.5426771277848758, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243957222.81967214, "logits/rejected": -241141179.2238806, "logps/chosen": -246.0327868852459, "logps/rejected": -328.1194029850746, "loss": 0.3302, "rewards/chosen": -0.04802766393442623, "rewards/margins": 2.6515992017372154, "rewards/rejected": -2.699626865671642, "step": 64 }, { "epoch": 0.04456633527596846, "grad_norm": 0.5441173146090078, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -298472084.6451613, "logits/rejected": -297922684.1212121, "logps/chosen": -291.35483870967744, "logps/rejected": -330.7878787878788, "loss": 0.3298, "rewards/chosen": 0.0715725806451613, "rewards/margins": 2.6776331867057674, "rewards/rejected": -2.606060606060606, "step": 65 }, { "epoch": 0.04525197120329105, "grad_norm": 0.517901132395746, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -229288618.66666666, "logits/rejected": -277906465.0322581, "logps/chosen": -371.3939393939394, "logps/rejected": -265.80645161290323, "loss": 0.354, "rewards/chosen": 0.1533203125, "rewards/margins": 2.3327557963709675, "rewards/rejected": -2.1794354838709675, "step": 66 }, { "epoch": 0.04593760713061364, "grad_norm": 0.6307658185622965, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -354257368.61538464, "logits/rejected": -245115125.76, "logps/chosen": -252.4102564102564, "logps/rejected": -354.24, "loss": 0.3734, "rewards/chosen": -0.03617037259615385, "rewards/margins": 2.863829627403846, "rewards/rejected": -2.9, "step": 67 }, { "epoch": 0.046623243057936234, "grad_norm": 0.5145484584125917, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277635864.7741935, "logits/rejected": -248353636.84848484, "logps/chosen": -197.41935483870967, "logps/rejected": -275.3939393939394, "loss": 0.3608, "rewards/chosen": -0.04909195438508065, "rewards/margins": 2.19901410622098, "rewards/rejected": -2.2481060606060606, "step": 68 }, { "epoch": 0.047308878985258825, "grad_norm": 0.6165626164965914, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259401570.46153846, "logits/rejected": -267103930.92063493, "logps/chosen": -333.2923076923077, "logps/rejected": -324.06349206349205, "loss": 0.3188, "rewards/chosen": 0.25614483173076924, "rewards/margins": 2.5339226095085468, "rewards/rejected": -2.2777777777777777, "step": 69 }, { "epoch": 0.04799451491258142, "grad_norm": 0.4889451042263892, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -209453056.0, "logits/rejected": -228851712.0, "logps/chosen": -225.5, "logps/rejected": -396.5, "loss": 0.3291, "rewards/chosen": 0.1665802001953125, "rewards/margins": 2.5240020751953125, "rewards/rejected": -2.357421875, "step": 70 }, { "epoch": 0.04868015083990401, "grad_norm": 0.49188791242551594, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -331287414.4477612, "logits/rejected": -289613253.24590164, "logps/chosen": -300.4179104477612, "logps/rejected": -350.95081967213116, "loss": 0.3582, "rewards/chosen": -0.12196828358208955, "rewards/margins": 2.4026218803523367, "rewards/rejected": -2.5245901639344264, "step": 71 }, { "epoch": 0.049365786767226603, "grad_norm": 0.4696698123076489, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294387712.0, "logits/rejected": -287047680.0, "logps/chosen": -277.5, "logps/rejected": -352.5, "loss": 0.3452, "rewards/chosen": 0.021728515625, "rewards/margins": 1.756103515625, "rewards/rejected": -1.734375, "step": 72 }, { "epoch": 0.050051422694549194, "grad_norm": 0.5813311636049598, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221264732.7536232, "logits/rejected": -221942662.5084746, "logps/chosen": -274.0869565217391, "logps/rejected": -277.4237288135593, "loss": 0.3514, "rewards/chosen": 0.03920403079710145, "rewards/margins": 2.4841192850343896, "rewards/rejected": -2.444915254237288, "step": 73 }, { "epoch": 0.050737058621871785, "grad_norm": 1.0555712886860558, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267649024.0, "logits/rejected": -270008320.0, "logps/chosen": -284.0, "logps/rejected": -350.5, "loss": 0.3267, "rewards/chosen": 0.23779296875, "rewards/margins": 2.20263671875, "rewards/rejected": -1.96484375, "step": 74 }, { "epoch": 0.051422694549194375, "grad_norm": 0.531814417060953, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286245116.06153846, "logits/rejected": -305451853.2063492, "logps/chosen": -247.5076923076923, "logps/rejected": -333.46031746031747, "loss": 0.3268, "rewards/chosen": 0.20078125, "rewards/margins": 2.5976066468253967, "rewards/rejected": -2.3968253968253967, "step": 75 }, { "epoch": 0.052108330476516966, "grad_norm": 0.49493620231842267, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261114489.01818183, "logits/rejected": -218793282.630137, "logps/chosen": -203.63636363636363, "logps/rejected": -288.0, "loss": 0.3145, "rewards/chosen": 0.10404829545454546, "rewards/margins": 2.4670619940846823, "rewards/rejected": -2.363013698630137, "step": 76 }, { "epoch": 0.052793966403839564, "grad_norm": 0.44780909742158115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289922065.9649123, "logits/rejected": -266308766.64788732, "logps/chosen": -258.94736842105266, "logps/rejected": -286.6478873239437, "loss": 0.3, "rewards/chosen": 0.3456688596491228, "rewards/margins": 2.782288577958982, "rewards/rejected": -2.436619718309859, "step": 77 }, { "epoch": 0.053479602331162154, "grad_norm": 0.4766647413117899, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265814016.0, "logits/rejected": -277086208.0, "logps/chosen": -343.0, "logps/rejected": -308.75, "loss": 0.3232, "rewards/chosen": 0.13623046875, "rewards/margins": 2.66162109375, "rewards/rejected": -2.525390625, "step": 78 }, { "epoch": 0.054165238258484745, "grad_norm": 0.4998446114256767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -316953350.9189189, "logits/rejected": -292047834.0740741, "logps/chosen": -290.3783783783784, "logps/rejected": -353.6296296296296, "loss": 0.3476, "rewards/chosen": 0.06397804054054054, "rewards/margins": 2.7711539664664664, "rewards/rejected": -2.707175925925926, "step": 79 }, { "epoch": 0.054850874185807336, "grad_norm": 0.5190080764783026, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274291654.0377358, "logits/rejected": -262395658.24, "logps/chosen": -281.9622641509434, "logps/rejected": -293.3333333333333, "loss": 0.2981, "rewards/chosen": 0.28537735849056606, "rewards/margins": 2.672044025157233, "rewards/rejected": -2.3866666666666667, "step": 80 }, { "epoch": 0.055536510113129926, "grad_norm": 0.5142618890871266, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258521646.54545453, "logits/rejected": -218780308.6451613, "logps/chosen": -264.4848484848485, "logps/rejected": -315.8709677419355, "loss": 0.3314, "rewards/chosen": 0.13707386363636365, "rewards/margins": 2.7096545087976542, "rewards/rejected": -2.5725806451612905, "step": 81 }, { "epoch": 0.05622214604045252, "grad_norm": 0.9661618674743873, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -318767104.0, "logits/rejected": -327777090.3703704, "logps/chosen": -254.05405405405406, "logps/rejected": -349.9259259259259, "loss": 0.338, "rewards/chosen": 0.18976984797297297, "rewards/margins": 2.2545846627877877, "rewards/rejected": -2.064814814814815, "step": 82 }, { "epoch": 0.056907781967775115, "grad_norm": 0.5755875817727906, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256784611.55555555, "logits/rejected": -246864749.7142857, "logps/chosen": -311.1111111111111, "logps/rejected": -316.85714285714283, "loss": 0.3157, "rewards/chosen": 0.4894883897569444, "rewards/margins": 3.007345532614087, "rewards/rejected": -2.517857142857143, "step": 83 }, { "epoch": 0.057593417895097705, "grad_norm": 0.47504593198360706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -323688420.6933333, "logits/rejected": -332062256.3018868, "logps/chosen": -315.73333333333335, "logps/rejected": -274.7169811320755, "loss": 0.3472, "rewards/chosen": 0.3720833333333333, "rewards/margins": 2.5584040880503145, "rewards/rejected": -2.186320754716981, "step": 84 }, { "epoch": 0.058279053822420296, "grad_norm": 0.47041551586911107, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244426681.37931034, "logits/rejected": -231046231.77142859, "logps/chosen": -242.48275862068965, "logps/rejected": -321.37142857142857, "loss": 0.2871, "rewards/chosen": 0.33997844827586204, "rewards/margins": 2.357835591133005, "rewards/rejected": -2.017857142857143, "step": 85 }, { "epoch": 0.058964689749742887, "grad_norm": 0.46625718145428025, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267258107.50877193, "logits/rejected": -341215491.6056338, "logps/chosen": -258.6666666666667, "logps/rejected": -284.61971830985914, "loss": 0.3064, "rewards/chosen": 0.23684210526315788, "rewards/margins": 2.793180133432172, "rewards/rejected": -2.556338028169014, "step": 86 }, { "epoch": 0.05965032567706548, "grad_norm": 0.4679554459096769, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252387684.17391303, "logits/rejected": -217819448.40677965, "logps/chosen": -221.2173913043478, "logps/rejected": -307.52542372881356, "loss": 0.3452, "rewards/chosen": 0.1930480072463768, "rewards/margins": 2.9875395326701057, "rewards/rejected": -2.794491525423729, "step": 87 }, { "epoch": 0.06033596160438807, "grad_norm": 0.6490340168616623, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -229215990.02597404, "logits/rejected": -182246620.8627451, "logps/chosen": -271.16883116883116, "logps/rejected": -309.6470588235294, "loss": 0.3416, "rewards/chosen": 0.1750202922077922, "rewards/margins": 1.2191379392666157, "rewards/rejected": -1.0441176470588236, "step": 88 }, { "epoch": 0.06102159753171066, "grad_norm": 0.49644945153013725, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286037780.9836066, "logits/rejected": -286464703.04477614, "logps/chosen": -301.11475409836066, "logps/rejected": -294.92537313432837, "loss": 0.3199, "rewards/chosen": 0.0070600665983606556, "rewards/margins": 2.6190003651058236, "rewards/rejected": -2.611940298507463, "step": 89 }, { "epoch": 0.061707233459033256, "grad_norm": 0.4054438685280132, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264503296.0, "logits/rejected": -288096256.0, "logps/chosen": -307.0, "logps/rejected": -300.0, "loss": 0.284, "rewards/chosen": 0.37109375, "rewards/margins": 3.13671875, "rewards/rejected": -2.765625, "step": 90 }, { "epoch": 0.06239286938635585, "grad_norm": 0.5529542066558714, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -229874919.22580644, "logits/rejected": -235643624.72727272, "logps/chosen": -275.8709677419355, "logps/rejected": -315.1515151515151, "loss": 0.2924, "rewards/chosen": 0.4183467741935484, "rewards/margins": 2.5736498044965788, "rewards/rejected": -2.1553030303030303, "step": 91 }, { "epoch": 0.06307850531367844, "grad_norm": 0.4613391235814498, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -202343392.96969697, "logits/rejected": -207279797.67741936, "logps/chosen": -243.3939393939394, "logps/rejected": -274.06451612903226, "loss": 0.3268, "rewards/chosen": 0.04249526515151515, "rewards/margins": 2.6513662328934506, "rewards/rejected": -2.6088709677419355, "step": 92 }, { "epoch": 0.06376414124100103, "grad_norm": 0.549035447376032, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -305135616.0, "logits/rejected": -270794752.0, "logps/chosen": -280.0, "logps/rejected": -319.25, "loss": 0.3033, "rewards/chosen": 0.221435546875, "rewards/margins": 2.844482421875, "rewards/rejected": -2.623046875, "step": 93 }, { "epoch": 0.06444977716832362, "grad_norm": 0.5714622776639201, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -225624628.96551725, "logits/rejected": -287609417.14285713, "logps/chosen": -310.8965517241379, "logps/rejected": -325.9428571428571, "loss": 0.2773, "rewards/chosen": 0.5074757543103449, "rewards/margins": 2.7431900400246305, "rewards/rejected": -2.2357142857142858, "step": 94 }, { "epoch": 0.06513541309564622, "grad_norm": 0.4408906361228316, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288056267.9322034, "logits/rejected": -307825441.3913044, "logps/chosen": -296.135593220339, "logps/rejected": -304.69565217391306, "loss": 0.2921, "rewards/chosen": 0.4495497881355932, "rewards/margins": 3.127086020019651, "rewards/rejected": -2.677536231884058, "step": 95 }, { "epoch": 0.0658210490229688, "grad_norm": 0.5294652124818812, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -303696045.55932206, "logits/rejected": -234394727.88405797, "logps/chosen": -302.64406779661016, "logps/rejected": -332.5217391304348, "loss": 0.2942, "rewards/chosen": 0.2816472457627119, "rewards/margins": 2.6548356515598135, "rewards/rejected": -2.3731884057971016, "step": 96 }, { "epoch": 0.0665066849502914, "grad_norm": 0.45757442865071685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235853616.23188406, "logits/rejected": -323885576.6779661, "logps/chosen": -299.82608695652175, "logps/rejected": -280.135593220339, "loss": 0.2993, "rewards/chosen": 0.5783514492753623, "rewards/margins": 3.260554839105871, "rewards/rejected": -2.6822033898305087, "step": 97 }, { "epoch": 0.06719232087761398, "grad_norm": 0.5318871286184247, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244794833.45454547, "logits/rejected": -246787435.3548387, "logps/chosen": -216.4848484848485, "logps/rejected": -318.4516129032258, "loss": 0.3278, "rewards/chosen": 0.018584280303030304, "rewards/margins": 2.8613262157869013, "rewards/rejected": -2.842741935483871, "step": 98 }, { "epoch": 0.06787795680493658, "grad_norm": 0.5302147483862296, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277317511.5294118, "logits/rejected": -335264699.73333335, "logps/chosen": -312.94117647058823, "logps/rejected": -379.2, "loss": 0.3133, "rewards/chosen": 0.13166360294117646, "rewards/margins": 3.3983302696078432, "rewards/rejected": -3.2666666666666666, "step": 99 }, { "epoch": 0.06856359273225918, "grad_norm": 0.4535696316096024, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281297988.26666665, "logits/rejected": -237101537.88235295, "logps/chosen": -234.26666666666668, "logps/rejected": -268.70588235294116, "loss": 0.3333, "rewards/chosen": -0.040690104166666664, "rewards/margins": 2.4482804840686274, "rewards/rejected": -2.488970588235294, "step": 100 }, { "epoch": 0.06924922865958176, "grad_norm": 0.5385707230757211, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240153863.31428573, "logits/rejected": -223889054.89655173, "logps/chosen": -265.14285714285717, "logps/rejected": -407.17241379310343, "loss": 0.313, "rewards/chosen": 0.11785714285714285, "rewards/margins": 3.4023399014778324, "rewards/rejected": -3.2844827586206895, "step": 101 }, { "epoch": 0.06993486458690436, "grad_norm": 0.46920911059980147, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264105851.87096775, "logits/rejected": -266910254.54545453, "logps/chosen": -321.80645161290323, "logps/rejected": -317.09090909090907, "loss": 0.2736, "rewards/chosen": 0.3911290322580645, "rewards/margins": 3.6297653958944283, "rewards/rejected": -3.2386363636363638, "step": 102 }, { "epoch": 0.07062050051422694, "grad_norm": 0.5114522703564416, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273339565.2923077, "logits/rejected": -260446305.52380952, "logps/chosen": -272.0, "logps/rejected": -333.7142857142857, "loss": 0.3127, "rewards/chosen": 0.15889423076923076, "rewards/margins": 2.150957722832723, "rewards/rejected": -1.992063492063492, "step": 103 }, { "epoch": 0.07130613644154954, "grad_norm": 0.49875522792990157, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254663114.50746268, "logits/rejected": -344620518.8196721, "logps/chosen": -286.089552238806, "logps/rejected": -328.39344262295083, "loss": 0.2818, "rewards/chosen": 0.6007462686567164, "rewards/margins": 3.895828235869831, "rewards/rejected": -3.2950819672131146, "step": 104 }, { "epoch": 0.07199177236887212, "grad_norm": 0.5005175304122931, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245402328.94915253, "logits/rejected": -293479705.9710145, "logps/chosen": -266.8474576271187, "logps/rejected": -339.4782608695652, "loss": 0.2618, "rewards/chosen": 0.3098516949152542, "rewards/margins": 3.5272429992630805, "rewards/rejected": -3.217391304347826, "step": 105 }, { "epoch": 0.07267740829619472, "grad_norm": 0.45128525681911086, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232244604.34285715, "logits/rejected": -234013236.96551725, "logps/chosen": -274.74285714285713, "logps/rejected": -294.3448275862069, "loss": 0.2988, "rewards/chosen": 0.25089285714285714, "rewards/margins": 3.5289100985221675, "rewards/rejected": -3.2780172413793105, "step": 106 }, { "epoch": 0.07336304422351732, "grad_norm": 0.43402828793714415, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278702110.5671642, "logits/rejected": -267610347.01639345, "logps/chosen": -286.2089552238806, "logps/rejected": -291.672131147541, "loss": 0.2897, "rewards/chosen": 0.24696828358208955, "rewards/margins": 3.7469682835820897, "rewards/rejected": -3.5, "step": 107 }, { "epoch": 0.0740486801508399, "grad_norm": 0.5302694711833661, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258182712.8888889, "logits/rejected": -283398918.9189189, "logps/chosen": -298.6666666666667, "logps/rejected": -333.4054054054054, "loss": 0.2442, "rewards/chosen": 0.6255787037037037, "rewards/margins": 3.9634165415415414, "rewards/rejected": -3.3378378378378377, "step": 108 }, { "epoch": 0.0747343160781625, "grad_norm": 0.4630644789272658, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235706132.98360655, "logits/rejected": -316263041.9104478, "logps/chosen": -267.27868852459017, "logps/rejected": -334.32835820895525, "loss": 0.2769, "rewards/chosen": 0.5075563524590164, "rewards/margins": 3.9627802330560313, "rewards/rejected": -3.455223880597015, "step": 109 }, { "epoch": 0.07541995200548508, "grad_norm": 0.47019754693520477, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227025026.03174603, "logits/rejected": -251916351.0153846, "logps/chosen": -289.9047619047619, "logps/rejected": -287.0153846153846, "loss": 0.275, "rewards/chosen": 0.3665674603174603, "rewards/margins": 2.6665674603174603, "rewards/rejected": -2.3, "step": 110 }, { "epoch": 0.07610558793280768, "grad_norm": 0.48986422340401253, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264170062.1016949, "logits/rejected": -247524723.01449275, "logps/chosen": -184.94915254237287, "logps/rejected": -346.8985507246377, "loss": 0.2679, "rewards/chosen": 0.12294756355932203, "rewards/margins": 3.7497591577622207, "rewards/rejected": -3.6268115942028984, "step": 111 }, { "epoch": 0.07679122386013026, "grad_norm": 0.6037554058731764, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -224327613.93548387, "logits/rejected": -277078264.24242425, "logps/chosen": -322.83870967741933, "logps/rejected": -329.6969696969697, "loss": 0.2862, "rewards/chosen": 0.11061145413306452, "rewards/margins": 3.2659144844360948, "rewards/rejected": -3.1553030303030303, "step": 112 }, { "epoch": 0.07747685978745286, "grad_norm": 0.5334767917877146, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235706132.98360655, "logits/rejected": -254162302.08955225, "logps/chosen": -233.8360655737705, "logps/rejected": -312.1194029850746, "loss": 0.2561, "rewards/chosen": 0.40817110655737704, "rewards/margins": 4.236529315512601, "rewards/rejected": -3.828358208955224, "step": 113 }, { "epoch": 0.07816249571477546, "grad_norm": 0.6412758042035153, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248350056.56338027, "logits/rejected": -292865437.19298244, "logps/chosen": -324.7323943661972, "logps/rejected": -333.1929824561403, "loss": 0.3081, "rewards/chosen": 0.22139084507042253, "rewards/margins": 2.870513652087966, "rewards/rejected": -2.6491228070175437, "step": 114 }, { "epoch": 0.07884813164209804, "grad_norm": 0.38571293293955766, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264675045.5172414, "logits/rejected": -265799036.34285715, "logps/chosen": -265.6551724137931, "logps/rejected": -335.54285714285714, "loss": 0.2322, "rewards/chosen": 0.7322198275862069, "rewards/margins": 4.642934113300493, "rewards/rejected": -3.9107142857142856, "step": 115 }, { "epoch": 0.07953376756942064, "grad_norm": 0.5137684900259905, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253755392.0, "logits/rejected": -294999381.3333333, "logps/chosen": -317.14285714285717, "logps/rejected": -346.6666666666667, "loss": 0.2571, "rewards/chosen": 0.26450892857142855, "rewards/margins": 3.9832589285714284, "rewards/rejected": -3.71875, "step": 116 }, { "epoch": 0.08021940349674322, "grad_norm": 0.5510843750375651, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265854345.84615386, "logits/rejected": -304120328.1269841, "logps/chosen": -293.66153846153844, "logps/rejected": -302.22222222222223, "loss": 0.2912, "rewards/chosen": 0.03293269230769231, "rewards/margins": 2.9019803113553113, "rewards/rejected": -2.869047619047619, "step": 117 }, { "epoch": 0.08090503942406582, "grad_norm": 0.5326321789582045, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251658240.0, "logits/rejected": -240301355.32307693, "logps/chosen": -242.03174603174602, "logps/rejected": -290.2153846153846, "loss": 0.2729, "rewards/chosen": 0.1466393849206349, "rewards/margins": 4.035100923382173, "rewards/rejected": -3.8884615384615384, "step": 118 }, { "epoch": 0.0815906753513884, "grad_norm": 0.4537377668403987, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234881024.0, "logits/rejected": -260466278.4, "logps/chosen": -253.88235294117646, "logps/rejected": -316.0, "loss": 0.2841, "rewards/chosen": 0.34329044117647056, "rewards/margins": 3.7682904411764704, "rewards/rejected": -3.425, "step": 119 }, { "epoch": 0.082276311278711, "grad_norm": 0.5174316364453037, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235156060.32786885, "logits/rejected": -315511823.2835821, "logps/chosen": -263.60655737704917, "logps/rejected": -355.34328358208955, "loss": 0.2658, "rewards/chosen": 0.3130122950819672, "rewards/margins": 3.182415280156594, "rewards/rejected": -2.8694029850746268, "step": 120 }, { "epoch": 0.0829619472060336, "grad_norm": 0.5381285599110599, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -231375337.07462686, "logits/rejected": -261284511.47540984, "logps/chosen": -295.64179104477614, "logps/rejected": -278.0327868852459, "loss": 0.288, "rewards/chosen": 0.3451492537313433, "rewards/margins": 4.250886958649376, "rewards/rejected": -3.9057377049180326, "step": 121 }, { "epoch": 0.08364758313335618, "grad_norm": 0.42648945674623934, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248196274.7936508, "logits/rejected": -291407336.36923075, "logps/chosen": -305.26984126984127, "logps/rejected": -331.0769230769231, "loss": 0.2697, "rewards/chosen": 0.46130952380952384, "rewards/margins": 4.07669413919414, "rewards/rejected": -3.6153846153846154, "step": 122 }, { "epoch": 0.08433321906067878, "grad_norm": 0.4510846162890721, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -331230178.74285716, "logits/rejected": -272485128.82758623, "logps/chosen": -224.45714285714286, "logps/rejected": -384.2758620689655, "loss": 0.2719, "rewards/chosen": 0.3620535714285714, "rewards/margins": 4.4310190886699505, "rewards/rejected": -4.068965517241379, "step": 123 }, { "epoch": 0.08501885498800137, "grad_norm": 0.5429355339838494, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271279051.9322034, "logits/rejected": -259438977.85507247, "logps/chosen": -236.47457627118644, "logps/rejected": -304.92753623188406, "loss": 0.2533, "rewards/chosen": 0.4777542372881356, "rewards/margins": 3.7676093097519034, "rewards/rejected": -3.289855072463768, "step": 124 }, { "epoch": 0.08570449091532396, "grad_norm": 0.38044077466023524, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -310902784.0, "logits/rejected": -278659072.0, "logps/chosen": -252.75, "logps/rejected": -360.0, "loss": 0.247, "rewards/chosen": 0.76611328125, "rewards/margins": 4.67626953125, "rewards/rejected": -3.91015625, "step": 125 }, { "epoch": 0.08639012684264656, "grad_norm": 0.49222703907474136, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253424262.7368421, "logits/rejected": -271743639.4366197, "logps/chosen": -225.12280701754386, "logps/rejected": -285.7464788732394, "loss": 0.2484, "rewards/chosen": 0.43530701754385964, "rewards/margins": 4.2803774400790715, "rewards/rejected": -3.8450704225352115, "step": 126 }, { "epoch": 0.08707576276996914, "grad_norm": 0.39108040041492903, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -296677102.93333334, "logits/rejected": -213662780.2352941, "logps/chosen": -231.33333333333334, "logps/rejected": -305.88235294117646, "loss": 0.2445, "rewards/chosen": 0.43046875, "rewards/margins": 4.470909926470588, "rewards/rejected": -4.040441176470588, "step": 127 }, { "epoch": 0.08776139869729174, "grad_norm": 0.43784618146929866, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -350457400.8888889, "logits/rejected": -308700774.4, "logps/chosen": -222.22222222222223, "logps/rejected": -312.3692307692308, "loss": 0.2797, "rewards/chosen": 0.3013392857142857, "rewards/margins": 3.2590315934065934, "rewards/rejected": -2.957692307692308, "step": 128 }, { "epoch": 0.08844703462461433, "grad_norm": 0.43835527386731876, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -213350263.46666667, "logits/rejected": -321481065.4117647, "logps/chosen": -283.3333333333333, "logps/rejected": -296.47058823529414, "loss": 0.2404, "rewards/chosen": 0.6046875, "rewards/margins": 4.2554227941176475, "rewards/rejected": -3.650735294117647, "step": 129 }, { "epoch": 0.08913267055193692, "grad_norm": 0.462796803616905, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232907233.88235295, "logits/rejected": -262423620.26666668, "logps/chosen": -239.76470588235293, "logps/rejected": -357.3333333333333, "loss": 0.2604, "rewards/chosen": 0.4189453125, "rewards/margins": 4.564778645833333, "rewards/rejected": -4.145833333333333, "step": 130 }, { "epoch": 0.08981830647925951, "grad_norm": 0.44466683759149506, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304570998.15384614, "logits/rejected": -301723582.984127, "logps/chosen": -265.6, "logps/rejected": -308.06349206349205, "loss": 0.2707, "rewards/chosen": 0.3245192307692308, "rewards/margins": 3.546741452991453, "rewards/rejected": -3.2222222222222223, "step": 131 }, { "epoch": 0.0905039424065821, "grad_norm": 0.43725796059685385, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -207880192.0, "logits/rejected": -218365952.0, "logps/chosen": -336.0, "logps/rejected": -297.5, "loss": 0.2426, "rewards/chosen": 0.6669921875, "rewards/margins": 4.5341796875, "rewards/rejected": -3.8671875, "step": 132 }, { "epoch": 0.0911895783339047, "grad_norm": 1.3049116401748613, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232037428.06779662, "logits/rejected": -265760827.36231884, "logps/chosen": -275.79661016949154, "logps/rejected": -308.40579710144925, "loss": 0.2361, "rewards/chosen": 0.5948093220338984, "rewards/margins": 4.420896278555638, "rewards/rejected": -3.8260869565217392, "step": 133 }, { "epoch": 0.09187521426122729, "grad_norm": 0.4586091101261962, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237134679.88059703, "logits/rejected": -306940541.90163934, "logps/chosen": -293.4925373134328, "logps/rejected": -242.88524590163934, "loss": 0.2677, "rewards/chosen": 0.59765625, "rewards/margins": 3.2533939549180326, "rewards/rejected": -2.6557377049180326, "step": 134 }, { "epoch": 0.09256085018854988, "grad_norm": 0.423307617990971, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238076684.19047618, "logits/rejected": -291407336.36923075, "logps/chosen": -335.74603174603175, "logps/rejected": -315.0769230769231, "loss": 0.237, "rewards/chosen": 0.6282242063492064, "rewards/margins": 4.658993437118437, "rewards/rejected": -4.030769230769231, "step": 135 }, { "epoch": 0.09324648611587247, "grad_norm": 0.44387577022220764, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227278848.0, "logits/rejected": -329252864.0, "logps/chosen": -304.5, "logps/rejected": -294.75, "loss": 0.2418, "rewards/chosen": 0.61181640625, "rewards/margins": 4.89306640625, "rewards/rejected": -4.28125, "step": 136 }, { "epoch": 0.09393212204319507, "grad_norm": 0.3677826304267846, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292028416.0, "logits/rejected": -285474816.0, "logps/chosen": -249.5, "logps/rejected": -375.0, "loss": 0.2423, "rewards/chosen": 0.306640625, "rewards/margins": 3.822265625, "rewards/rejected": -3.515625, "step": 137 }, { "epoch": 0.09461775797051765, "grad_norm": 0.35442547716160117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -309539635.2, "logits/rejected": -255235734.5882353, "logps/chosen": -271.46666666666664, "logps/rejected": -311.05882352941177, "loss": 0.2079, "rewards/chosen": 1.09375, "rewards/margins": 5.068014705882353, "rewards/rejected": -3.974264705882353, "step": 138 }, { "epoch": 0.09530339389784025, "grad_norm": 0.402749593544724, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265680987.70149255, "logits/rejected": -322755130.75409836, "logps/chosen": -234.26865671641792, "logps/rejected": -347.0163934426229, "loss": 0.2506, "rewards/chosen": 0.42467350746268656, "rewards/margins": 5.076312851724982, "rewards/rejected": -4.651639344262295, "step": 139 }, { "epoch": 0.09598902982516284, "grad_norm": 0.5980692355727258, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220264510.06060606, "logits/rejected": -230822020.12903225, "logps/chosen": -282.1818181818182, "logps/rejected": -282.5806451612903, "loss": 0.2679, "rewards/chosen": 0.5236742424242424, "rewards/margins": 3.301900048875855, "rewards/rejected": -2.778225806451613, "step": 140 }, { "epoch": 0.09667466575248543, "grad_norm": 0.4597379444412949, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252511318.77966103, "logits/rejected": -252387684.17391303, "logps/chosen": -237.01694915254237, "logps/rejected": -326.0289855072464, "loss": 0.2284, "rewards/chosen": 0.715572033898305, "rewards/margins": 4.958325657086712, "rewards/rejected": -4.242753623188406, "step": 141 }, { "epoch": 0.09736030167980803, "grad_norm": 0.4387958516481828, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -307742076.3428571, "logits/rejected": -245583730.75862068, "logps/chosen": -259.42857142857144, "logps/rejected": -336.82758620689657, "loss": 0.2596, "rewards/chosen": 0.5740792410714286, "rewards/margins": 4.867182689347291, "rewards/rejected": -4.293103448275862, "step": 142 }, { "epoch": 0.09804593760713061, "grad_norm": 0.46062604364242354, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251894538.81690142, "logits/rejected": -277559906.80701756, "logps/chosen": -356.50704225352115, "logps/rejected": -270.5964912280702, "loss": 0.2755, "rewards/chosen": 0.40360915492957744, "rewards/margins": 22185131.070275825, "rewards/rejected": -22185130.666666668, "step": 143 }, { "epoch": 0.09873157353445321, "grad_norm": 0.3994538519032511, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227794096.55172414, "logits/rejected": -275386016.9142857, "logps/chosen": -265.6551724137931, "logps/rejected": -283.2, "loss": 0.2589, "rewards/chosen": 0.3389682112068966, "rewards/margins": 4.053253925492611, "rewards/rejected": -3.7142857142857144, "step": 144 }, { "epoch": 0.09941720946177579, "grad_norm": 0.40599692960649225, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -323106039.17241377, "logits/rejected": -264600663.77142859, "logps/chosen": -215.86206896551724, "logps/rejected": -313.14285714285717, "loss": 0.2326, "rewards/chosen": 0.3935883620689655, "rewards/margins": 4.986445504926108, "rewards/rejected": -4.5928571428571425, "step": 145 }, { "epoch": 0.10010284538909839, "grad_norm": 0.4842881743588677, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294267042.53968257, "logits/rejected": -258627237.41538462, "logps/chosen": -222.22222222222223, "logps/rejected": -352.0, "loss": 0.2498, "rewards/chosen": 0.3087797619047619, "rewards/margins": 4.71647206959707, "rewards/rejected": -4.407692307692308, "step": 146 }, { "epoch": 0.10078848131642099, "grad_norm": 0.3876392367181984, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271378827.2280702, "logits/rejected": -277178512.2253521, "logps/chosen": -245.6140350877193, "logps/rejected": -333.9718309859155, "loss": 0.2396, "rewards/chosen": 0.3059210526315789, "rewards/margins": 4.439723869532987, "rewards/rejected": -4.133802816901408, "step": 147 }, { "epoch": 0.10147411724374357, "grad_norm": 0.39662366631629775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218604620.41791046, "logits/rejected": -305290323.93442625, "logps/chosen": -274.14925373134326, "logps/rejected": -316.59016393442624, "loss": 0.2637, "rewards/chosen": 0.3969216417910448, "rewards/margins": 3.4174134450697333, "rewards/rejected": -3.0204918032786887, "step": 148 }, { "epoch": 0.10215975317106617, "grad_norm": 0.42562613941973554, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250260138.66666666, "logits/rejected": -270162522.35294116, "logps/chosen": -263.73333333333335, "logps/rejected": -345.88235294117646, "loss": 0.2481, "rewards/chosen": 0.45546875, "rewards/margins": 4.334145220588235, "rewards/rejected": -3.8786764705882355, "step": 149 }, { "epoch": 0.10284538909838875, "grad_norm": 0.3746561430571297, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246162255.44827586, "logits/rejected": -272509922.74285716, "logps/chosen": -251.58620689655172, "logps/rejected": -312.6857142857143, "loss": 0.2235, "rewards/chosen": 0.5581896551724138, "rewards/margins": 3.8260467980295565, "rewards/rejected": -3.267857142857143, "step": 150 }, { "epoch": 0.10353102502571135, "grad_norm": 0.40174997507138355, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228959653.6470588, "logits/rejected": -226212795.73333332, "logps/chosen": -299.29411764705884, "logps/rejected": -337.06666666666666, "loss": 0.2561, "rewards/chosen": 0.5136862362132353, "rewards/margins": 4.655352902879902, "rewards/rejected": -4.141666666666667, "step": 151 }, { "epoch": 0.10421666095303393, "grad_norm": 0.3995255281433983, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -328054491.4285714, "logits/rejected": -347195164.4444444, "logps/chosen": -252.28571428571428, "logps/rejected": -315.1111111111111, "loss": 0.22, "rewards/chosen": 0.3842075892857143, "rewards/margins": 4.4987909226190474, "rewards/rejected": -4.114583333333333, "step": 152 }, { "epoch": 0.10490229688035653, "grad_norm": 0.4270265350330859, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281247148.2181818, "logits/rejected": -261770534.57534248, "logps/chosen": -283.6363636363636, "logps/rejected": -293.6986301369863, "loss": 0.2018, "rewards/chosen": 0.8176136363636364, "rewards/margins": 5.084736924034869, "rewards/rejected": -4.267123287671233, "step": 153 }, { "epoch": 0.10558793280767913, "grad_norm": 0.3681169435046202, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -305718158.2222222, "logits/rejected": -282631561.84615386, "logps/chosen": -213.84126984126985, "logps/rejected": -353.4769230769231, "loss": 0.2288, "rewards/chosen": 0.5629960317460317, "rewards/margins": 5.074534493284493, "rewards/rejected": -4.5115384615384615, "step": 154 }, { "epoch": 0.10627356873500171, "grad_norm": 0.43934252233793436, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281341006.7692308, "logits/rejected": -286544197.0793651, "logps/chosen": -266.33846153846156, "logps/rejected": -319.23809523809524, "loss": 0.2964, "rewards/chosen": 0.09326923076923077, "rewards/margins": 2.767872405372405, "rewards/rejected": -2.6746031746031744, "step": 155 }, { "epoch": 0.10695920466232431, "grad_norm": 0.4036118065138837, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -309067776.0, "logits/rejected": -372244480.0, "logps/chosen": -235.25, "logps/rejected": -392.5, "loss": 0.2307, "rewards/chosen": 0.43212890625, "rewards/margins": 4.08447265625, "rewards/rejected": -3.65234375, "step": 156 }, { "epoch": 0.10764484058964689, "grad_norm": 0.5172598721703513, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280201295.7922078, "logits/rejected": -296561965.1764706, "logps/chosen": -256.83116883116884, "logps/rejected": -292.078431372549, "loss": 0.2756, "rewards/chosen": 0.575487012987013, "rewards/margins": 3.8352909345556405, "rewards/rejected": -3.2598039215686274, "step": 157 }, { "epoch": 0.10833047651696949, "grad_norm": 0.3993276133380985, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256776712.6779661, "logits/rejected": -331410803.01449275, "logps/chosen": -261.4237288135593, "logps/rejected": -341.7971014492754, "loss": 0.2269, "rewards/chosen": 0.7616525423728814, "rewards/margins": 4.834116310488824, "rewards/rejected": -4.072463768115942, "step": 158 }, { "epoch": 0.10901611244429209, "grad_norm": 0.3780638259306392, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -224429643.54098362, "logits/rejected": -311254917.73134327, "logps/chosen": -295.60655737704917, "logps/rejected": -328.5970149253731, "loss": 0.25, "rewards/chosen": 0.5927814421106558, "rewards/margins": 4.768154576439014, "rewards/rejected": -4.175373134328358, "step": 159 }, { "epoch": 0.10970174837161467, "grad_norm": 0.4169365692652222, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -326901511.75757575, "logits/rejected": -240563629.41935483, "logps/chosen": -208.24242424242425, "logps/rejected": -381.93548387096774, "loss": 0.2379, "rewards/chosen": 0.6581439393939394, "rewards/margins": 4.896047165200391, "rewards/rejected": -4.237903225806452, "step": 160 }, { "epoch": 0.11038738429893727, "grad_norm": 0.36633494107667514, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -308040687.21311474, "logits/rejected": -211843652.7761194, "logps/chosen": -220.45901639344262, "logps/rejected": -320.4776119402985, "loss": 0.2463, "rewards/chosen": 0.3078893442622951, "rewards/margins": 4.531769941277221, "rewards/rejected": -4.223880597014926, "step": 161 }, { "epoch": 0.11107302022625985, "grad_norm": 0.4125038567886214, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269873503.0857143, "logits/rejected": -255997175.1724138, "logps/chosen": -260.9142857142857, "logps/rejected": -346.2068965517241, "loss": 0.2856, "rewards/chosen": 0.29151785714285716, "rewards/margins": 4.136345443349754, "rewards/rejected": -3.8448275862068964, "step": 162 }, { "epoch": 0.11175865615358245, "grad_norm": 0.39902385689812714, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262482250.32258064, "logits/rejected": -303260889.2121212, "logps/chosen": -247.2258064516129, "logps/rejected": -316.1212121212121, "loss": 0.2345, "rewards/chosen": 0.7056451612903226, "rewards/margins": 4.652614858260019, "rewards/rejected": -3.946969696969697, "step": 163 }, { "epoch": 0.11244429208090503, "grad_norm": 0.4253480616978065, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294719761.06666666, "logits/rejected": -267695284.70588234, "logps/chosen": -232.13333333333333, "logps/rejected": -314.3529411764706, "loss": 0.261, "rewards/chosen": 0.19375, "rewards/margins": 4.274632352941176, "rewards/rejected": -4.080882352941177, "step": 164 }, { "epoch": 0.11312992800822763, "grad_norm": 0.37990194767784013, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -291472827.2238806, "logits/rejected": -283837490.3606557, "logps/chosen": -198.2089552238806, "logps/rejected": -334.95081967213116, "loss": 0.2303, "rewards/chosen": 0.7873134328358209, "rewards/margins": 4.418460973819427, "rewards/rejected": -3.6311475409836067, "step": 165 }, { "epoch": 0.11381556393555023, "grad_norm": 0.4850323533975292, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223594533.23636365, "logits/rejected": -227066978.19178084, "logps/chosen": -212.94545454545454, "logps/rejected": -328.7671232876712, "loss": 0.218, "rewards/chosen": 0.45227272727272727, "rewards/margins": 4.428300124533001, "rewards/rejected": -3.9760273972602738, "step": 166 }, { "epoch": 0.11450119986287281, "grad_norm": 0.39771612624075964, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278061727.47540987, "logits/rejected": -273944392.5970149, "logps/chosen": -306.3606557377049, "logps/rejected": -306.14925373134326, "loss": 0.2292, "rewards/chosen": 0.6183401639344263, "rewards/margins": 5.073564044531441, "rewards/rejected": -4.455223880597015, "step": 167 }, { "epoch": 0.11518683579019541, "grad_norm": 0.48769719574986153, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244173020.55384615, "logits/rejected": -285478977.015873, "logps/chosen": -247.63076923076923, "logps/rejected": -341.3333333333333, "loss": 0.229, "rewards/chosen": 0.5524038461538462, "rewards/margins": 4.929387973137974, "rewards/rejected": -4.376984126984127, "step": 168 }, { "epoch": 0.115872471717518, "grad_norm": 0.4249763889661134, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276695008.49230766, "logits/rejected": -220234248.12698412, "logps/chosen": -296.9846153846154, "logps/rejected": -295.6190476190476, "loss": 0.2398, "rewards/chosen": 0.38016826923076924, "rewards/margins": 4.023025412087912, "rewards/rejected": -3.642857142857143, "step": 169 }, { "epoch": 0.11655810764484059, "grad_norm": 0.4155988443609802, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297009152.0, "logits/rejected": -264765440.0, "logps/chosen": -288.25, "logps/rejected": -396.5, "loss": 0.2354, "rewards/chosen": 0.7783203125, "rewards/margins": 4.8681640625, "rewards/rejected": -4.08984375, "step": 170 }, { "epoch": 0.11724374357216318, "grad_norm": 0.43660306174919195, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271743639.4366197, "logits/rejected": -203828457.54385966, "logps/chosen": -241.80281690140845, "logps/rejected": -272.0, "loss": 0.2497, "rewards/chosen": 0.7693661971830986, "rewards/margins": 4.348313565604151, "rewards/rejected": -3.5789473684210527, "step": 171 }, { "epoch": 0.11792937949948577, "grad_norm": 0.4014887412620182, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -392586854.4, "logits/rejected": -230193272.47058824, "logps/chosen": -236.53333333333333, "logps/rejected": -345.1764705882353, "loss": 0.2272, "rewards/chosen": 0.4973958333333333, "rewards/margins": 5.232689950980392, "rewards/rejected": -4.735294117647059, "step": 172 }, { "epoch": 0.11861501542680837, "grad_norm": 0.406855237727489, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -313174698.6666667, "logits/rejected": -290880650.3783784, "logps/chosen": -189.62962962962962, "logps/rejected": -340.3243243243243, "loss": 0.2084, "rewards/chosen": 0.5545428240740741, "rewards/margins": 4.8146779592092095, "rewards/rejected": -4.260135135135135, "step": 173 }, { "epoch": 0.11930065135413095, "grad_norm": 0.42934700155044403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -344919823.0588235, "logits/rejected": -285492292.26666665, "logps/chosen": -244.58823529411765, "logps/rejected": -320.53333333333336, "loss": 0.2488, "rewards/chosen": 0.24827665441176472, "rewards/margins": 5.023276654411765, "rewards/rejected": -4.775, "step": 174 }, { "epoch": 0.11998628728145355, "grad_norm": 0.4826819457204367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -192458634.97142857, "logits/rejected": -249054878.89655173, "logps/chosen": -255.77142857142857, "logps/rejected": -314.48275862068965, "loss": 0.2646, "rewards/chosen": 0.4141183035714286, "rewards/margins": 4.5520493380541875, "rewards/rejected": -4.137931034482759, "step": 175 }, { "epoch": 0.12067192320877614, "grad_norm": 0.4340071415766119, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -214641842.7936508, "logits/rejected": -229976914.7076923, "logps/chosen": -224.25396825396825, "logps/rejected": -309.4153846153846, "loss": 0.2271, "rewards/chosen": 0.6329365079365079, "rewards/margins": 4.556013431013431, "rewards/rejected": -3.923076923076923, "step": 176 }, { "epoch": 0.12135755913609873, "grad_norm": 0.4725287262818528, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218645008.51612905, "logits/rejected": -311649497.2121212, "logps/chosen": -333.4193548387097, "logps/rejected": -308.8484848484849, "loss": 0.2494, "rewards/chosen": 0.25882056451612906, "rewards/margins": 3.8042751099706744, "rewards/rejected": -3.5454545454545454, "step": 177 }, { "epoch": 0.12204319506342132, "grad_norm": 0.5262735461297862, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294384216.74666667, "logits/rejected": -292493350.6415094, "logps/chosen": -280.96, "logps/rejected": -387.62264150943395, "loss": 0.2473, "rewards/chosen": 0.7670833333333333, "rewards/margins": 5.734064465408805, "rewards/rejected": -4.966981132075472, "step": 178 }, { "epoch": 0.12272883099074391, "grad_norm": 0.4899537930133462, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252432573.04615384, "logits/rejected": -296663787.6825397, "logps/chosen": -319.5076923076923, "logps/rejected": -310.3492063492063, "loss": 0.2809, "rewards/chosen": 0.27884615384615385, "rewards/margins": 4.076465201465202, "rewards/rejected": -3.7976190476190474, "step": 179 }, { "epoch": 0.12341446691806651, "grad_norm": 0.41609497419500924, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267082454.70967743, "logits/rejected": -289279875.8787879, "logps/chosen": -319.2258064516129, "logps/rejected": -327.75757575757575, "loss": 0.2401, "rewards/chosen": 0.44606854838709675, "rewards/margins": 4.957432184750734, "rewards/rejected": -4.511363636363637, "step": 180 }, { "epoch": 0.1241001028453891, "grad_norm": 0.39629387627878543, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283859670.7096774, "logits/rejected": -288263074.90909094, "logps/chosen": -252.7741935483871, "logps/rejected": -317.3333333333333, "loss": 0.2353, "rewards/chosen": 0.5987903225806451, "rewards/margins": 4.932123655913978, "rewards/rejected": -4.333333333333333, "step": 181 }, { "epoch": 0.1247857387727117, "grad_norm": 0.3813104062257244, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228170137.6, "logits/rejected": -217363636.70588234, "logps/chosen": -222.66666666666666, "logps/rejected": -326.11764705882354, "loss": 0.2249, "rewards/chosen": 0.32864583333333336, "rewards/margins": 4.7293811274509805, "rewards/rejected": -4.400735294117647, "step": 182 }, { "epoch": 0.12547137470003428, "grad_norm": 0.4231959629102809, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -317369002.6666667, "logits/rejected": -271889588.7058824, "logps/chosen": -260.53333333333336, "logps/rejected": -321.88235294117646, "loss": 0.2156, "rewards/chosen": 0.7497395833333333, "rewards/margins": 5.201945465686275, "rewards/rejected": -4.452205882352941, "step": 183 }, { "epoch": 0.12615701062735687, "grad_norm": 0.3811639979503546, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250651607.04, "logits/rejected": -233375376.41025642, "logps/chosen": -211.52, "logps/rejected": -334.35897435897436, "loss": 0.2057, "rewards/chosen": 0.60390625, "rewards/margins": 5.020572916666667, "rewards/rejected": -4.416666666666667, "step": 184 }, { "epoch": 0.12684264655467947, "grad_norm": 0.40898829261591757, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279530066.58064514, "logits/rejected": -261826249.6969697, "logps/chosen": -296.51612903225805, "logps/rejected": -329.6969696969697, "loss": 0.2112, "rewards/chosen": 0.9717741935483871, "rewards/margins": 5.581622678396871, "rewards/rejected": -4.609848484848484, "step": 185 }, { "epoch": 0.12752828248200207, "grad_norm": 0.4268405656247513, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -195734186.66666666, "logits/rejected": -238494578.2153846, "logps/chosen": -272.76190476190476, "logps/rejected": -326.89230769230767, "loss": 0.2216, "rewards/chosen": 0.6713789682539683, "rewards/margins": 5.3021481990232, "rewards/rejected": -4.630769230769231, "step": 186 }, { "epoch": 0.12821391840932464, "grad_norm": 0.4070362819494312, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253335961.6, "logits/rejected": -278317925.69863015, "logps/chosen": -196.21818181818182, "logps/rejected": -290.1917808219178, "loss": 0.2131, "rewards/chosen": 0.5271306818181818, "rewards/margins": 5.2805553393524285, "rewards/rejected": -4.7534246575342465, "step": 187 }, { "epoch": 0.12889955433664724, "grad_norm": 0.3531300196034081, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257839319.57894737, "logits/rejected": -267253961.91549295, "logps/chosen": -222.31578947368422, "logps/rejected": -338.92957746478874, "loss": 0.1962, "rewards/chosen": 0.7192982456140351, "rewards/margins": 5.465777118853472, "rewards/rejected": -4.746478873239437, "step": 188 }, { "epoch": 0.12958519026396983, "grad_norm": 0.4220459049813661, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -230966340.26666668, "logits/rejected": -233647405.17647058, "logps/chosen": -315.2, "logps/rejected": -320.0, "loss": 0.2144, "rewards/chosen": 0.6119791666666666, "rewards/margins": 5.850949754901961, "rewards/rejected": -5.238970588235294, "step": 189 }, { "epoch": 0.13027082619129243, "grad_norm": 0.45026623886694744, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299390601.0140845, "logits/rejected": -284035323.50877196, "logps/chosen": -248.33802816901408, "logps/rejected": -304.8421052631579, "loss": 0.2414, "rewards/chosen": 0.6628521126760564, "rewards/margins": 4.088290709167285, "rewards/rejected": -3.425438596491228, "step": 190 }, { "epoch": 0.130956462118615, "grad_norm": 0.42703088225472113, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260646034.2857143, "logits/rejected": -363506346.6666667, "logps/chosen": -364.57142857142856, "logps/rejected": -350.44444444444446, "loss": 0.2133, "rewards/chosen": 0.40625, "rewards/margins": 5.038194444444445, "rewards/rejected": -4.631944444444445, "step": 191 }, { "epoch": 0.1316420980459376, "grad_norm": 0.4510165213650826, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244580352.0, "logits/rejected": -364642304.0, "logps/chosen": -329.5, "logps/rejected": -357.0, "loss": 0.2281, "rewards/chosen": 0.766357421875, "rewards/margins": 5.578857421875, "rewards/rejected": -4.8125, "step": 192 }, { "epoch": 0.1323277339732602, "grad_norm": 0.4823888387938782, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -224483876.05633804, "logits/rejected": -245477160.42105263, "logps/chosen": -322.2535211267606, "logps/rejected": -323.36842105263156, "loss": 0.2798, "rewards/chosen": 0.46811729753521125, "rewards/margins": 4.994433087008895, "rewards/rejected": -4.526315789473684, "step": 193 }, { "epoch": 0.1330133699005828, "grad_norm": 0.43841426897617425, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287811958.9859155, "logits/rejected": -294925797.05263156, "logps/chosen": -281.23943661971833, "logps/rejected": -349.7543859649123, "loss": 0.2564, "rewards/chosen": 0.6146566901408451, "rewards/margins": 5.6848321287373365, "rewards/rejected": -5.0701754385964914, "step": 194 }, { "epoch": 0.1336990058279054, "grad_norm": 0.40798324378921086, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300568090.0338983, "logits/rejected": -291291373.4492754, "logps/chosen": -248.40677966101694, "logps/rejected": -387.2463768115942, "loss": 0.1987, "rewards/chosen": 0.6101694915254238, "rewards/margins": 5.5087202161631055, "rewards/rejected": -4.898550724637682, "step": 195 }, { "epoch": 0.13438464175522796, "grad_norm": 0.49371756430681585, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269318467.3684211, "logits/rejected": -323729379.1549296, "logps/chosen": -229.05263157894737, "logps/rejected": -277.1830985915493, "loss": 0.2398, "rewards/chosen": 0.7297149122807017, "rewards/margins": 4.240278292562392, "rewards/rejected": -3.51056338028169, "step": 196 }, { "epoch": 0.13507027768255056, "grad_norm": 0.4346167754984595, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251363902.87719297, "logits/rejected": -259928698.5915493, "logps/chosen": -301.4736842105263, "logps/rejected": -365.9718309859155, "loss": 0.2158, "rewards/chosen": 0.5150767543859649, "rewards/margins": 5.543245768470472, "rewards/rejected": -5.028169014084507, "step": 197 }, { "epoch": 0.13575591360987316, "grad_norm": 0.411732407173467, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289001075.61290324, "logits/rejected": -223696213.33333334, "logps/chosen": -210.58064516129033, "logps/rejected": -304.24242424242425, "loss": 0.2179, "rewards/chosen": 0.6184475806451613, "rewards/margins": 5.8911748533724335, "rewards/rejected": -5.2727272727272725, "step": 198 }, { "epoch": 0.13644154953719576, "grad_norm": 0.5173145726188395, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248082767.73770493, "logits/rejected": -257667989.01492536, "logps/chosen": -223.7377049180328, "logps/rejected": -291.82089552238807, "loss": 0.2165, "rewards/chosen": 0.6511270491803278, "rewards/margins": 5.031724064105701, "rewards/rejected": -4.380597014925373, "step": 199 }, { "epoch": 0.13712718546451835, "grad_norm": 0.594452260767088, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -247217212.2352941, "logits/rejected": -275146342.4, "logps/chosen": -334.8235294117647, "logps/rejected": -307.2, "loss": 0.242, "rewards/chosen": 0.6277573529411765, "rewards/margins": 4.22359068627451, "rewards/rejected": -3.595833333333333, "step": 200 }, { "epoch": 0.13781282139184092, "grad_norm": 0.3954338814214239, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277469341.53846157, "logits/rejected": -320364934.0952381, "logps/chosen": -333.7846153846154, "logps/rejected": -370.53968253968253, "loss": 0.232, "rewards/chosen": 0.6879807692307692, "rewards/margins": 3.1879807692307693, "rewards/rejected": -2.5, "step": 201 }, { "epoch": 0.13849845731916352, "grad_norm": 0.5080584557330696, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267490260.73239437, "logits/rejected": -225756573.19298247, "logps/chosen": -280.3380281690141, "logps/rejected": -353.96491228070175, "loss": 0.2231, "rewards/chosen": 0.745818661971831, "rewards/margins": 6.136169539164814, "rewards/rejected": -5.390350877192983, "step": 202 }, { "epoch": 0.13918409324648612, "grad_norm": 0.4241725943567192, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -327531321.3134328, "logits/rejected": -275861436.852459, "logps/chosen": -344.23880597014926, "logps/rejected": -376.91803278688525, "loss": 0.2375, "rewards/chosen": 0.6982276119402985, "rewards/margins": 4.1367522021042324, "rewards/rejected": -3.4385245901639343, "step": 203 }, { "epoch": 0.13986972917380872, "grad_norm": 0.47868623572314256, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255972381.25714287, "logits/rejected": -281162999.17241377, "logps/chosen": -330.74285714285713, "logps/rejected": -341.51724137931035, "loss": 0.2623, "rewards/chosen": 0.47433035714285715, "rewards/margins": 5.37950277093596, "rewards/rejected": -4.905172413793103, "step": 204 }, { "epoch": 0.14055536510113129, "grad_norm": 0.5122919531650276, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -354902646.15384614, "logits/rejected": -357115026.28571427, "logps/chosen": -249.84615384615384, "logps/rejected": -376.63492063492066, "loss": 0.2557, "rewards/chosen": 0.28197115384615384, "rewards/margins": 5.2422886141636145, "rewards/rejected": -4.9603174603174605, "step": 205 }, { "epoch": 0.14124100102845388, "grad_norm": 0.4571312160328957, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259084220.852459, "logits/rejected": -261173675.9402985, "logps/chosen": -268.327868852459, "logps/rejected": -372.53731343283584, "loss": 0.2512, "rewards/chosen": 0.26011782786885246, "rewards/margins": 5.252655141301688, "rewards/rejected": -4.992537313432836, "step": 206 }, { "epoch": 0.14192663695577648, "grad_norm": 0.3949549817181147, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259143459.44615385, "logits/rejected": -239674514.2857143, "logps/chosen": -278.89230769230767, "logps/rejected": -349.7142857142857, "loss": 0.2275, "rewards/chosen": 0.3144230769230769, "rewards/margins": 5.286645299145299, "rewards/rejected": -4.972222222222222, "step": 207 }, { "epoch": 0.14261227288309908, "grad_norm": 0.4121248166887182, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264540745.14285713, "logits/rejected": -289173959.1111111, "logps/chosen": -279.42857142857144, "logps/rejected": -323.77777777777777, "loss": 0.2105, "rewards/chosen": 0.5997488839285714, "rewards/margins": 5.1136377728174605, "rewards/rejected": -4.513888888888889, "step": 208 }, { "epoch": 0.14329790881042168, "grad_norm": 0.5108031479572005, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256085560.8888889, "logits/rejected": -270532608.0, "logps/chosen": -249.11111111111111, "logps/rejected": -272.0, "loss": 0.2611, "rewards/chosen": 0.2708333333333333, "rewards/margins": 4.681547619047619, "rewards/rejected": -4.410714285714286, "step": 209 }, { "epoch": 0.14398354473774425, "grad_norm": 0.47444281179774506, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -322903951.7808219, "logits/rejected": -331273755.92727274, "logps/chosen": -271.7808219178082, "logps/rejected": -330.76363636363635, "loss": 0.2558, "rewards/chosen": 0.639554794520548, "rewards/margins": 4.344100249066003, "rewards/rejected": -3.7045454545454546, "step": 210 }, { "epoch": 0.14466918066506684, "grad_norm": 0.5086561074081896, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262843050.66666666, "logits/rejected": -259051589.42372882, "logps/chosen": -292.6376811594203, "logps/rejected": -415.728813559322, "loss": 0.245, "rewards/chosen": 0.6367753623188406, "rewards/margins": 4.29355502333579, "rewards/rejected": -3.656779661016949, "step": 211 }, { "epoch": 0.14535481659238944, "grad_norm": 0.44833952553870693, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250833131.01639345, "logits/rejected": -320019135.04477614, "logps/chosen": -285.11475409836066, "logps/rejected": -285.3731343283582, "loss": 0.2086, "rewards/chosen": 0.8017418032786885, "rewards/margins": 4.290547773427942, "rewards/rejected": -3.4888059701492535, "step": 212 }, { "epoch": 0.14604045251971204, "grad_norm": 0.4171435158282834, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -298964488.39344263, "logits/rejected": -357580066.3880597, "logps/chosen": -300.72131147540983, "logps/rejected": -356.7761194029851, "loss": 0.1959, "rewards/chosen": 0.9979508196721312, "rewards/margins": 4.803920968925863, "rewards/rejected": -3.8059701492537314, "step": 213 }, { "epoch": 0.14672608844703464, "grad_norm": 0.43070079693743984, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239434839.77142859, "logits/rejected": -278848900.4137931, "logps/chosen": -320.45714285714286, "logps/rejected": -274.7586206896552, "loss": 0.2418, "rewards/chosen": 0.6741071428571429, "rewards/margins": 5.3594519704433505, "rewards/rejected": -4.685344827586207, "step": 214 }, { "epoch": 0.1474117243743572, "grad_norm": 0.46883706600691394, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -319308304.516129, "logits/rejected": -335290119.75757575, "logps/chosen": -215.48387096774192, "logps/rejected": -304.4848484848485, "loss": 0.2065, "rewards/chosen": 0.7691532258064516, "rewards/margins": 4.591122922776148, "rewards/rejected": -3.821969696969697, "step": 215 }, { "epoch": 0.1480973603016798, "grad_norm": 0.46964215515778074, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -291813543.86885244, "logits/rejected": -265680987.70149255, "logps/chosen": -249.70491803278688, "logps/rejected": -318.089552238806, "loss": 0.2187, "rewards/chosen": 0.71875, "rewards/margins": 5.767257462686567, "rewards/rejected": -5.048507462686567, "step": 216 }, { "epoch": 0.1487829962290024, "grad_norm": 0.4303750738044495, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244833609.76271185, "logits/rejected": -240716577.39130434, "logps/chosen": -348.7457627118644, "logps/rejected": -273.15942028985506, "loss": 0.214, "rewards/chosen": 0.805912341101695, "rewards/margins": 5.233448572985752, "rewards/rejected": -4.427536231884058, "step": 217 }, { "epoch": 0.149468632156325, "grad_norm": 0.48103516055417145, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239638741.97014925, "logits/rejected": -249732985.70491803, "logps/chosen": -261.7313432835821, "logps/rejected": -376.655737704918, "loss": 0.2325, "rewards/chosen": 0.7369402985074627, "rewards/margins": 5.704153413261562, "rewards/rejected": -4.967213114754099, "step": 218 }, { "epoch": 0.1501542680836476, "grad_norm": 0.5189809103554067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -375420601.5072464, "logits/rejected": -400947026.44067794, "logps/chosen": -224.69565217391303, "logps/rejected": -329.76271186440675, "loss": 0.236, "rewards/chosen": 0.3451086956521739, "rewards/margins": 26836732.006125644, "rewards/rejected": -26836731.66101695, "step": 219 }, { "epoch": 0.15083990401097017, "grad_norm": 0.4847652311077455, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -371937973.16923076, "logits/rejected": -250859324.95238096, "logps/chosen": -293.4153846153846, "logps/rejected": -371.3015873015873, "loss": 0.217, "rewards/chosen": 0.7125, "rewards/margins": 6.180753968253969, "rewards/rejected": -5.468253968253968, "step": 220 }, { "epoch": 0.15152553993829276, "grad_norm": 0.440890534390218, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264738781.2881356, "logits/rejected": -255305460.86956522, "logps/chosen": -352.54237288135596, "logps/rejected": -369.15942028985506, "loss": 0.2173, "rewards/chosen": 0.5201271186440678, "rewards/margins": 5.201286538933923, "rewards/rejected": -4.681159420289855, "step": 221 }, { "epoch": 0.15221117586561536, "grad_norm": 0.370375110137696, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -242761231.5151515, "logits/rejected": -274929862.1935484, "logps/chosen": -176.72727272727272, "logps/rejected": -369.03225806451616, "loss": 0.2215, "rewards/chosen": 0.6486742424242424, "rewards/margins": 4.277706500488758, "rewards/rejected": -3.629032258064516, "step": 222 }, { "epoch": 0.15289681179293796, "grad_norm": 0.44520865468704307, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267169251.0188679, "logits/rejected": -298634444.8, "logps/chosen": -225.81132075471697, "logps/rejected": -349.8666666666667, "loss": 0.1979, "rewards/chosen": 0.2626768867924528, "rewards/margins": 4.84934355345912, "rewards/rejected": -4.586666666666667, "step": 223 }, { "epoch": 0.15358244772026053, "grad_norm": 0.4151827207040014, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269034642.28571427, "logits/rejected": -361642211.5555556, "logps/chosen": -300.57142857142856, "logps/rejected": -311.1111111111111, "loss": 0.2018, "rewards/chosen": 0.7843191964285714, "rewards/margins": 5.336402529761904, "rewards/rejected": -4.552083333333333, "step": 224 }, { "epoch": 0.15426808364758313, "grad_norm": 0.4732410208620507, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -323167685.24590164, "logits/rejected": -327531321.3134328, "logps/chosen": -290.88524590163934, "logps/rejected": -268.8955223880597, "loss": 0.2184, "rewards/chosen": 0.8258196721311475, "rewards/margins": 5.131789821384879, "rewards/rejected": -4.3059701492537314, "step": 225 }, { "epoch": 0.15495371957490572, "grad_norm": 0.5347618345115164, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -219035875.55555555, "logits/rejected": -245067190.85714287, "logps/chosen": -248.0, "logps/rejected": -279.57142857142856, "loss": 0.2259, "rewards/chosen": 1.1041666666666667, "rewards/margins": 5.474702380952381, "rewards/rejected": -4.370535714285714, "step": 226 }, { "epoch": 0.15563935550222832, "grad_norm": 0.5430755347121313, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -306311292.1212121, "logits/rejected": -337167921.5483871, "logps/chosen": -241.93939393939394, "logps/rejected": -309.16129032258067, "loss": 0.2378, "rewards/chosen": 0.7587594696969697, "rewards/margins": 4.778920760019551, "rewards/rejected": -4.020161290322581, "step": 227 }, { "epoch": 0.15632499142955092, "grad_norm": 0.4134209337431832, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -322174976.0, "logits/rejected": -270270464.0, "logps/chosen": -230.75, "logps/rejected": -339.75, "loss": 0.243, "rewards/chosen": 0.407867431640625, "rewards/margins": 4.915679931640625, "rewards/rejected": -4.5078125, "step": 228 }, { "epoch": 0.1570106273568735, "grad_norm": 0.47570324465280306, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240393537.82857144, "logits/rejected": -275667014.62068963, "logps/chosen": -263.54285714285714, "logps/rejected": -366.3448275862069, "loss": 0.2328, "rewards/chosen": 0.665625, "rewards/margins": 5.937176724137931, "rewards/rejected": -5.271551724137931, "step": 229 }, { "epoch": 0.1576962632841961, "grad_norm": 0.4561020167353722, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249421277.86666667, "logits/rejected": -299275926.5882353, "logps/chosen": -224.93333333333334, "logps/rejected": -326.5882352941176, "loss": 0.2169, "rewards/chosen": 0.3963541666666667, "rewards/margins": 4.227236519607843, "rewards/rejected": -3.8308823529411766, "step": 230 }, { "epoch": 0.15838189921151868, "grad_norm": 0.4835812774923925, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276330616.4705882, "logits/rejected": -248023176.53333333, "logps/chosen": -244.0, "logps/rejected": -466.4, "loss": 0.2336, "rewards/chosen": 0.5728400735294118, "rewards/margins": 4.947840073529412, "rewards/rejected": -4.375, "step": 231 }, { "epoch": 0.15906753513884128, "grad_norm": 0.4665544076793681, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221609494.92537314, "logits/rejected": -264584947.40983605, "logps/chosen": -264.5970149253731, "logps/rejected": -303.4754098360656, "loss": 0.2277, "rewards/chosen": 0.6152052238805971, "rewards/margins": 5.697172436995351, "rewards/rejected": -5.081967213114754, "step": 232 }, { "epoch": 0.15975317106616388, "grad_norm": 0.4594711236490616, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252706816.0, "logits/rejected": -216793088.0, "logps/chosen": -262.0, "logps/rejected": -312.25, "loss": 0.2335, "rewards/chosen": 0.522216796875, "rewards/margins": 4.799560546875, "rewards/rejected": -4.27734375, "step": 233 }, { "epoch": 0.16043880699348645, "grad_norm": 0.3972909316968625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -199728761.9047619, "logits/rejected": -207779367.3846154, "logps/chosen": -204.95238095238096, "logps/rejected": -319.26153846153846, "loss": 0.2358, "rewards/chosen": 0.5092385912698413, "rewards/margins": 5.532315514346764, "rewards/rejected": -5.023076923076923, "step": 234 }, { "epoch": 0.16112444292080905, "grad_norm": 0.41830488970335344, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -315271850.6666667, "logits/rejected": -264734599.52941176, "logps/chosen": -265.3333333333333, "logps/rejected": -343.52941176470586, "loss": 0.2196, "rewards/chosen": 0.6395833333333333, "rewards/margins": 5.433700980392157, "rewards/rejected": -4.794117647058823, "step": 235 }, { "epoch": 0.16181007884813164, "grad_norm": 0.43056113844812316, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -225067935.3962264, "logits/rejected": -218327504.21333334, "logps/chosen": -285.58490566037733, "logps/rejected": -301.2266666666667, "loss": 0.1744, "rewards/chosen": 0.46462264150943394, "rewards/margins": 5.791289308176101, "rewards/rejected": -5.326666666666667, "step": 236 }, { "epoch": 0.16249571477545424, "grad_norm": 0.48943027652769383, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268976656.516129, "logits/rejected": -248862037.33333334, "logps/chosen": -274.83870967741933, "logps/rejected": -318.3030303030303, "loss": 0.2134, "rewards/chosen": 0.6840977822580645, "rewards/margins": 5.684097782258064, "rewards/rejected": -5.0, "step": 237 }, { "epoch": 0.1631813507027768, "grad_norm": 0.4991835296903018, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256488044.6060606, "logits/rejected": -263294051.0967742, "logps/chosen": -264.24242424242425, "logps/rejected": -294.7096774193548, "loss": 0.2226, "rewards/chosen": 0.6837121212121212, "rewards/margins": 6.127260508308895, "rewards/rejected": -5.443548387096774, "step": 238 }, { "epoch": 0.1638669866300994, "grad_norm": 0.4981800010777172, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268908053.63380283, "logits/rejected": -284035323.50877196, "logps/chosen": -287.09859154929575, "logps/rejected": -323.64912280701753, "loss": 0.2353, "rewards/chosen": 0.7596830985915493, "rewards/margins": 5.7026655547319, "rewards/rejected": -4.942982456140351, "step": 239 }, { "epoch": 0.164552622557422, "grad_norm": 0.5053944638898886, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218103808.0, "logits/rejected": -288718358.92537314, "logps/chosen": -200.39344262295083, "logps/rejected": -362.9850746268657, "loss": 0.2239, "rewards/chosen": 0.5302254098360656, "rewards/margins": 5.679479141179349, "rewards/rejected": -5.149253731343284, "step": 240 }, { "epoch": 0.1652382584847446, "grad_norm": 0.4763440497336324, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281599117.7846154, "logits/rejected": -295598567.61904764, "logps/chosen": -245.41538461538462, "logps/rejected": -303.4920634920635, "loss": 0.2073, "rewards/chosen": 0.5798076923076924, "rewards/margins": 5.85758547008547, "rewards/rejected": -5.277777777777778, "step": 241 }, { "epoch": 0.1659238944120672, "grad_norm": 0.4845774227326403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -314041127.6619718, "logits/rejected": -352615873.122807, "logps/chosen": -290.7042253521127, "logps/rejected": -333.4736842105263, "loss": 0.2464, "rewards/chosen": 0.48525528169014087, "rewards/margins": 5.213325457128738, "rewards/rejected": -4.728070175438597, "step": 242 }, { "epoch": 0.16660953033938977, "grad_norm": 0.40405280861081166, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -360557623.8545455, "logits/rejected": -375763673.4246575, "logps/chosen": -252.8, "logps/rejected": -369.972602739726, "loss": 0.1884, "rewards/chosen": 0.8863636363636364, "rewards/margins": 6.016500622665006, "rewards/rejected": -5.13013698630137, "step": 243 }, { "epoch": 0.16729516626671237, "grad_norm": 0.45279296402328545, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221432620.6984127, "logits/rejected": -228944470.64615384, "logps/chosen": -291.1746031746032, "logps/rejected": -304.4923076923077, "loss": 0.2292, "rewards/chosen": 0.35813492063492064, "rewards/margins": 4.992750305250306, "rewards/rejected": -4.634615384615385, "step": 244 }, { "epoch": 0.16798080219403497, "grad_norm": 0.4391791033010668, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272565232.24615383, "logits/rejected": -298794227.8095238, "logps/chosen": -300.3076923076923, "logps/rejected": -351.4920634920635, "loss": 0.2241, "rewards/chosen": 0.6930288461538462, "rewards/margins": 5.83588598901099, "rewards/rejected": -5.142857142857143, "step": 245 }, { "epoch": 0.16866643812135756, "grad_norm": 0.5193178479905373, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -282458203.70149255, "logits/rejected": -237218832.78688523, "logps/chosen": -236.8955223880597, "logps/rejected": -306.62295081967216, "loss": 0.217, "rewards/chosen": 0.9048507462686567, "rewards/margins": 5.781899926596525, "rewards/rejected": -4.877049180327869, "step": 246 }, { "epoch": 0.16935207404868016, "grad_norm": 0.43389068342629894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -314445699.8787879, "logits/rejected": -285753872.516129, "logps/chosen": -349.57575757575756, "logps/rejected": -342.19354838709677, "loss": 0.1919, "rewards/chosen": 1.209280303030303, "rewards/margins": 6.499602883675464, "rewards/rejected": -5.290322580645161, "step": 247 }, { "epoch": 0.17003770997600273, "grad_norm": 0.43314226479760476, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294596538.5762712, "logits/rejected": -263086198.7246377, "logps/chosen": -374.77966101694915, "logps/rejected": -342.2608695652174, "loss": 0.2017, "rewards/chosen": 0.9486228813559322, "rewards/margins": 5.87615911323999, "rewards/rejected": -4.927536231884058, "step": 248 }, { "epoch": 0.17072334590332533, "grad_norm": 0.45566880822497097, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259659681.47692308, "logits/rejected": -227957093.58730158, "logps/chosen": -276.9230769230769, "logps/rejected": -358.6031746031746, "loss": 0.2104, "rewards/chosen": 0.6831730769230769, "rewards/margins": 3.9609508547008545, "rewards/rejected": -3.2777777777777777, "step": 249 }, { "epoch": 0.17140898183064793, "grad_norm": 0.6952834905428329, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -231765255.31428573, "logits/rejected": -230542088.8275862, "logps/chosen": -358.85714285714283, "logps/rejected": -338.2068965517241, "loss": 0.271, "rewards/chosen": 0.4607142857142857, "rewards/margins": 5.292610837438423, "rewards/rejected": -4.831896551724138, "step": 250 }, { "epoch": 0.17209461775797052, "grad_norm": 0.4707970391010138, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223429908.31746033, "logits/rejected": -255271794.2153846, "logps/chosen": -256.0, "logps/rejected": -311.38461538461536, "loss": 0.2312, "rewards/chosen": 0.435515873015873, "rewards/margins": 5.554746642246641, "rewards/rejected": -5.119230769230769, "step": 251 }, { "epoch": 0.17278025368529312, "grad_norm": 0.41781156531929287, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243140576.4923077, "logits/rejected": -256718035.3015873, "logps/chosen": -297.10769230769233, "logps/rejected": -341.8412698412698, "loss": 0.2147, "rewards/chosen": 0.8706730769230769, "rewards/margins": 5.779403235653236, "rewards/rejected": -4.908730158730159, "step": 252 }, { "epoch": 0.1734658896126157, "grad_norm": 0.49610435553005594, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289788276.3636364, "logits/rejected": -242728431.48387095, "logps/chosen": -262.06060606060606, "logps/rejected": -338.5806451612903, "loss": 0.2104, "rewards/chosen": 0.7681107954545454, "rewards/margins": 5.058433376099707, "rewards/rejected": -4.290322580645161, "step": 253 }, { "epoch": 0.1741515255399383, "grad_norm": 0.4422743201874494, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292037302.2372881, "logits/rejected": -264788235.13043478, "logps/chosen": -233.6271186440678, "logps/rejected": -365.9130434782609, "loss": 0.2105, "rewards/chosen": 0.5, "rewards/margins": 4.391304347826087, "rewards/rejected": -3.891304347826087, "step": 254 }, { "epoch": 0.1748371614672609, "grad_norm": 0.48428743721015344, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278123707.4929578, "logits/rejected": -308170967.57894737, "logps/chosen": -232.56338028169014, "logps/rejected": -394.6666666666667, "loss": 0.2238, "rewards/chosen": 0.9524647887323944, "rewards/margins": 5.895447244872745, "rewards/rejected": -4.942982456140351, "step": 255 }, { "epoch": 0.17552279739458349, "grad_norm": 0.4934868269872556, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -342019736.7017544, "logits/rejected": -321602689.8028169, "logps/chosen": -219.9298245614035, "logps/rejected": -273.1267605633803, "loss": 0.1924, "rewards/chosen": 0.793859649122807, "rewards/margins": 4.941746973066469, "rewards/rejected": -4.147887323943662, "step": 256 }, { "epoch": 0.17620843332190606, "grad_norm": 0.4648054487781154, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -316960326.8923077, "logits/rejected": -315305138.7936508, "logps/chosen": -234.33846153846153, "logps/rejected": -330.41269841269843, "loss": 0.2073, "rewards/chosen": 0.7149038461538462, "rewards/margins": 6.048237179487179, "rewards/rejected": -5.333333333333333, "step": 257 }, { "epoch": 0.17689406924922865, "grad_norm": 0.5323049917727632, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264765440.0, "logits/rejected": -257425408.0, "logps/chosen": -294.75, "logps/rejected": -336.75, "loss": 0.212, "rewards/chosen": 0.632080078125, "rewards/margins": 6.069580078125, "rewards/rejected": -5.4375, "step": 258 }, { "epoch": 0.17757970517655125, "grad_norm": 0.5203105721056139, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248484172.1081081, "logits/rejected": -235813091.55555555, "logps/chosen": -252.75675675675674, "logps/rejected": -399.4074074074074, "loss": 0.2376, "rewards/chosen": 0.7411317567567568, "rewards/margins": 4.032798423423423, "rewards/rejected": -3.2916666666666665, "step": 259 }, { "epoch": 0.17826534110387385, "grad_norm": 0.4304004947585433, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -189195374.2769231, "logits/rejected": -310777953.52380955, "logps/chosen": -302.2769230769231, "logps/rejected": -309.3333333333333, "loss": 0.2079, "rewards/chosen": 0.7341346153846153, "rewards/margins": 6.289690170940171, "rewards/rejected": -5.555555555555555, "step": 260 }, { "epoch": 0.17895097703119645, "grad_norm": 0.37887923334718665, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258133656.7017544, "logits/rejected": -226374266.5915493, "logps/chosen": -309.05263157894734, "logps/rejected": -331.71830985915494, "loss": 0.1796, "rewards/chosen": 0.32456140350877194, "rewards/margins": 5.673152952804547, "rewards/rejected": -5.348591549295775, "step": 261 }, { "epoch": 0.17963661295851902, "grad_norm": 0.3882839677478724, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273144849.9649123, "logits/rejected": -287811958.9859155, "logps/chosen": -376.140350877193, "logps/rejected": -327.4366197183099, "loss": 0.2102, "rewards/chosen": 0.6245888157894737, "rewards/margins": 5.6105043087472195, "rewards/rejected": -4.985915492957746, "step": 262 }, { "epoch": 0.1803222488858416, "grad_norm": 0.4726034411350337, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -302872899.3684211, "logits/rejected": -277651109.85915494, "logps/chosen": -240.140350877193, "logps/rejected": -303.09859154929575, "loss": 0.2083, "rewards/chosen": 0.6539199561403509, "rewards/margins": 5.506032632196689, "rewards/rejected": -4.852112676056338, "step": 263 }, { "epoch": 0.1810078848131642, "grad_norm": 0.5524782456692041, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -313364271.7288136, "logits/rejected": -351105795.71014494, "logps/chosen": -250.84745762711864, "logps/rejected": -329.7391304347826, "loss": 0.1988, "rewards/chosen": 0.8305084745762712, "rewards/margins": 6.200073691967576, "rewards/rejected": -5.369565217391305, "step": 264 }, { "epoch": 0.1816935207404868, "grad_norm": 0.374455809567838, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300283730.44067794, "logits/rejected": -304664516.6376812, "logps/chosen": -289.76271186440675, "logps/rejected": -337.15942028985506, "loss": 0.1975, "rewards/chosen": 0.7288135593220338, "rewards/margins": 5.062146892655367, "rewards/rejected": -4.333333333333333, "step": 265 }, { "epoch": 0.1823791566678094, "grad_norm": 0.4778655950313047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300464686.54545456, "logits/rejected": -284400871.2258065, "logps/chosen": -256.969696969697, "logps/rejected": -337.03225806451616, "loss": 0.2358, "rewards/chosen": 0.5350378787878788, "rewards/margins": 4.914070136852395, "rewards/rejected": -4.379032258064516, "step": 266 }, { "epoch": 0.18306479259513198, "grad_norm": 0.5622542901506281, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290624677.16129035, "logits/rejected": -315716701.09090906, "logps/chosen": -313.03225806451616, "logps/rejected": -366.06060606060606, "loss": 0.2234, "rewards/chosen": 0.5884576612903226, "rewards/margins": 5.952094024926686, "rewards/rejected": -5.363636363636363, "step": 267 }, { "epoch": 0.18375042852245457, "grad_norm": 0.5202255053586239, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269034642.28571427, "logits/rejected": -237211192.8888889, "logps/chosen": -238.28571428571428, "logps/rejected": -275.3333333333333, "loss": 0.2065, "rewards/chosen": 0.5806361607142857, "rewards/margins": 4.913969494047619, "rewards/rejected": -4.333333333333333, "step": 268 }, { "epoch": 0.18443606444977717, "grad_norm": 0.4744927451740979, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267942008.47058824, "logits/rejected": -251098999.46666667, "logps/chosen": -240.23529411764707, "logps/rejected": -391.2, "loss": 0.2538, "rewards/chosen": 0.625, "rewards/margins": 5.575, "rewards/rejected": -4.95, "step": 269 }, { "epoch": 0.18512170037709977, "grad_norm": 0.4336474523183549, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256529044.6451613, "logits/rejected": -281399668.3636364, "logps/chosen": -277.80645161290323, "logps/rejected": -389.8181818181818, "loss": 0.1881, "rewards/chosen": 0.9657258064516129, "rewards/margins": 5.071786412512219, "rewards/rejected": -4.106060606060606, "step": 270 }, { "epoch": 0.18580733630442234, "grad_norm": 0.5417047023678242, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288873155.4909091, "logits/rejected": -245912618.0821918, "logps/chosen": -268.07272727272726, "logps/rejected": -376.54794520547944, "loss": 0.1731, "rewards/chosen": 0.7806818181818181, "rewards/margins": 6.287531133250311, "rewards/rejected": -5.506849315068493, "step": 271 }, { "epoch": 0.18649297223174494, "grad_norm": 0.5770493149407899, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283446649.2631579, "logits/rejected": -252839734.08450705, "logps/chosen": -250.10526315789474, "logps/rejected": -411.49295774647885, "loss": 0.1924, "rewards/chosen": 0.8125, "rewards/margins": 5.784330985915493, "rewards/rejected": -4.971830985915493, "step": 272 }, { "epoch": 0.18717860815906753, "grad_norm": 0.505144427334944, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -312708664.8888889, "logits/rejected": -320265069.71428573, "logps/chosen": -302.6666666666667, "logps/rejected": -294.57142857142856, "loss": 0.2267, "rewards/chosen": 0.9212239583333334, "rewards/margins": 5.912295386904762, "rewards/rejected": -4.991071428571429, "step": 273 }, { "epoch": 0.18786424408639013, "grad_norm": 0.5367594447172932, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -327270624.43835616, "logits/rejected": -274841302.1090909, "logps/chosen": -214.35616438356163, "logps/rejected": -302.54545454545456, "loss": 0.2316, "rewards/chosen": 0.7960188356164384, "rewards/margins": 6.718746108343711, "rewards/rejected": -5.922727272727273, "step": 274 }, { "epoch": 0.18854988001371273, "grad_norm": 0.4215475089731974, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237218832.78688523, "logits/rejected": -218354214.20895523, "logps/chosen": -240.78688524590163, "logps/rejected": -309.97014925373134, "loss": 0.2171, "rewards/chosen": 0.6142418032786885, "rewards/margins": 5.834391057010032, "rewards/rejected": -5.220149253731344, "step": 275 }, { "epoch": 0.1892355159410353, "grad_norm": 0.5349948963314487, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272756860.1212121, "logits/rejected": -231633820.9032258, "logps/chosen": -256.0, "logps/rejected": -374.4516129032258, "loss": 0.2169, "rewards/chosen": 0.8167613636363636, "rewards/margins": 5.744180718475073, "rewards/rejected": -4.92741935483871, "step": 276 }, { "epoch": 0.1899211518683579, "grad_norm": 0.4280603195808345, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269004175.1864407, "logits/rejected": -302962480.23188406, "logps/chosen": -256.271186440678, "logps/rejected": -303.07246376811594, "loss": 0.1982, "rewards/chosen": 0.9560381355932204, "rewards/margins": 6.86908161385409, "rewards/rejected": -5.913043478260869, "step": 277 }, { "epoch": 0.1906067877956805, "grad_norm": 0.44970061219668284, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233409338.3859649, "logits/rejected": -285921568.4507042, "logps/chosen": -316.35087719298247, "logps/rejected": -347.49295774647885, "loss": 0.1717, "rewards/chosen": 0.8569078947368421, "rewards/margins": 6.7160628243143075, "rewards/rejected": -5.859154929577465, "step": 278 }, { "epoch": 0.1912924237230031, "grad_norm": 0.47633322225973795, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267649024.0, "logits/rejected": -247726080.0, "logps/chosen": -261.125, "logps/rejected": -365.0, "loss": 0.1927, "rewards/chosen": 1.0009765625, "rewards/margins": 7.1103515625, "rewards/rejected": -6.109375, "step": 279 }, { "epoch": 0.1919780596503257, "grad_norm": 0.6194479135223209, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -322414324.8695652, "logits/rejected": -307392720.2711864, "logps/chosen": -291.0144927536232, "logps/rejected": -337.35593220338984, "loss": 0.2362, "rewards/chosen": 0.7800045289855072, "rewards/margins": 6.424072325595676, "rewards/rejected": -5.6440677966101696, "step": 280 }, { "epoch": 0.19266369557764826, "grad_norm": 0.5692153641034196, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -296143282.42424244, "logits/rejected": -284400871.2258065, "logps/chosen": -254.78787878787878, "logps/rejected": -306.83870967741933, "loss": 0.2334, "rewards/chosen": 0.5288825757575758, "rewards/margins": 4.867592253176931, "rewards/rejected": -4.338709677419355, "step": 281 }, { "epoch": 0.19334933150497086, "grad_norm": 0.5003904307635952, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233074246.8923077, "logits/rejected": -238875599.23809522, "logps/chosen": -213.66153846153847, "logps/rejected": -311.6190476190476, "loss": 0.2414, "rewards/chosen": 0.38317307692307695, "rewards/margins": 4.149046092796093, "rewards/rejected": -3.765873015873016, "step": 282 }, { "epoch": 0.19403496743229345, "grad_norm": 0.5017303865113214, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234881024.0, "logits/rejected": -281857228.8, "logps/chosen": -292.82191780821915, "logps/rejected": -324.3636363636364, "loss": 0.2459, "rewards/chosen": 0.4854452054794521, "rewards/margins": 6.221808841843089, "rewards/rejected": -5.736363636363636, "step": 283 }, { "epoch": 0.19472060335961605, "grad_norm": 0.43505138137663024, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261724569.6, "logits/rejected": -256451730.2857143, "logps/chosen": -244.92307692307693, "logps/rejected": -328.12698412698415, "loss": 0.1847, "rewards/chosen": 1.3076923076923077, "rewards/margins": 5.061660561660561, "rewards/rejected": -3.753968253968254, "step": 284 }, { "epoch": 0.19540623928693865, "grad_norm": 0.4372010553122799, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279087656.63492066, "logits/rejected": -294762779.5692308, "logps/chosen": -221.46031746031747, "logps/rejected": -378.0923076923077, "loss": 0.2034, "rewards/chosen": 0.9330357142857143, "rewards/margins": 6.471497252747253, "rewards/rejected": -5.538461538461538, "step": 285 }, { "epoch": 0.19609187521426122, "grad_norm": 0.49482497367285005, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257093112.78873238, "logits/rejected": -295514471.2982456, "logps/chosen": -254.19718309859155, "logps/rejected": -332.63157894736844, "loss": 0.2147, "rewards/chosen": 1.0008802816901408, "rewards/margins": 3.9087750185322463, "rewards/rejected": -2.9078947368421053, "step": 286 }, { "epoch": 0.19677751114158382, "grad_norm": 0.4817329554903424, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299386526.8965517, "logits/rejected": -281857228.8, "logps/chosen": -267.86206896551727, "logps/rejected": -299.42857142857144, "loss": 0.1921, "rewards/chosen": 0.8927801724137931, "rewards/margins": 5.535637315270936, "rewards/rejected": -4.642857142857143, "step": 287 }, { "epoch": 0.19746314706890641, "grad_norm": 0.4563180560928307, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289271675.87096775, "logits/rejected": -303515089.45454544, "logps/chosen": -320.7741935483871, "logps/rejected": -382.06060606060606, "loss": 0.1487, "rewards/chosen": 1.5997983870967742, "rewards/margins": 7.198283235581623, "rewards/rejected": -5.598484848484849, "step": 288 }, { "epoch": 0.198148782996229, "grad_norm": 0.6172387565280656, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -327426312.2580645, "logits/rejected": -384096566.3030303, "logps/chosen": -308.1290322580645, "logps/rejected": -405.3333333333333, "loss": 0.2178, "rewards/chosen": 0.7410534274193549, "rewards/margins": 5.703174639540567, "rewards/rejected": -4.962121212121212, "step": 289 }, { "epoch": 0.19883441892355158, "grad_norm": 0.5037994341967101, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273339565.2923077, "logits/rejected": -260180000.5079365, "logps/chosen": -245.90769230769232, "logps/rejected": -332.44444444444446, "loss": 0.2346, "rewards/chosen": 0.5175480769230769, "rewards/margins": 5.644532203907204, "rewards/rejected": -5.126984126984127, "step": 290 }, { "epoch": 0.19952005485087418, "grad_norm": 0.4379656861891776, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -284950528.0, "logits/rejected": -260571136.0, "logps/chosen": -279.25, "logps/rejected": -316.75, "loss": 0.2018, "rewards/chosen": 0.939453125, "rewards/margins": 21070266.939453125, "rewards/rejected": -21070266.0, "step": 291 }, { "epoch": 0.20020569077819678, "grad_norm": 0.4972915628179558, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261511525.58730158, "logits/rejected": -286503227.0769231, "logps/chosen": -277.8412698412698, "logps/rejected": -368.24615384615385, "loss": 0.2017, "rewards/chosen": 1.089781746031746, "rewards/margins": 6.505166361416362, "rewards/rejected": -5.415384615384616, "step": 292 }, { "epoch": 0.20089132670551937, "grad_norm": 0.5052960780135819, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304054776.1230769, "logits/rejected": -326223644.4444444, "logps/chosen": -216.12307692307692, "logps/rejected": -366.73015873015873, "loss": 0.2054, "rewards/chosen": 0.9519230769230769, "rewards/margins": 6.3011294261294255, "rewards/rejected": -5.349206349206349, "step": 293 }, { "epoch": 0.20157696263284197, "grad_norm": 0.41760595619475294, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270790152.98245615, "logits/rejected": -306243266.70422536, "logps/chosen": -242.3859649122807, "logps/rejected": -373.63380281690144, "loss": 0.1608, "rewards/chosen": 0.7982456140350878, "rewards/margins": 5.953175191499877, "rewards/rejected": -5.154929577464789, "step": 294 }, { "epoch": 0.20226259856016454, "grad_norm": 0.6249582722394987, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273816827.1698113, "logits/rejected": -315187964.58666664, "logps/chosen": -212.9811320754717, "logps/rejected": -366.50666666666666, "loss": 0.1949, "rewards/chosen": 0.7830188679245284, "rewards/margins": 6.409685534591195, "rewards/rejected": -5.626666666666667, "step": 295 }, { "epoch": 0.20294823448748714, "grad_norm": 0.5214642048933712, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -197394432.0, "logits/rejected": -208535552.0, "logps/chosen": -312.25, "logps/rejected": -275.0, "loss": 0.2002, "rewards/chosen": 0.4560546875, "rewards/margins": 5.2568359375, "rewards/rejected": -4.80078125, "step": 296 }, { "epoch": 0.20363387041480974, "grad_norm": 0.4918304453296973, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258885348.43076923, "logits/rejected": -252723460.06349206, "logps/chosen": -214.15384615384616, "logps/rejected": -305.77777777777777, "loss": 0.2072, "rewards/chosen": 0.6605769230769231, "rewards/margins": 6.192322954822955, "rewards/rejected": -5.531746031746032, "step": 297 }, { "epoch": 0.20431950634213233, "grad_norm": 0.5298032489431955, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -193583261.53846154, "logits/rejected": -256185425.26984128, "logps/chosen": -194.95384615384614, "logps/rejected": -283.6825396825397, "loss": 0.2223, "rewards/chosen": 0.7105769230769231, "rewards/margins": 5.571688034188034, "rewards/rejected": -4.861111111111111, "step": 298 }, { "epoch": 0.20500514226945493, "grad_norm": 0.4978603292638029, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245402328.94915253, "logits/rejected": -264058790.95652175, "logps/chosen": -285.2881355932203, "logps/rejected": -319.536231884058, "loss": 0.201, "rewards/chosen": 0.8384533898305084, "rewards/margins": 6.244250491279784, "rewards/rejected": -5.405797101449275, "step": 299 }, { "epoch": 0.2056907781967775, "grad_norm": 0.5178797202848506, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -282784390.7368421, "logits/rejected": -358129033.84615386, "logps/chosen": -284.63157894736844, "logps/rejected": -335.38461538461536, "loss": 0.2393, "rewards/chosen": 0.9634046052631579, "rewards/margins": 5.208596912955465, "rewards/rejected": -4.2451923076923075, "step": 300 }, { "epoch": 0.2063764141241001, "grad_norm": 0.5861287309314386, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -308530158.6440678, "logits/rejected": -275486749.68115944, "logps/chosen": -269.2881355932203, "logps/rejected": -380.7536231884058, "loss": 0.1864, "rewards/chosen": 1.0566737288135593, "rewards/margins": 6.469717207074428, "rewards/rejected": -5.413043478260869, "step": 301 }, { "epoch": 0.2070620500514227, "grad_norm": 0.48319157158134535, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -313849644.13793105, "logits/rejected": -276584389.48571426, "logps/chosen": -235.31034482758622, "logps/rejected": -350.62857142857143, "loss": 0.1643, "rewards/chosen": 1.2165948275862069, "rewards/margins": 6.198737684729064, "rewards/rejected": -4.982142857142857, "step": 302 }, { "epoch": 0.2077476859787453, "grad_norm": 0.49893395553143116, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269572894.37288135, "logits/rejected": -275486749.68115944, "logps/chosen": -274.1694915254237, "logps/rejected": -285.6811594202899, "loss": 0.1924, "rewards/chosen": 0.777542372881356, "rewards/margins": 6.197832227953819, "rewards/rejected": -5.420289855072464, "step": 303 }, { "epoch": 0.20843332190606786, "grad_norm": 0.5015913197535329, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259767227.73333332, "logits/rejected": -291380766.11764705, "logps/chosen": -283.2, "logps/rejected": -270.5882352941176, "loss": 0.1882, "rewards/chosen": 1.1635416666666667, "rewards/margins": 6.112071078431372, "rewards/rejected": -4.948529411764706, "step": 304 }, { "epoch": 0.20911895783339046, "grad_norm": 0.5028666234192135, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304248359.38461536, "logits/rejected": -337310342.7368421, "logps/chosen": -215.53846153846155, "logps/rejected": -313.6842105263158, "loss": 0.1758, "rewards/chosen": 1.1305588942307692, "rewards/margins": 6.341085210020243, "rewards/rejected": -5.2105263157894735, "step": 305 }, { "epoch": 0.20980459376071306, "grad_norm": 0.5762669282883881, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221511680.0, "logits/rejected": -260571136.0, "logps/chosen": -330.5, "logps/rejected": -364.5, "loss": 0.191, "rewards/chosen": 1.55859375, "rewards/margins": 6.51171875, "rewards/rejected": -4.953125, "step": 306 }, { "epoch": 0.21049022968803566, "grad_norm": 0.5910199390530045, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288889870.0273973, "logits/rejected": -298329404.5090909, "logps/chosen": -221.8082191780822, "logps/rejected": -311.8545454545455, "loss": 0.2449, "rewards/chosen": 1.029109589041096, "rewards/margins": 5.138200498132005, "rewards/rejected": -4.109090909090909, "step": 307 }, { "epoch": 0.21117586561535825, "grad_norm": 0.43601549790209104, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255972381.25714287, "logits/rejected": -237629016.27586207, "logps/chosen": -231.54285714285714, "logps/rejected": -312.82758620689657, "loss": 0.1939, "rewards/chosen": 1.0035714285714286, "rewards/margins": 6.4130541871921185, "rewards/rejected": -5.4094827586206895, "step": 308 }, { "epoch": 0.21186150154268082, "grad_norm": 0.5387777739073359, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -344457216.0, "logits/rejected": -359137280.0, "logps/chosen": -275.5, "logps/rejected": -419.5, "loss": 0.1877, "rewards/chosen": 1.2412109375, "rewards/margins": 31686993.241210938, "rewards/rejected": -31686992.0, "step": 309 }, { "epoch": 0.21254713747000342, "grad_norm": 0.580292450824243, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -350796334.54545456, "logits/rejected": -264241152.0, "logps/chosen": -304.969696969697, "logps/rejected": -356.1290322580645, "loss": 0.1921, "rewards/chosen": 1.2698863636363635, "rewards/margins": 6.237628299120234, "rewards/rejected": -4.967741935483871, "step": 310 }, { "epoch": 0.21323277339732602, "grad_norm": 0.5209160285375788, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -215544571.66101694, "logits/rejected": -241202873.50724638, "logps/chosen": -253.6949152542373, "logps/rejected": -295.42028985507244, "loss": 0.1756, "rewards/chosen": 1.2023305084745763, "rewards/margins": 7.390736305576025, "rewards/rejected": -6.188405797101449, "step": 311 }, { "epoch": 0.21391840932464862, "grad_norm": 0.6649014448145453, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -295047591.7241379, "logits/rejected": -269154479.54285717, "logps/chosen": -218.20689655172413, "logps/rejected": -323.65714285714284, "loss": 0.2243, "rewards/chosen": 0.7505387931034483, "rewards/margins": 5.864824507389162, "rewards/rejected": -5.114285714285714, "step": 312 }, { "epoch": 0.21460404525197121, "grad_norm": 0.5389284649890367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269894344.34782606, "logits/rejected": -259904668.20338982, "logps/chosen": -270.3768115942029, "logps/rejected": -312.135593220339, "loss": 0.2085, "rewards/chosen": 1.1557971014492754, "rewards/margins": 7.104949643822157, "rewards/rejected": -5.9491525423728815, "step": 313 }, { "epoch": 0.21528968117929378, "grad_norm": 0.5232870288268451, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -325219879.38461536, "logits/rejected": -365370481.7777778, "logps/chosen": -257.2307692307692, "logps/rejected": -355.04761904761904, "loss": 0.2038, "rewards/chosen": 1.0423076923076924, "rewards/margins": 4.6772283272283275, "rewards/rejected": -3.634920634920635, "step": 314 }, { "epoch": 0.21597531710661638, "grad_norm": 0.521217968048286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -298163505.4035088, "logits/rejected": -362954982.7605634, "logps/chosen": -282.94736842105266, "logps/rejected": -351.5492957746479, "loss": 0.1953, "rewards/chosen": 0.9237938596491229, "rewards/margins": 6.508300901902643, "rewards/rejected": -5.584507042253521, "step": 315 }, { "epoch": 0.21666095303393898, "grad_norm": 0.6591096793509256, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245243422.11764705, "logits/rejected": -307023052.8, "logps/chosen": -274.5882352941176, "logps/rejected": -347.73333333333335, "loss": 0.2268, "rewards/chosen": 0.9090073529411765, "rewards/margins": 5.04234068627451, "rewards/rejected": -4.133333333333334, "step": 316 }, { "epoch": 0.21734658896126158, "grad_norm": 0.4159162919403038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251658240.0, "logits/rejected": -277317511.5294118, "logps/chosen": -270.26666666666665, "logps/rejected": -316.94117647058823, "loss": 0.156, "rewards/chosen": 1.465625, "rewards/margins": 7.568566176470588, "rewards/rejected": -6.102941176470588, "step": 317 }, { "epoch": 0.21803222488858418, "grad_norm": 0.49486861544727767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258073057.88235295, "logits/rejected": -208317098.66666666, "logps/chosen": -177.1764705882353, "logps/rejected": -277.6, "loss": 0.238, "rewards/chosen": 0.7619485294117647, "rewards/margins": 5.870281862745098, "rewards/rejected": -5.108333333333333, "step": 318 }, { "epoch": 0.21871786081590674, "grad_norm": 0.4966593798881043, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274220702.8965517, "logits/rejected": -298394770.28571427, "logps/chosen": -209.10344827586206, "logps/rejected": -275.8857142857143, "loss": 0.2084, "rewards/chosen": 0.6422413793103449, "rewards/margins": 5.9708128078817735, "rewards/rejected": -5.328571428571428, "step": 319 }, { "epoch": 0.21940349674322934, "grad_norm": 0.484654789450681, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249141657.6, "logits/rejected": -226245692.2352941, "logps/chosen": -214.26666666666668, "logps/rejected": -374.11764705882354, "loss": 0.1869, "rewards/chosen": 0.7802083333333333, "rewards/margins": 6.5522671568627455, "rewards/rejected": -5.772058823529412, "step": 320 }, { "epoch": 0.22008913267055194, "grad_norm": 0.6171163471222523, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -222966769.15942028, "logits/rejected": -262463904.54237288, "logps/chosen": -278.0289855072464, "logps/rejected": -309.96610169491527, "loss": 0.2076, "rewards/chosen": 1.0960144927536233, "rewards/margins": 5.778217882584132, "rewards/rejected": -4.682203389830509, "step": 321 }, { "epoch": 0.22077476859787454, "grad_norm": 0.4986612266496611, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249794104.8888889, "logits/rejected": -265714826.3783784, "logps/chosen": -278.51851851851853, "logps/rejected": -353.2972972972973, "loss": 0.1766, "rewards/chosen": 1.0324074074074074, "rewards/margins": 6.46483983983984, "rewards/rejected": -5.4324324324324325, "step": 322 }, { "epoch": 0.2214604045251971, "grad_norm": 0.5621795143313154, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246432549.7704918, "logits/rejected": -269937893.25373137, "logps/chosen": -274.3606557377049, "logps/rejected": -306.86567164179104, "loss": 0.1654, "rewards/chosen": 1.4477459016393444, "rewards/margins": 6.764910080743823, "rewards/rejected": -5.317164179104478, "step": 323 }, { "epoch": 0.2221460404525197, "grad_norm": 0.473396664098212, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -231673615.05882353, "logits/rejected": -298354824.53333336, "logps/chosen": -222.58823529411765, "logps/rejected": -288.26666666666665, "loss": 0.2392, "rewards/chosen": 0.6107536764705882, "rewards/margins": 6.227420343137254, "rewards/rejected": -5.616666666666666, "step": 324 }, { "epoch": 0.2228316763798423, "grad_norm": 0.6220206943405606, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266432206.3283582, "logits/rejected": -253858530.62295082, "logps/chosen": -214.2089552238806, "logps/rejected": -324.72131147540983, "loss": 0.2214, "rewards/chosen": 0.5774253731343284, "rewards/margins": 4.167589307560558, "rewards/rejected": -3.5901639344262297, "step": 325 }, { "epoch": 0.2235173123071649, "grad_norm": 0.455275797495554, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245303233.93939394, "logits/rejected": -273306260.6451613, "logps/chosen": -294.54545454545456, "logps/rejected": -331.0967741935484, "loss": 0.1931, "rewards/chosen": 1.1193181818181819, "rewards/margins": 6.578995601173021, "rewards/rejected": -5.459677419354839, "step": 326 }, { "epoch": 0.2242029482344875, "grad_norm": 0.4834301344832099, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274307481.6, "logits/rejected": -293354556.2352941, "logps/chosen": -278.6666666666667, "logps/rejected": -332.2352941176471, "loss": 0.1681, "rewards/chosen": 1.2291666666666667, "rewards/margins": 7.229166666666667, "rewards/rejected": -6.0, "step": 327 }, { "epoch": 0.22488858416181007, "grad_norm": 0.6408325025445161, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278672401.3559322, "logits/rejected": -313174698.6666667, "logps/chosen": -288.8135593220339, "logps/rejected": -310.72463768115944, "loss": 0.1866, "rewards/chosen": 0.8463983050847458, "rewards/margins": 5.755818594939818, "rewards/rejected": -4.909420289855072, "step": 328 }, { "epoch": 0.22557422008913267, "grad_norm": 0.5418461535708504, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262334650.1818182, "logits/rejected": -287377474.0645161, "logps/chosen": -234.1818181818182, "logps/rejected": -294.7096774193548, "loss": 0.2258, "rewards/chosen": 0.49384469696969696, "rewards/margins": 6.913199535679374, "rewards/rejected": -6.419354838709677, "step": 329 }, { "epoch": 0.22625985601645526, "grad_norm": 0.4939695716623795, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256691404.8, "logits/rejected": -266708389.6470588, "logps/chosen": -248.0, "logps/rejected": -335.05882352941177, "loss": 0.2177, "rewards/chosen": 0.37552083333333336, "rewards/margins": 6.779932598039216, "rewards/rejected": -6.404411764705882, "step": 330 }, { "epoch": 0.22694549194377786, "grad_norm": 0.5187283098779368, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281153668.12903225, "logits/rejected": -285466872.24242425, "logps/chosen": -315.0967741935484, "logps/rejected": -414.7878787878788, "loss": 0.2078, "rewards/chosen": 0.6839717741935484, "rewards/margins": 7.214274804496578, "rewards/rejected": -6.53030303030303, "step": 331 }, { "epoch": 0.22763112787110046, "grad_norm": 0.6842684226789193, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254272871.06493506, "logits/rejected": -269093386.0392157, "logps/chosen": -259.94805194805195, "logps/rejected": -299.6078431372549, "loss": 0.2589, "rewards/chosen": 0.5135450487012987, "rewards/margins": 7.062564656544436, "rewards/rejected": -6.549019607843137, "step": 332 }, { "epoch": 0.22831676379842303, "grad_norm": 0.5077024692250621, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -320411929.0980392, "logits/rejected": -347745931.6363636, "logps/chosen": -300.2352941176471, "logps/rejected": -394.8051948051948, "loss": 0.1759, "rewards/chosen": 0.3869485294117647, "rewards/margins": 6.958377100840336, "rewards/rejected": -6.571428571428571, "step": 333 }, { "epoch": 0.22900239972574563, "grad_norm": 0.4849377608042863, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -222822400.0, "logits/rejected": -259260416.0, "logps/chosen": -279.5, "logps/rejected": -271.5, "loss": 0.2264, "rewards/chosen": 0.65185546875, "rewards/margins": 6.30029296875, "rewards/rejected": -5.6484375, "step": 334 }, { "epoch": 0.22968803565306822, "grad_norm": 0.48041336267309936, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279276118.64615387, "logits/rejected": -303587718.0952381, "logps/chosen": -381.2923076923077, "logps/rejected": -333.2063492063492, "loss": 0.1817, "rewards/chosen": 1.3, "rewards/margins": 6.796031746031746, "rewards/rejected": -5.496031746031746, "step": 335 }, { "epoch": 0.23037367158039082, "grad_norm": 0.508819714720495, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273748241.06666666, "logits/rejected": -283485605.64705884, "logps/chosen": -221.2, "logps/rejected": -316.2352941176471, "loss": 0.1989, "rewards/chosen": 0.7513020833333334, "rewards/margins": 6.850566789215686, "rewards/rejected": -6.099264705882353, "step": 336 }, { "epoch": 0.2310593075077134, "grad_norm": 0.4891853244668918, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276061463.27272725, "logits/rejected": -232582775.23287672, "logps/chosen": -279.8545454545455, "logps/rejected": -324.82191780821915, "loss": 0.1819, "rewards/chosen": 1.053409090909091, "rewards/margins": 6.779436488169365, "rewards/rejected": -5.726027397260274, "step": 337 }, { "epoch": 0.231744943435036, "grad_norm": 0.6255180569711202, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251425223.1111111, "logits/rejected": -325058560.0, "logps/chosen": -230.0, "logps/rejected": -356.85714285714283, "loss": 0.2383, "rewards/chosen": 0.6080729166666666, "rewards/margins": 7.56343005952381, "rewards/rejected": -6.955357142857143, "step": 338 }, { "epoch": 0.23243057936235859, "grad_norm": 0.5336565954475025, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300192329.14285713, "logits/rejected": -340204657.7777778, "logps/chosen": -307.42857142857144, "logps/rejected": -304.44444444444446, "loss": 0.1589, "rewards/chosen": 1.2494419642857142, "rewards/margins": 8.138330853174603, "rewards/rejected": -6.888888888888889, "step": 339 }, { "epoch": 0.23311621528968118, "grad_norm": 0.5322527086317685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262919658.95890412, "logits/rejected": -326088070.9818182, "logps/chosen": -308.6027397260274, "logps/rejected": -254.25454545454545, "loss": 0.2177, "rewards/chosen": 1.0830479452054795, "rewards/margins": 7.23304794520548, "rewards/rejected": -6.15, "step": 340 }, { "epoch": 0.23380185121700378, "grad_norm": 0.5203521146663478, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -225665370.14084506, "logits/rejected": -230760304.28070176, "logps/chosen": -253.9718309859155, "logps/rejected": -364.63157894736844, "loss": 0.2227, "rewards/chosen": 1.0325704225352113, "rewards/margins": 7.453623054114159, "rewards/rejected": -6.421052631578948, "step": 341 }, { "epoch": 0.23448748714432635, "grad_norm": 0.5488312273241793, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252257426.2857143, "logits/rejected": -307582293.3333333, "logps/chosen": -280.2857142857143, "logps/rejected": -367.1111111111111, "loss": 0.1782, "rewards/chosen": 0.9157366071428571, "rewards/margins": 6.881014384920634, "rewards/rejected": -5.965277777777778, "step": 342 }, { "epoch": 0.23517312307164895, "grad_norm": 0.46096764685146785, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232820664.14035088, "logits/rejected": -271743639.4366197, "logps/chosen": -269.89473684210526, "logps/rejected": -343.88732394366195, "loss": 0.163, "rewards/chosen": 1.0093201754385965, "rewards/margins": 7.671292006424512, "rewards/rejected": -6.661971830985915, "step": 343 }, { "epoch": 0.23585875899897155, "grad_norm": 0.5166898225427601, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -356096409.6, "logits/rejected": -334845269.3333333, "logps/chosen": -278.4, "logps/rejected": -369.0, "loss": 0.2275, "rewards/chosen": 1.051171875, "rewards/margins": 6.712630208333334, "rewards/rejected": -5.661458333333333, "step": 344 }, { "epoch": 0.23654439492629414, "grad_norm": 0.48446493085193093, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237555652.63768116, "logits/rejected": -234881024.0, "logps/chosen": -263.18840579710144, "logps/rejected": -477.2881355932203, "loss": 0.2008, "rewards/chosen": 1.0095108695652173, "rewards/margins": 6.594256632277082, "rewards/rejected": -5.584745762711864, "step": 345 }, { "epoch": 0.23723003085361674, "grad_norm": 0.4725440302155308, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254601611.22807017, "logits/rejected": -267962858.36619717, "logps/chosen": -259.0877192982456, "logps/rejected": -359.6619718309859, "loss": 0.1778, "rewards/chosen": 0.8799342105263158, "rewards/margins": 6.703877872498147, "rewards/rejected": -5.823943661971831, "step": 346 }, { "epoch": 0.2379156667809393, "grad_norm": 0.43496622473082314, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -307861913.6, "logits/rejected": -315312971.2941176, "logps/chosen": -292.53333333333336, "logps/rejected": -317.6470588235294, "loss": 0.1534, "rewards/chosen": 1.703125, "rewards/margins": 7.879595588235294, "rewards/rejected": -6.176470588235294, "step": 347 }, { "epoch": 0.2386013027082619, "grad_norm": 0.5103211147468771, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288517275.1515151, "logits/rejected": -314437499.87096775, "logps/chosen": -247.5151515151515, "logps/rejected": -338.5806451612903, "loss": 0.1939, "rewards/chosen": 1.0144412878787878, "rewards/margins": 5.345086449169111, "rewards/rejected": -4.330645161290323, "step": 348 }, { "epoch": 0.2392869386355845, "grad_norm": 0.5289997010687199, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243569225.14285713, "logits/rejected": -254454442.66666666, "logps/chosen": -290.2857142857143, "logps/rejected": -355.1111111111111, "loss": 0.1813, "rewards/chosen": 1.3191964285714286, "rewards/margins": 5.027529761904762, "rewards/rejected": -3.7083333333333335, "step": 349 }, { "epoch": 0.2399725745629071, "grad_norm": 0.4403926244327624, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262247138.62295082, "logits/rejected": -337046757.25373137, "logps/chosen": -277.7704918032787, "logps/rejected": -347.7014925373134, "loss": 0.1959, "rewards/chosen": 0.6737961065573771, "rewards/margins": 6.490960285661855, "rewards/rejected": -5.817164179104478, "step": 350 }, { "epoch": 0.2406582104902297, "grad_norm": 0.4991738057912124, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248728884.82539684, "logits/rejected": -272823343.26153845, "logps/chosen": -249.14285714285714, "logps/rejected": -346.0923076923077, "loss": 0.179, "rewards/chosen": 1.2251984126984128, "rewards/margins": 6.632890720390721, "rewards/rejected": -5.407692307692308, "step": 351 }, { "epoch": 0.24134384641755227, "grad_norm": 0.4739015773066563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -230387126.85714287, "logits/rejected": -294999381.3333333, "logps/chosen": -238.14285714285714, "logps/rejected": -355.55555555555554, "loss": 0.1707, "rewards/chosen": 1.0725446428571428, "rewards/margins": 6.815600198412698, "rewards/rejected": -5.743055555555555, "step": 352 }, { "epoch": 0.24202948234487487, "grad_norm": 0.5707229594682722, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257670075.73333332, "logits/rejected": -198612630.5882353, "logps/chosen": -248.26666666666668, "logps/rejected": -319.05882352941177, "loss": 0.1728, "rewards/chosen": 1.0643229166666666, "rewards/margins": 7.005499387254902, "rewards/rejected": -5.9411764705882355, "step": 353 }, { "epoch": 0.24271511827219747, "grad_norm": 0.451653353762594, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260293571.7647059, "logits/rejected": -335823940.26666665, "logps/chosen": -241.2941176470588, "logps/rejected": -409.6, "loss": 0.2103, "rewards/chosen": 0.9981617647058824, "rewards/margins": 5.773161764705883, "rewards/rejected": -4.775, "step": 354 }, { "epoch": 0.24340075419952006, "grad_norm": 0.46063205464212337, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261910983.1111111, "logits/rejected": -271609523.8918919, "logps/chosen": -293.6296296296296, "logps/rejected": -351.56756756756755, "loss": 0.1689, "rewards/chosen": 1.0243055555555556, "rewards/margins": 6.693224474474475, "rewards/rejected": -5.668918918918919, "step": 355 }, { "epoch": 0.24408639012684263, "grad_norm": 0.5400269982137276, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -298261617.7777778, "logits/rejected": -336451196.5405405, "logps/chosen": -312.2962962962963, "logps/rejected": -345.0810810810811, "loss": 0.162, "rewards/chosen": 1.0949074074074074, "rewards/margins": 5.412474974974975, "rewards/rejected": -4.3175675675675675, "step": 356 }, { "epoch": 0.24477202605416523, "grad_norm": 0.44810660400719277, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257250645.33333334, "logits/rejected": -222897298.2857143, "logps/chosen": -202.11111111111111, "logps/rejected": -381.14285714285717, "loss": 0.2286, "rewards/chosen": 0.6475694444444444, "rewards/margins": 6.692212301587301, "rewards/rejected": -6.044642857142857, "step": 357 }, { "epoch": 0.24545766198148783, "grad_norm": 0.4682556504237395, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263681911.46666667, "logits/rejected": -288913528.4705882, "logps/chosen": -308.8, "logps/rejected": -417.88235294117646, "loss": 0.1825, "rewards/chosen": 0.9182291666666667, "rewards/margins": 7.32999387254902, "rewards/rejected": -6.411764705882353, "step": 358 }, { "epoch": 0.24614329790881043, "grad_norm": 0.5359141037700903, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237635492.29850745, "logits/rejected": -279161872.78688526, "logps/chosen": -263.64179104477614, "logps/rejected": -312.655737704918, "loss": 0.1981, "rewards/chosen": 1.0139925373134329, "rewards/margins": 6.960713848788843, "rewards/rejected": -5.94672131147541, "step": 359 }, { "epoch": 0.24682893383613302, "grad_norm": 0.46663495855271825, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233528022.70967743, "logits/rejected": -319783904.969697, "logps/chosen": -242.06451612903226, "logps/rejected": -262.7878787878788, "loss": 0.1845, "rewards/chosen": 0.876008064516129, "rewards/margins": 6.5654020039100685, "rewards/rejected": -5.6893939393939394, "step": 360 }, { "epoch": 0.2475145697634556, "grad_norm": 0.5220705492363296, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251795758.16393444, "logits/rejected": -213596496.23880598, "logps/chosen": -265.8360655737705, "logps/rejected": -337.67164179104475, "loss": 0.2051, "rewards/chosen": 0.569672131147541, "rewards/margins": 6.85325422069978, "rewards/rejected": -6.2835820895522385, "step": 361 }, { "epoch": 0.2482002056907782, "grad_norm": 0.46336743786891227, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244426681.37931034, "logits/rejected": -262443593.14285713, "logps/chosen": -340.13793103448273, "logps/rejected": -315.42857142857144, "loss": 0.1693, "rewards/chosen": 1.2165948275862069, "rewards/margins": 8.238023399014779, "rewards/rejected": -7.021428571428571, "step": 362 }, { "epoch": 0.2488858416181008, "grad_norm": 0.39975249927876466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -325731231.3962264, "logits/rejected": -256803252.90666667, "logps/chosen": -203.32075471698113, "logps/rejected": -372.05333333333334, "loss": 0.1698, "rewards/chosen": 1.0737028301886793, "rewards/margins": 6.89370283018868, "rewards/rejected": -5.82, "step": 363 }, { "epoch": 0.2495714775454234, "grad_norm": 0.4738960402871566, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243540232.2580645, "logits/rejected": -290805077.3333333, "logps/chosen": -236.38709677419354, "logps/rejected": -376.72727272727275, "loss": 0.1898, "rewards/chosen": 0.7721774193548387, "rewards/margins": 6.764601661779081, "rewards/rejected": -5.992424242424242, "step": 364 }, { "epoch": 0.250257113472746, "grad_norm": 0.450073891880436, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220500553.14285713, "logits/rejected": -277469341.53846157, "logps/chosen": -244.57142857142858, "logps/rejected": -406.6461538461538, "loss": 0.1707, "rewards/chosen": 1.1369047619047619, "rewards/margins": 5.9753663003663, "rewards/rejected": -4.838461538461538, "step": 365 }, { "epoch": 0.25094274940006855, "grad_norm": 0.4172866033250963, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286432833.1636364, "logits/rejected": -297853040.2191781, "logps/chosen": -372.3636363636364, "logps/rejected": -354.63013698630135, "loss": 0.1605, "rewards/chosen": 1.4840909090909091, "rewards/margins": 7.812858032378581, "rewards/rejected": -6.328767123287672, "step": 366 }, { "epoch": 0.2516283853273912, "grad_norm": 0.48693263729852193, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277782762.05714285, "logits/rejected": -244715943.72413793, "logps/chosen": -304.9142857142857, "logps/rejected": -350.62068965517244, "loss": 0.2142, "rewards/chosen": 0.9767857142857143, "rewards/margins": 6.735406403940887, "rewards/rejected": -5.758620689655173, "step": 367 }, { "epoch": 0.25231402125471375, "grad_norm": 0.5960762424424358, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258674166.6909091, "logits/rejected": -223159955.28767124, "logps/chosen": -295.56363636363636, "logps/rejected": -366.90410958904107, "loss": 0.1604, "rewards/chosen": 1.2545454545454546, "rewards/margins": 7.816189290161893, "rewards/rejected": -6.561643835616438, "step": 368 }, { "epoch": 0.2529996571820363, "grad_norm": 0.5408093090023096, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -316012635.70149255, "logits/rejected": -272561000.91803277, "logps/chosen": -326.92537313432837, "logps/rejected": -296.91803278688525, "loss": 0.1866, "rewards/chosen": 1.2238805970149254, "rewards/margins": 7.781257646195254, "rewards/rejected": -6.557377049180328, "step": 369 }, { "epoch": 0.25368529310935894, "grad_norm": 0.4826369091454894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -339525354.30508476, "logits/rejected": -345756538.4347826, "logps/chosen": -312.40677966101697, "logps/rejected": -334.3768115942029, "loss": 0.1611, "rewards/chosen": 1.2584745762711864, "rewards/margins": 7.8526774748219115, "rewards/rejected": -6.594202898550725, "step": 370 }, { "epoch": 0.2543709290366815, "grad_norm": 0.5096643268960686, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253281841.5483871, "logits/rejected": -325630510.54545456, "logps/chosen": -286.19354838709677, "logps/rejected": -383.030303030303, "loss": 0.2023, "rewards/chosen": 0.8125, "rewards/margins": 6.994318181818182, "rewards/rejected": -6.181818181818182, "step": 371 }, { "epoch": 0.25505656496400414, "grad_norm": 0.4861420101132808, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266300173.96363637, "logits/rejected": -283603897.8630137, "logps/chosen": -196.07272727272726, "logps/rejected": -404.6027397260274, "loss": 0.1783, "rewards/chosen": 0.7801136363636364, "rewards/margins": 5.649976650062267, "rewards/rejected": -4.86986301369863, "step": 372 }, { "epoch": 0.2557422008913267, "grad_norm": 0.5162468449883798, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -313021482.08219177, "logits/rejected": -268740496.2909091, "logps/chosen": -248.986301369863, "logps/rejected": -382.25454545454545, "loss": 0.2184, "rewards/chosen": 1.0214041095890412, "rewards/margins": 7.248676836861769, "rewards/rejected": -6.2272727272727275, "step": 373 }, { "epoch": 0.2564278368186493, "grad_norm": 0.4617431324787613, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235191713.1851852, "logits/rejected": -211755672.2162162, "logps/chosen": -313.48148148148147, "logps/rejected": -336.86486486486484, "loss": 0.1391, "rewards/chosen": 1.1539351851851851, "rewards/margins": 8.072854104104104, "rewards/rejected": -6.918918918918919, "step": 374 }, { "epoch": 0.2571134727459719, "grad_norm": 0.4720292265270994, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257093112.78873238, "logits/rejected": -237530058.10526314, "logps/chosen": -193.80281690140845, "logps/rejected": -362.10526315789474, "loss": 0.1946, "rewards/chosen": 1.1566901408450705, "rewards/margins": 7.665462070669632, "rewards/rejected": -6.508771929824562, "step": 375 }, { "epoch": 0.2577991086732945, "grad_norm": 0.49655404579976387, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268435456.0, "logits/rejected": -256608893.90163934, "logps/chosen": -264.5970149253731, "logps/rejected": -382.42622950819674, "loss": 0.186, "rewards/chosen": 1.1184701492537314, "rewards/margins": 6.266011132860289, "rewards/rejected": -5.147540983606557, "step": 376 }, { "epoch": 0.25848474460061704, "grad_norm": 0.5618972543642542, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266729298.44067797, "logits/rejected": -315119883.1304348, "logps/chosen": -277.4237288135593, "logps/rejected": -279.18840579710144, "loss": 0.1852, "rewards/chosen": 0.8532838983050848, "rewards/margins": 12932288.853283899, "rewards/rejected": -12932288.0, "step": 377 }, { "epoch": 0.25917038052793967, "grad_norm": 0.442895657488422, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299593142.85714287, "logits/rejected": -301215554.95384616, "logps/chosen": -253.96825396825398, "logps/rejected": -359.87692307692305, "loss": 0.1854, "rewards/chosen": 1.0992063492063493, "rewards/margins": 7.22997557997558, "rewards/rejected": -6.130769230769231, "step": 378 }, { "epoch": 0.25985601645526224, "grad_norm": 0.4362075050715723, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248952237.41935483, "logits/rejected": -307836493.57575756, "logps/chosen": -224.7741935483871, "logps/rejected": -362.1818181818182, "loss": 0.195, "rewards/chosen": 0.836945564516129, "rewards/margins": 6.586945564516129, "rewards/rejected": -5.75, "step": 379 }, { "epoch": 0.26054165238258487, "grad_norm": 0.5584235048964092, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -324980887.7037037, "logits/rejected": -315139597.8378378, "logps/chosen": -355.25925925925924, "logps/rejected": -312.64864864864865, "loss": 0.1736, "rewards/chosen": 1.1099537037037037, "rewards/margins": 6.745088838838839, "rewards/rejected": -5.635135135135135, "step": 380 }, { "epoch": 0.26122728830990743, "grad_norm": 0.43225182915455873, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -225529788.852459, "logits/rejected": -265430581.49253732, "logps/chosen": -180.98360655737704, "logps/rejected": -324.2985074626866, "loss": 0.2167, "rewards/chosen": 0.5138319672131147, "rewards/margins": 5.372040922436995, "rewards/rejected": -4.858208955223881, "step": 381 }, { "epoch": 0.26191292423723, "grad_norm": 0.6319361988002582, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257425408.0, "logits/rejected": -266338304.0, "logps/chosen": -287.25, "logps/rejected": -331.0, "loss": 0.2115, "rewards/chosen": 0.68536376953125, "rewards/margins": 5.47442626953125, "rewards/rejected": -4.7890625, "step": 382 }, { "epoch": 0.26259856016455263, "grad_norm": 0.4893114035798379, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270592526.62857145, "logits/rejected": -308353659.5862069, "logps/chosen": -298.0571428571429, "logps/rejected": -352.55172413793105, "loss": 0.203, "rewards/chosen": 1.18125, "rewards/margins": 7.379525862068965, "rewards/rejected": -6.198275862068965, "step": 383 }, { "epoch": 0.2632841960918752, "grad_norm": 0.5025288400699698, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -208747283.69230768, "logits/rejected": -244594149.0526316, "logps/chosen": -274.7692307692308, "logps/rejected": -379.36842105263156, "loss": 0.1855, "rewards/chosen": 0.734375, "rewards/margins": 6.490953947368421, "rewards/rejected": -5.756578947368421, "step": 384 }, { "epoch": 0.2639698320191978, "grad_norm": 0.48548144249274183, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245721686.64615384, "logits/rejected": -261245220.57142857, "logps/chosen": -243.44615384615383, "logps/rejected": -374.85714285714283, "loss": 0.1931, "rewards/chosen": 0.9519230769230769, "rewards/margins": 7.658272283272283, "rewards/rejected": -6.7063492063492065, "step": 385 }, { "epoch": 0.2646554679465204, "grad_norm": 0.44412357236828887, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -311576868.5714286, "logits/rejected": -243269632.0, "logps/chosen": -293.42857142857144, "logps/rejected": -332.0, "loss": 0.1828, "rewards/chosen": 0.7945382254464286, "rewards/margins": 5.245927114335318, "rewards/rejected": -4.451388888888889, "step": 386 }, { "epoch": 0.26534110387384297, "grad_norm": 0.5536045460220763, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -353793221.61403507, "logits/rejected": -324438275.6056338, "logps/chosen": -362.10526315789474, "logps/rejected": -318.8732394366197, "loss": 0.1715, "rewards/chosen": 1.344298245614035, "rewards/margins": 7.46401655547319, "rewards/rejected": -6.119718309859155, "step": 387 }, { "epoch": 0.2660267398011656, "grad_norm": 0.5567354418651393, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241104829.93548387, "logits/rejected": -274536261.8181818, "logps/chosen": -262.96774193548384, "logps/rejected": -277.09090909090907, "loss": 0.1963, "rewards/chosen": 1.091733870967742, "rewards/margins": 5.3190065982404695, "rewards/rejected": -4.2272727272727275, "step": 388 }, { "epoch": 0.26671237572848816, "grad_norm": 0.4900702145485347, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227509216.96969697, "logits/rejected": -290354076.9032258, "logps/chosen": -264.0, "logps/rejected": -333.93548387096774, "loss": 0.1922, "rewards/chosen": 1.0634469696969697, "rewards/margins": 7.127963098729228, "rewards/rejected": -6.064516129032258, "step": 389 }, { "epoch": 0.2673980116558108, "grad_norm": 0.5118150148770332, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273161432.3380282, "logits/rejected": -329068903.2982456, "logps/chosen": -347.0422535211268, "logps/rejected": -341.6140350877193, "loss": 0.214, "rewards/chosen": 1.0631602112676057, "rewards/margins": 7.098247930565852, "rewards/rejected": -6.035087719298246, "step": 390 }, { "epoch": 0.26808364758313336, "grad_norm": 0.5566984643367753, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290958293.91780823, "logits/rejected": -322732627.7818182, "logps/chosen": -288.6575342465753, "logps/rejected": -353.74545454545455, "loss": 0.1887, "rewards/chosen": 1.321917808219178, "rewards/margins": 8.458281444582815, "rewards/rejected": -7.136363636363637, "step": 391 }, { "epoch": 0.2687692835104559, "grad_norm": 0.45167862203996084, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -275672686.4313725, "logits/rejected": -298285827.3246753, "logps/chosen": -319.6862745098039, "logps/rejected": -369.45454545454544, "loss": 0.1542, "rewards/chosen": 1.588235294117647, "rewards/margins": 7.867456073338426, "rewards/rejected": -6.279220779220779, "step": 392 }, { "epoch": 0.26945491943777855, "grad_norm": 0.5482849312536983, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290106026.6666667, "logits/rejected": -287010230.85714287, "logps/chosen": -296.0, "logps/rejected": -346.2857142857143, "loss": 0.2117, "rewards/chosen": 1.0147569444444444, "rewards/margins": 7.635292658730159, "rewards/rejected": -6.620535714285714, "step": 393 }, { "epoch": 0.2701405553651011, "grad_norm": 0.4322637286159423, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -215260212.06779662, "logits/rejected": -264545087.07246378, "logps/chosen": -275.2542372881356, "logps/rejected": -327.42028985507244, "loss": 0.1611, "rewards/chosen": 1.3940677966101696, "rewards/margins": 7.241893883566691, "rewards/rejected": -5.8478260869565215, "step": 394 }, { "epoch": 0.27082619129242375, "grad_norm": 0.4635669957570222, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -213475610.4827586, "logits/rejected": -297915421.25714284, "logps/chosen": -278.3448275862069, "logps/rejected": -362.9714285714286, "loss": 0.1455, "rewards/chosen": 1.540948275862069, "rewards/margins": 8.083805418719212, "rewards/rejected": -6.542857142857143, "step": 395 }, { "epoch": 0.2715118272197463, "grad_norm": 0.4093295393138275, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299554485.67741936, "logits/rejected": -317750303.030303, "logps/chosen": -262.4516129032258, "logps/rejected": -360.72727272727275, "loss": 0.1777, "rewards/chosen": 1.0665322580645162, "rewards/margins": 7.415017106549365, "rewards/rejected": -6.348484848484849, "step": 396 }, { "epoch": 0.2721974631470689, "grad_norm": 0.4659088764147217, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -204122794.66666666, "logits/rejected": -273576860.9032258, "logps/chosen": -311.5151515151515, "logps/rejected": -329.2903225806452, "loss": 0.1863, "rewards/chosen": 1.152462121212121, "rewards/margins": 7.781494379276637, "rewards/rejected": -6.629032258064516, "step": 397 }, { "epoch": 0.2728830990743915, "grad_norm": 0.4835528444451089, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -320831992.1230769, "logits/rejected": -311576868.5714286, "logps/chosen": -260.9230769230769, "logps/rejected": -340.3174603174603, "loss": 0.2105, "rewards/chosen": 0.5828125, "rewards/margins": 7.543129960317461, "rewards/rejected": -6.9603174603174605, "step": 398 }, { "epoch": 0.2735687350017141, "grad_norm": 0.49921968137895006, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -347721331.61290324, "logits/rejected": -359947543.27272725, "logps/chosen": -323.61290322580646, "logps/rejected": -301.57575757575756, "loss": 0.2062, "rewards/chosen": 0.8790322580645161, "rewards/margins": 5.33357771260997, "rewards/rejected": -4.454545454545454, "step": 399 }, { "epoch": 0.2742543709290367, "grad_norm": 0.5459890078170968, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245975634.58064517, "logits/rejected": -294872281.2121212, "logps/chosen": -261.6774193548387, "logps/rejected": -373.3333333333333, "loss": 0.1859, "rewards/chosen": 0.8770161290322581, "rewards/margins": 6.3542888563049855, "rewards/rejected": -5.4772727272727275, "step": 400 }, { "epoch": 0.2749400068563593, "grad_norm": 0.4412097693846523, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -296397482.6666667, "logits/rejected": -286295073.0322581, "logps/chosen": -265.2121212121212, "logps/rejected": -333.4193548387097, "loss": 0.1875, "rewards/chosen": 0.9564393939393939, "rewards/margins": 8.037084555229717, "rewards/rejected": -7.080645161290323, "step": 401 }, { "epoch": 0.27562564278368185, "grad_norm": 0.5659770038526539, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260526197.02857143, "logits/rejected": -256575699.86206895, "logps/chosen": -274.51428571428573, "logps/rejected": -355.86206896551727, "loss": 0.2268, "rewards/chosen": 0.684375, "rewards/margins": 7.261961206896552, "rewards/rejected": -6.577586206896552, "step": 402 }, { "epoch": 0.27631127871100447, "grad_norm": 0.527412921991481, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238192316.63157895, "logits/rejected": -247141297.23076922, "logps/chosen": -293.05263157894734, "logps/rejected": -345.2307692307692, "loss": 0.2378, "rewards/chosen": 1.004111842105263, "rewards/margins": 7.282957995951417, "rewards/rejected": -6.278846153846154, "step": 403 }, { "epoch": 0.27699691463832704, "grad_norm": 0.46082525688319587, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257716679.1111111, "logits/rejected": -270233014.85714287, "logps/chosen": -232.0, "logps/rejected": -300.57142857142856, "loss": 0.2104, "rewards/chosen": 0.9314236111111112, "rewards/margins": 7.976066468253968, "rewards/rejected": -7.044642857142857, "step": 404 }, { "epoch": 0.27768255056564967, "grad_norm": 0.49530790103997036, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -284680061.96825397, "logits/rejected": -276953119.50769234, "logps/chosen": -214.0952380952381, "logps/rejected": -360.3692307692308, "loss": 0.1915, "rewards/chosen": 1.1924603174603174, "rewards/margins": 5.546306471306471, "rewards/rejected": -4.3538461538461535, "step": 405 }, { "epoch": 0.27836818649297224, "grad_norm": 0.4642271030902599, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -301465600.0, "logits/rejected": -299892736.0, "logps/chosen": -255.0, "logps/rejected": -374.75, "loss": 0.1793, "rewards/chosen": 1.0927734375, "rewards/margins": 6.3662109375, "rewards/rejected": -5.2734375, "step": 406 }, { "epoch": 0.2790538224202948, "grad_norm": 0.4522525264395138, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -199753728.0, "logits/rejected": -200540160.0, "logps/chosen": -297.25, "logps/rejected": -363.25, "loss": 0.1759, "rewards/chosen": 1.326171875, "rewards/margins": 8.654296875, "rewards/rejected": -7.328125, "step": 407 }, { "epoch": 0.27973945834761743, "grad_norm": 0.4168861027010473, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280231936.0, "logits/rejected": -252968960.0, "logps/chosen": -269.5, "logps/rejected": -392.0, "loss": 0.1881, "rewards/chosen": 1.1181640625, "rewards/margins": 7.4462890625, "rewards/rejected": -6.328125, "step": 408 }, { "epoch": 0.28042509427494, "grad_norm": 0.4514429466626167, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -194426285.41935483, "logits/rejected": -268181255.75757575, "logps/chosen": -247.48387096774192, "logps/rejected": -382.06060606060606, "loss": 0.1602, "rewards/chosen": 1.3397177419354838, "rewards/margins": 6.074566226783968, "rewards/rejected": -4.734848484848484, "step": 409 }, { "epoch": 0.28111073020226257, "grad_norm": 0.5197229207819042, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274514157.4492754, "logits/rejected": -280094199.3220339, "logps/chosen": -318.1449275362319, "logps/rejected": -341.6949152542373, "loss": 0.1872, "rewards/chosen": 1.2835144927536233, "rewards/margins": 7.308938221567183, "rewards/rejected": -6.02542372881356, "step": 410 }, { "epoch": 0.2817963661295852, "grad_norm": 0.549253165228203, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276577340.2352941, "logits/rejected": -293321659.73333335, "logps/chosen": -255.52941176470588, "logps/rejected": -333.06666666666666, "loss": 0.1923, "rewards/chosen": 1.2113970588235294, "rewards/margins": 7.34889705882353, "rewards/rejected": -6.1375, "step": 411 }, { "epoch": 0.28248200205690777, "grad_norm": 0.4578704629668356, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299239524.72131145, "logits/rejected": -369599564.41791046, "logps/chosen": -241.04918032786884, "logps/rejected": -333.13432835820896, "loss": 0.2057, "rewards/chosen": 1.026639344262295, "rewards/margins": 6.5490274039637875, "rewards/rejected": -5.522388059701493, "step": 412 }, { "epoch": 0.2831676379842304, "grad_norm": 0.48962418400117635, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272509922.74285716, "logits/rejected": -251658240.0, "logps/chosen": -233.6, "logps/rejected": -444.13793103448273, "loss": 0.2029, "rewards/chosen": 1.1162946428571427, "rewards/margins": 5.883536022167488, "rewards/rejected": -4.767241379310345, "step": 413 }, { "epoch": 0.28385327391155296, "grad_norm": 0.4653713492186648, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235374471.52941176, "logits/rejected": -230127479.46666667, "logps/chosen": -257.1764705882353, "logps/rejected": -281.3333333333333, "loss": 0.2222, "rewards/chosen": 0.9924172794117647, "rewards/margins": 6.459083946078431, "rewards/rejected": -5.466666666666667, "step": 414 }, { "epoch": 0.28453890983887553, "grad_norm": 0.42368534587202356, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297668483.8787879, "logits/rejected": -246787435.3548387, "logps/chosen": -210.9090909090909, "logps/rejected": -298.06451612903226, "loss": 0.1818, "rewards/chosen": 1.1382575757575757, "rewards/margins": 6.815676930596286, "rewards/rejected": -5.67741935483871, "step": 415 }, { "epoch": 0.28522454576619816, "grad_norm": 0.5841827877047482, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273416192.0, "logits/rejected": -266600448.0, "logps/chosen": -259.75, "logps/rejected": -296.0, "loss": 0.1623, "rewards/chosen": 1.5009765625, "rewards/margins": 7.1337890625, "rewards/rejected": -5.6328125, "step": 416 }, { "epoch": 0.2859101816935207, "grad_norm": 0.6110328674998029, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243828872.53333333, "logits/rejected": -306924363.2941176, "logps/chosen": -275.73333333333335, "logps/rejected": -312.94117647058823, "loss": 0.2073, "rewards/chosen": 0.6973958333333333, "rewards/margins": 6.388572303921569, "rewards/rejected": -5.6911764705882355, "step": 417 }, { "epoch": 0.28659581762084335, "grad_norm": 0.4658773207224201, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269024130.24561405, "logits/rejected": -279305201.57746476, "logps/chosen": -352.280701754386, "logps/rejected": -381.2957746478873, "loss": 0.1631, "rewards/chosen": 1.5350877192982457, "rewards/margins": 7.077341240425007, "rewards/rejected": -5.542253521126761, "step": 418 }, { "epoch": 0.2872814535481659, "grad_norm": 0.46724885472739985, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288744717.4736842, "logits/rejected": -284976373.1830986, "logps/chosen": -275.64912280701753, "logps/rejected": -345.9154929577465, "loss": 0.1846, "rewards/chosen": 1.006030701754386, "rewards/margins": 7.090537744007907, "rewards/rejected": -6.084507042253521, "step": 419 }, { "epoch": 0.2879670894754885, "grad_norm": 0.5012959342491994, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -325844992.0, "logits/rejected": -308543488.0, "logps/chosen": -289.75, "logps/rejected": -292.25, "loss": 0.1863, "rewards/chosen": 1.0048828125, "rewards/margins": 7.2236328125, "rewards/rejected": -6.21875, "step": 420 }, { "epoch": 0.2886527254028111, "grad_norm": 0.4770539929134645, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -282416469.3333333, "logits/rejected": -280771644.2352941, "logps/chosen": -276.8, "logps/rejected": -373.6470588235294, "loss": 0.1808, "rewards/chosen": 1.4005208333333334, "rewards/margins": 7.194638480392157, "rewards/rejected": -5.794117647058823, "step": 421 }, { "epoch": 0.2893383613301337, "grad_norm": 0.5539540473367863, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251396096.0, "logits/rejected": -325582848.0, "logps/chosen": -267.5, "logps/rejected": -306.0, "loss": 0.1806, "rewards/chosen": 1.099609375, "rewards/margins": 7.560546875, "rewards/rejected": -6.4609375, "step": 422 }, { "epoch": 0.2900239972574563, "grad_norm": 0.6407312118572804, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253815310.62857142, "logits/rejected": -282898573.2413793, "logps/chosen": -204.34285714285716, "logps/rejected": -370.7586206896552, "loss": 0.2247, "rewards/chosen": 0.8200892857142857, "rewards/margins": 7.139054802955664, "rewards/rejected": -6.318965517241379, "step": 423 }, { "epoch": 0.2907096331847789, "grad_norm": 0.5491748878266259, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265439524.57142857, "logits/rejected": -333913201.7777778, "logps/chosen": -334.85714285714283, "logps/rejected": -359.55555555555554, "loss": 0.1567, "rewards/chosen": 1.1674107142857142, "rewards/margins": 6.966021825396825, "rewards/rejected": -5.798611111111111, "step": 424 }, { "epoch": 0.29139526911210145, "grad_norm": 0.5574466289609247, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -350156733.9354839, "logits/rejected": -257250645.33333334, "logps/chosen": -313.03225806451616, "logps/rejected": -377.2121212121212, "loss": 0.1913, "rewards/chosen": 1.28125, "rewards/margins": 7.364583333333333, "rewards/rejected": -6.083333333333333, "step": 425 }, { "epoch": 0.2920809050394241, "grad_norm": 0.489669348547015, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300262821.64705884, "logits/rejected": -274587101.8666667, "logps/chosen": -246.35294117647058, "logps/rejected": -367.46666666666664, "loss": 0.2074, "rewards/chosen": 1.1792279411764706, "rewards/margins": 6.7875612745098035, "rewards/rejected": -5.608333333333333, "step": 426 }, { "epoch": 0.29276654096674665, "grad_norm": 0.3771952408565762, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257588118.06896552, "logits/rejected": -243509306.5142857, "logps/chosen": -243.0344827586207, "logps/rejected": -349.7142857142857, "loss": 0.165, "rewards/chosen": 1.0355603448275863, "rewards/margins": 7.2998460591133005, "rewards/rejected": -6.264285714285714, "step": 427 }, { "epoch": 0.29345217689406927, "grad_norm": 0.5085929308791385, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -242843092.6101695, "logits/rejected": -366424123.3623188, "logps/chosen": -248.27118644067798, "logps/rejected": -355.71014492753625, "loss": 0.1888, "rewards/chosen": 1.0985169491525424, "rewards/margins": 7.192719847703267, "rewards/rejected": -6.094202898550725, "step": 428 }, { "epoch": 0.29413781282139184, "grad_norm": 0.4782195544747323, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292402907.4285714, "logits/rejected": -310249440.49230766, "logps/chosen": -288.5079365079365, "logps/rejected": -335.26153846153846, "loss": 0.1976, "rewards/chosen": 0.44742063492063494, "rewards/margins": 6.570497557997558, "rewards/rejected": -6.123076923076923, "step": 429 }, { "epoch": 0.2948234487487144, "grad_norm": 0.6353394775103036, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249036800.0, "logits/rejected": -235667456.0, "logps/chosen": -338.0, "logps/rejected": -355.0, "loss": 0.1936, "rewards/chosen": 1.4033203125, "rewards/margins": 7.5908203125, "rewards/rejected": -6.1875, "step": 430 }, { "epoch": 0.29550908467603704, "grad_norm": 0.5586453599144835, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243531776.0, "logits/rejected": -321912832.0, "logps/chosen": -265.25, "logps/rejected": -308.5, "loss": 0.17, "rewards/chosen": 1.337890625, "rewards/margins": 8.306640625, "rewards/rejected": -6.96875, "step": 431 }, { "epoch": 0.2961947206033596, "grad_norm": 0.5399588222213617, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -242624354.46153846, "logits/rejected": -271364811.17460316, "logps/chosen": -278.89230769230767, "logps/rejected": -341.8412698412698, "loss": 0.1974, "rewards/chosen": 1.114423076923077, "rewards/margins": 8.1382326007326, "rewards/rejected": -7.023809523809524, "step": 432 }, { "epoch": 0.29688035653068223, "grad_norm": 0.6193917851340883, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227586582.26086956, "logits/rejected": -272416490.30508476, "logps/chosen": -236.7536231884058, "logps/rejected": -365.5593220338983, "loss": 0.2462, "rewards/chosen": 0.11993319746376811, "rewards/margins": 5.831797604243429, "rewards/rejected": -5.711864406779661, "step": 433 }, { "epoch": 0.2975659924580048, "grad_norm": 1.2425861040098105, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264584947.40983605, "logits/rejected": -288718358.92537314, "logps/chosen": -286.1639344262295, "logps/rejected": -284.8955223880597, "loss": 0.1621, "rewards/chosen": 1.3934426229508197, "rewards/margins": 7.647173966234401, "rewards/rejected": -6.253731343283582, "step": 434 }, { "epoch": 0.29825162838532737, "grad_norm": 0.580762461281743, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -311365391.0588235, "logits/rejected": -339459003.73333335, "logps/chosen": -196.23529411764707, "logps/rejected": -334.4, "loss": 0.2549, "rewards/chosen": 0.37637867647058826, "rewards/margins": 6.2263786764705875, "rewards/rejected": -5.85, "step": 435 }, { "epoch": 0.29893726431265, "grad_norm": 0.5324204343314699, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -298964488.39344263, "logits/rejected": -224864775.64179105, "logps/chosen": -308.72131147540983, "logps/rejected": -347.7014925373134, "loss": 0.1656, "rewards/chosen": 1.3299180327868851, "rewards/margins": 7.777679226816736, "rewards/rejected": -6.447761194029851, "step": 436 }, { "epoch": 0.29962290023997257, "grad_norm": 0.5011490610152931, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270503879.89041096, "logits/rejected": -269045536.58181816, "logps/chosen": -221.8082191780822, "logps/rejected": -288.8727272727273, "loss": 0.1963, "rewards/chosen": 1.3672945205479452, "rewards/margins": 7.149112702366128, "rewards/rejected": -5.781818181818182, "step": 437 }, { "epoch": 0.3003085361672952, "grad_norm": 0.5723826275492535, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -293274451.1168831, "logits/rejected": -221064493.17647058, "logps/chosen": -253.2987012987013, "logps/rejected": -416.62745098039215, "loss": 0.2207, "rewards/chosen": 1.0089285714285714, "rewards/margins": 8.332457983193278, "rewards/rejected": -7.323529411764706, "step": 438 }, { "epoch": 0.30099417209461776, "grad_norm": 0.5933008565269842, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289970389.9701493, "logits/rejected": -264584947.40983605, "logps/chosen": -217.07462686567163, "logps/rejected": -259.9344262295082, "loss": 0.209, "rewards/chosen": 1.0354477611940298, "rewards/margins": 6.674792023489112, "rewards/rejected": -5.639344262295082, "step": 439 }, { "epoch": 0.30167980802194033, "grad_norm": 0.4757748108823095, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257487611.66101694, "logits/rejected": -310135347.942029, "logps/chosen": -287.59322033898303, "logps/rejected": -294.4927536231884, "loss": 0.1562, "rewards/chosen": 0.6822033898305084, "rewards/margins": 7.421333824613117, "rewards/rejected": -6.739130434782608, "step": 440 }, { "epoch": 0.30236544394926296, "grad_norm": 0.4977788407811517, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238263527.22580644, "logits/rejected": -297668483.8787879, "logps/chosen": -197.41935483870967, "logps/rejected": -298.90909090909093, "loss": 0.1859, "rewards/chosen": 0.8689516129032258, "rewards/margins": 7.111375855327468, "rewards/rejected": -6.242424242424242, "step": 441 }, { "epoch": 0.3030510798765855, "grad_norm": 0.6153564960139174, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234660271.15789473, "logits/rejected": -254884627.69230768, "logps/chosen": -248.6315789473684, "logps/rejected": -389.2307692307692, "loss": 0.2269, "rewards/chosen": 0.9029605263157895, "rewards/margins": 5.729883603238866, "rewards/rejected": -4.826923076923077, "step": 442 }, { "epoch": 0.3037367158039081, "grad_norm": 0.5670612920693815, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -333510718.06060606, "logits/rejected": -323908508.9032258, "logps/chosen": -225.21212121212122, "logps/rejected": -418.06451612903226, "loss": 0.1989, "rewards/chosen": 1.15625, "rewards/margins": 7.15625, "rewards/rejected": -6.0, "step": 443 }, { "epoch": 0.3044223517312307, "grad_norm": 0.4567161861849896, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272105472.0, "logits/rejected": -265551872.0, "logps/chosen": -243.0, "logps/rejected": -313.5, "loss": 0.1671, "rewards/chosen": 1.642578125, "rewards/margins": 7.490234375, "rewards/rejected": -5.84765625, "step": 444 }, { "epoch": 0.3051079876585533, "grad_norm": 0.5332930335664174, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288826226.2153846, "logits/rejected": -289207247.2380952, "logps/chosen": -253.04615384615386, "logps/rejected": -438.85714285714283, "loss": 0.2058, "rewards/chosen": 0.9521634615384615, "rewards/margins": 5.341052350427351, "rewards/rejected": -4.388888888888889, "step": 445 }, { "epoch": 0.3057936235858759, "grad_norm": 0.5286080422013268, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -205333091.3432836, "logits/rejected": -252758385.3114754, "logps/chosen": -246.2089552238806, "logps/rejected": -302.6885245901639, "loss": 0.1991, "rewards/chosen": 1.2490671641791045, "rewards/margins": 7.1179196231954975, "rewards/rejected": -5.868852459016393, "step": 446 }, { "epoch": 0.3064792595131985, "grad_norm": 0.48344180270306064, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250199351.6521739, "logits/rejected": -263316983.3220339, "logps/chosen": -307.4782608695652, "logps/rejected": -297.49152542372883, "loss": 0.2004, "rewards/chosen": 1.2518115942028984, "rewards/margins": 7.734862441660526, "rewards/rejected": -6.483050847457627, "step": 447 }, { "epoch": 0.30716489544052106, "grad_norm": 0.5039497032733465, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274388661.67741936, "logits/rejected": -329443514.1818182, "logps/chosen": -271.61290322580646, "logps/rejected": -329.2121212121212, "loss": 0.1924, "rewards/chosen": 1.1628024193548387, "rewards/margins": 7.3900751466275665, "rewards/rejected": -6.2272727272727275, "step": 448 }, { "epoch": 0.3078505313678437, "grad_norm": 0.47280018198882934, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233682651.42857143, "logits/rejected": -286610773.3333333, "logps/chosen": -219.42857142857142, "logps/rejected": -376.44444444444446, "loss": 0.1577, "rewards/chosen": 1.2008928571428572, "rewards/margins": 7.325892857142858, "rewards/rejected": -6.125, "step": 449 }, { "epoch": 0.30853616729516625, "grad_norm": 0.42497249807051257, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245862474.47272727, "logits/rejected": -259702110.68493152, "logps/chosen": -214.1090909090909, "logps/rejected": -373.041095890411, "loss": 0.183, "rewards/chosen": 0.7755681818181818, "rewards/margins": 7.1933764009962635, "rewards/rejected": -6.417808219178082, "step": 450 }, { "epoch": 0.3092218032224889, "grad_norm": 0.49056015510988166, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238136304.7164179, "logits/rejected": -222779425.5737705, "logps/chosen": -222.80597014925374, "logps/rejected": -318.1639344262295, "loss": 0.2192, "rewards/chosen": 0.7388059701492538, "rewards/margins": 6.1814289209689255, "rewards/rejected": -5.442622950819672, "step": 451 }, { "epoch": 0.30990743914981145, "grad_norm": 0.5004719924039145, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245366784.0, "logits/rejected": -267386880.0, "logps/chosen": -232.88888888888889, "logps/rejected": -324.85714285714283, "loss": 0.2303, "rewards/chosen": 0.7916666666666666, "rewards/margins": 4.99702380952381, "rewards/rejected": -4.205357142857143, "step": 452 }, { "epoch": 0.310593075077134, "grad_norm": 0.49103222917963457, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272416490.30508476, "logits/rejected": -258223237.5652174, "logps/chosen": -254.3728813559322, "logps/rejected": -372.40579710144925, "loss": 0.1953, "rewards/chosen": 1.0879237288135593, "rewards/margins": 7.044445467943994, "rewards/rejected": -5.956521739130435, "step": 453 }, { "epoch": 0.31127871100445664, "grad_norm": 0.40553276182206055, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -321928028.7536232, "logits/rejected": -306255281.89830506, "logps/chosen": -281.7391304347826, "logps/rejected": -456.9491525423729, "loss": 0.1806, "rewards/chosen": 1.5235507246376812, "rewards/margins": 7.709991402603784, "rewards/rejected": -6.186440677966102, "step": 454 }, { "epoch": 0.3119643469317792, "grad_norm": 0.4324343788172849, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -219917561.0810811, "logits/rejected": -318767104.0, "logps/chosen": -252.32432432432432, "logps/rejected": -311.7037037037037, "loss": 0.2227, "rewards/chosen": 1.0380067567567568, "rewards/margins": 5.806525275275275, "rewards/rejected": -4.768518518518518, "step": 455 }, { "epoch": 0.31264998285910184, "grad_norm": 0.48157402142377764, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251149839.5151515, "logits/rejected": -257882045.93548387, "logps/chosen": -294.3030303030303, "logps/rejected": -283.35483870967744, "loss": 0.1823, "rewards/chosen": 1.2017045454545454, "rewards/margins": 6.992027126099707, "rewards/rejected": -5.790322580645161, "step": 456 }, { "epoch": 0.3133356187864244, "grad_norm": 0.4358436702266056, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -325147172.056338, "logits/rejected": -292571100.0701754, "logps/chosen": -215.2112676056338, "logps/rejected": -347.7894736842105, "loss": 0.2078, "rewards/chosen": 1.2183098591549295, "rewards/margins": 6.911292315295281, "rewards/rejected": -5.692982456140351, "step": 457 }, { "epoch": 0.314021254713747, "grad_norm": 0.4449865574948, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262029609.8909091, "logits/rejected": -311872357.69863015, "logps/chosen": -303.41818181818184, "logps/rejected": -339.28767123287673, "loss": 0.1677, "rewards/chosen": 0.9920454545454546, "rewards/margins": 7.635881070983811, "rewards/rejected": -6.6438356164383565, "step": 458 }, { "epoch": 0.3147068906410696, "grad_norm": 0.4979782218107709, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -332881269.84126985, "logits/rejected": -344578205.53846157, "logps/chosen": -285.2063492063492, "logps/rejected": -384.0, "loss": 0.2026, "rewards/chosen": 0.8823784722222222, "rewards/margins": 6.851609241452992, "rewards/rejected": -5.969230769230769, "step": 459 }, { "epoch": 0.3153925265683922, "grad_norm": 0.47141967181507155, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274560471.36507934, "logits/rejected": -297343889.72307694, "logps/chosen": -280.12698412698415, "logps/rejected": -349.53846153846155, "loss": 0.1633, "rewards/chosen": 1.6567460317460319, "rewards/margins": 7.233669108669108, "rewards/rejected": -5.576923076923077, "step": 460 }, { "epoch": 0.3160781624957148, "grad_norm": 0.5274664558061926, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -293461469.8666667, "logits/rejected": -433740378.35294116, "logps/chosen": -270.4, "logps/rejected": -309.4117647058824, "loss": 0.1704, "rewards/chosen": 1.2489583333333334, "rewards/margins": 6.631311274509804, "rewards/rejected": -5.382352941176471, "step": 461 }, { "epoch": 0.31676379842303737, "grad_norm": 0.5243239856158128, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261724569.6, "logits/rejected": -225959805.96825397, "logps/chosen": -302.5230769230769, "logps/rejected": -361.6507936507937, "loss": 0.1879, "rewards/chosen": 1.1721153846153847, "rewards/margins": 5.870528083028083, "rewards/rejected": -4.698412698412699, "step": 462 }, { "epoch": 0.31744943435035994, "grad_norm": 0.5667238802944791, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -321054906.1818182, "logits/rejected": -333108917.67741936, "logps/chosen": -258.1818181818182, "logps/rejected": -427.35483870967744, "loss": 0.1765, "rewards/chosen": 1.3115530303030303, "rewards/margins": 8.263165933528835, "rewards/rejected": -6.951612903225806, "step": 463 }, { "epoch": 0.31813507027768256, "grad_norm": 0.49445523449035994, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281322303.07246375, "logits/rejected": -231184349.2881356, "logps/chosen": -223.768115942029, "logps/rejected": -363.1186440677966, "loss": 0.1864, "rewards/chosen": 1.2481884057971016, "rewards/margins": 6.358357897322525, "rewards/rejected": -5.110169491525424, "step": 464 }, { "epoch": 0.31882070620500513, "grad_norm": 0.6372025064151338, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270710332.7457627, "logits/rejected": -265031383.1884058, "logps/chosen": -245.96610169491527, "logps/rejected": -288.92753623188406, "loss": 0.1618, "rewards/chosen": 1.1864406779661016, "rewards/margins": 7.519774011299434, "rewards/rejected": -6.333333333333333, "step": 465 }, { "epoch": 0.31950634213232776, "grad_norm": 0.5213378962892926, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270299591.1111111, "logits/rejected": -303188260.5714286, "logps/chosen": -250.66666666666666, "logps/rejected": -395.42857142857144, "loss": 0.1913, "rewards/chosen": 1.1293402777777777, "rewards/margins": 8.441840277777779, "rewards/rejected": -7.3125, "step": 466 }, { "epoch": 0.32019197805965033, "grad_norm": 0.5288432430142296, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254093642.32258064, "logits/rejected": -282670669.57575756, "logps/chosen": -288.51612903225805, "logps/rejected": -276.3636363636364, "loss": 0.1798, "rewards/chosen": 1.3175403225806452, "rewards/margins": 7.484206989247312, "rewards/rejected": -6.166666666666667, "step": 467 }, { "epoch": 0.3208776139869729, "grad_norm": 0.6362716178617432, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277211230.52307695, "logits/rejected": -272962641.26984125, "logps/chosen": -249.1076923076923, "logps/rejected": -399.74603174603175, "loss": 0.1945, "rewards/chosen": 1.3067307692307693, "rewards/margins": 7.187683150183151, "rewards/rejected": -5.880952380952381, "step": 468 }, { "epoch": 0.3215632499142955, "grad_norm": 1.1358585729002064, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274027861.3333333, "logits/rejected": -283664005.9076923, "logps/chosen": -216.38095238095238, "logps/rejected": -464.4923076923077, "loss": 0.1946, "rewards/chosen": 0.9017857142857143, "rewards/margins": 7.11717032967033, "rewards/rejected": -6.2153846153846155, "step": 469 }, { "epoch": 0.3222488858416181, "grad_norm": 0.48797497644805565, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -293092879.5151515, "logits/rejected": -305778291.61290324, "logps/chosen": -300.8484848484849, "logps/rejected": -378.83870967741933, "loss": 0.1984, "rewards/chosen": 0.7083333333333334, "rewards/margins": 7.506720430107527, "rewards/rejected": -6.798387096774194, "step": 470 }, { "epoch": 0.3229345217689407, "grad_norm": 0.4984766007800195, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238752689.23076922, "logits/rejected": -234082108.95238096, "logps/chosen": -236.30769230769232, "logps/rejected": -387.04761904761904, "loss": 0.1932, "rewards/chosen": 1.0673076923076923, "rewards/margins": 6.4800061050061055, "rewards/rejected": -5.412698412698413, "step": 471 }, { "epoch": 0.3236201576962633, "grad_norm": 0.43503487037390565, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -224495128.3809524, "logits/rejected": -204682035.2, "logps/chosen": -240.25396825396825, "logps/rejected": -339.6923076923077, "loss": 0.1617, "rewards/chosen": 1.2996031746031746, "rewards/margins": 8.11498778998779, "rewards/rejected": -6.815384615384615, "step": 472 }, { "epoch": 0.32430579362358586, "grad_norm": 0.4446477383924445, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -327800989.53846157, "logits/rejected": -309979038.47619045, "logps/chosen": -335.5076923076923, "logps/rejected": -337.26984126984127, "loss": 0.182, "rewards/chosen": 1.0769230769230769, "rewards/margins": 8.172161172161172, "rewards/rejected": -7.095238095238095, "step": 473 }, { "epoch": 0.3249914295509085, "grad_norm": 0.49674041559412097, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -343799775.49206346, "logits/rejected": -363936531.6923077, "logps/chosen": -260.8253968253968, "logps/rejected": -377.10769230769233, "loss": 0.1839, "rewards/chosen": 0.9682539682539683, "rewards/margins": 7.8067155067155065, "rewards/rejected": -6.838461538461538, "step": 474 }, { "epoch": 0.32567706547823105, "grad_norm": 0.517481072131916, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252541251.36842105, "logits/rejected": -232945191.3846154, "logps/chosen": -233.05263157894737, "logps/rejected": -315.0769230769231, "loss": 0.2131, "rewards/chosen": 1.2080592105263157, "rewards/margins": 5.121520748987854, "rewards/rejected": -3.9134615384615383, "step": 475 }, { "epoch": 0.3263627014055536, "grad_norm": 0.48749615655910444, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -321895059.5254237, "logits/rejected": -293236557.9130435, "logps/chosen": -290.1694915254237, "logps/rejected": -382.60869565217394, "loss": 0.1826, "rewards/chosen": 0.9867584745762712, "rewards/margins": 7.885309199213953, "rewards/rejected": -6.898550724637682, "step": 476 }, { "epoch": 0.32704833733287625, "grad_norm": 0.5951577100274161, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256742244.84848484, "logits/rejected": -230822020.12903225, "logps/chosen": -296.4848484848485, "logps/rejected": -386.06451612903226, "loss": 0.2294, "rewards/chosen": 0.7466856060606061, "rewards/margins": 8.093459799608993, "rewards/rejected": -7.346774193548387, "step": 477 }, { "epoch": 0.3277339732601988, "grad_norm": 0.5363777474658095, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262659693.1147541, "logits/rejected": -256415957.97014925, "logps/chosen": -291.1475409836066, "logps/rejected": -286.089552238806, "loss": 0.1861, "rewards/chosen": 0.7110655737704918, "rewards/margins": -930229.1993821874, "rewards/rejected": 930229.9104477612, "step": 478 }, { "epoch": 0.32841960918752144, "grad_norm": 0.45636131723966067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -285212672.0, "logits/rejected": -349482314.83076924, "logps/chosen": -299.93650793650795, "logps/rejected": -373.66153846153844, "loss": 0.1524, "rewards/chosen": 1.4097222222222223, "rewards/margins": 7.878952991452992, "rewards/rejected": -6.469230769230769, "step": 479 }, { "epoch": 0.329105245114844, "grad_norm": 0.46074836910234934, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255120205.2063492, "logits/rejected": -289858670.27692306, "logps/chosen": -250.4126984126984, "logps/rejected": -352.4923076923077, "loss": 0.1873, "rewards/chosen": 1.2212301587301588, "rewards/margins": 8.321230158730158, "rewards/rejected": -7.1, "step": 480 }, { "epoch": 0.3297908810421666, "grad_norm": 0.4358338313649534, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -308040687.21311474, "logits/rejected": -280705360.23880595, "logps/chosen": -216.13114754098362, "logps/rejected": -324.53731343283584, "loss": 0.1937, "rewards/chosen": 0.7684426229508197, "rewards/margins": 7.081875458771716, "rewards/rejected": -6.313432835820896, "step": 481 }, { "epoch": 0.3304765169694892, "grad_norm": 0.46101517894891403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272494459.87096775, "logits/rejected": -374182756.8484849, "logps/chosen": -380.38709677419354, "logps/rejected": -412.1212121212121, "loss": 0.1853, "rewards/chosen": 1.283266129032258, "rewards/margins": 8.37417521994135, "rewards/rejected": -7.090909090909091, "step": 482 }, { "epoch": 0.3311621528968118, "grad_norm": 0.4672045405483455, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269175627.2941176, "logits/rejected": -280738747.73333335, "logps/chosen": -259.52941176470586, "logps/rejected": -355.2, "loss": 0.1831, "rewards/chosen": 1.1691176470588236, "rewards/margins": 7.68578431372549, "rewards/rejected": -6.516666666666667, "step": 483 }, { "epoch": 0.3318477888241344, "grad_norm": 0.7379500400324964, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269937893.25373137, "logits/rejected": -326193084.852459, "logps/chosen": -289.1940298507463, "logps/rejected": -322.62295081967216, "loss": 0.1618, "rewards/chosen": 1.666044776119403, "rewards/margins": 7.61686444825055, "rewards/rejected": -5.950819672131147, "step": 484 }, { "epoch": 0.332533424751457, "grad_norm": 0.4598182877539427, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258848475.42857143, "logits/rejected": -262240791.63076922, "logps/chosen": -207.23809523809524, "logps/rejected": -376.61538461538464, "loss": 0.1747, "rewards/chosen": 1.1884920634920635, "rewards/margins": 7.657722832722833, "rewards/rejected": -6.469230769230769, "step": 485 }, { "epoch": 0.33321906067877954, "grad_norm": 0.4379380305698221, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -309329920.0, "logits/rejected": -293601280.0, "logps/chosen": -221.25, "logps/rejected": -385.0, "loss": 0.1804, "rewards/chosen": 0.638671875, "rewards/margins": 6.208984375, "rewards/rejected": -5.5703125, "step": 486 }, { "epoch": 0.33390469660610217, "grad_norm": 0.4854110841297478, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256562349.2923077, "logits/rejected": -249527799.87301588, "logps/chosen": -225.23076923076923, "logps/rejected": -332.95238095238096, "loss": 0.1821, "rewards/chosen": 0.6461538461538462, "rewards/margins": 7.165995115995116, "rewards/rejected": -6.51984126984127, "step": 487 }, { "epoch": 0.33459033253342474, "grad_norm": 0.45784302211594347, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259925273.9710145, "logits/rejected": -228482933.15254238, "logps/chosen": -226.31884057971016, "logps/rejected": -271.728813559322, "loss": 0.197, "rewards/chosen": 0.8106884057971014, "rewards/margins": 6.666620609186932, "rewards/rejected": -5.8559322033898304, "step": 488 }, { "epoch": 0.33527596846074736, "grad_norm": 0.5283748707445497, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299013285.16129035, "logits/rejected": -229797019.15151516, "logps/chosen": -283.8709677419355, "logps/rejected": -365.57575757575756, "loss": 0.1765, "rewards/chosen": 1.3044354838709677, "rewards/margins": 7.955950635386119, "rewards/rejected": -6.651515151515151, "step": 489 }, { "epoch": 0.33596160438806993, "grad_norm": 0.6545311097692097, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -317765479.1641791, "logits/rejected": -269535601.3114754, "logps/chosen": -259.5820895522388, "logps/rejected": -389.5081967213115, "loss": 0.1817, "rewards/chosen": 0.8740671641791045, "rewards/margins": 8.021608147785662, "rewards/rejected": -7.147540983606557, "step": 490 }, { "epoch": 0.3366472403153925, "grad_norm": 0.5405214840933849, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276105040.45714283, "logits/rejected": -240666270.89655173, "logps/chosen": -289.8285714285714, "logps/rejected": -284.9655172413793, "loss": 0.2229, "rewards/chosen": 0.7125, "rewards/margins": 6.285775862068966, "rewards/rejected": -5.573275862068965, "step": 491 }, { "epoch": 0.33733287624271513, "grad_norm": 0.6314147838275765, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218866408.72727272, "logits/rejected": -282506669.41935486, "logps/chosen": -299.3939393939394, "logps/rejected": -366.4516129032258, "loss": 0.1702, "rewards/chosen": 1.8200757575757576, "rewards/margins": 7.545882209188661, "rewards/rejected": -5.725806451612903, "step": 492 }, { "epoch": 0.3380185121700377, "grad_norm": 0.4696275549667923, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228657218.06451613, "logits/rejected": -247082635.63636363, "logps/chosen": -258.5806451612903, "logps/rejected": -284.1212121212121, "loss": 0.2012, "rewards/chosen": 0.9944556451612904, "rewards/margins": 7.123243523949169, "rewards/rejected": -6.128787878787879, "step": 493 }, { "epoch": 0.3387041480973603, "grad_norm": 0.4987105949079466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297482576.23880595, "logits/rejected": -327018193.8360656, "logps/chosen": -316.17910447761193, "logps/rejected": -402.3606557377049, "loss": 0.1963, "rewards/chosen": 1.296641791044776, "rewards/margins": 7.116313922192317, "rewards/rejected": -5.819672131147541, "step": 494 }, { "epoch": 0.3393897840246829, "grad_norm": 0.5008275490539021, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238940027.87096775, "logits/rejected": -271994259.3939394, "logps/chosen": -309.6774193548387, "logps/rejected": -308.1212121212121, "loss": 0.1919, "rewards/chosen": 1.060483870967742, "rewards/margins": 7.2877565982404695, "rewards/rejected": -6.2272727272727275, "step": 495 }, { "epoch": 0.34007541995200546, "grad_norm": 0.5285444350456032, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259059952.94117647, "logits/rejected": -293042039.46666664, "logps/chosen": -301.1764705882353, "logps/rejected": -330.6666666666667, "loss": 0.2213, "rewards/chosen": 0.9172794117647058, "rewards/margins": 6.688112745098039, "rewards/rejected": -5.770833333333333, "step": 496 }, { "epoch": 0.3407610558793281, "grad_norm": 0.4720281864627309, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262240791.63076922, "logits/rejected": -297995312.7619048, "logps/chosen": -215.75384615384615, "logps/rejected": -328.63492063492066, "loss": 0.2118, "rewards/chosen": 0.7240384615384615, "rewards/margins": 6.906578144078145, "rewards/rejected": -6.182539682539683, "step": 497 }, { "epoch": 0.34144669180665066, "grad_norm": 0.5851645376846027, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287019449.1076923, "logits/rejected": -283082231.8730159, "logps/chosen": -240.24615384615385, "logps/rejected": -400.25396825396825, "loss": 0.1759, "rewards/chosen": 1.2596153846153846, "rewards/margins": 8.180250305250306, "rewards/rejected": -6.920634920634921, "step": 498 }, { "epoch": 0.3421323277339733, "grad_norm": 0.4652400391795081, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274027861.3333333, "logits/rejected": -334345947.4285714, "logps/chosen": -243.55555555555554, "logps/rejected": -406.57142857142856, "loss": 0.1919, "rewards/chosen": 1.4774305555555556, "rewards/margins": 6.638144841269841, "rewards/rejected": -5.160714285714286, "step": 499 }, { "epoch": 0.34281796366129585, "grad_norm": 0.429359163229758, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -211365417.96721312, "logits/rejected": -251157427.58208954, "logps/chosen": -312.39344262295083, "logps/rejected": -318.32835820895525, "loss": 0.1776, "rewards/chosen": 1.4487704918032787, "rewards/margins": 8.598024223146563, "rewards/rejected": -7.149253731343284, "step": 500 }, { "epoch": 0.3435035995886184, "grad_norm": 0.5653510437269211, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -209038699.3548387, "logits/rejected": -252675040.96969697, "logps/chosen": -230.19354838709677, "logps/rejected": -299.6363636363636, "loss": 0.1919, "rewards/chosen": 0.9534400201612904, "rewards/margins": 6.0595006262218964, "rewards/rejected": -5.106060606060606, "step": 501 }, { "epoch": 0.34418923551594105, "grad_norm": 0.516982002133718, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283687470.54545456, "logits/rejected": -312516768.627451, "logps/chosen": -246.64935064935065, "logps/rejected": -346.3529411764706, "loss": 0.2023, "rewards/chosen": 1.2954545454545454, "rewards/margins": 8.246434937611408, "rewards/rejected": -6.950980392156863, "step": 502 }, { "epoch": 0.3448748714432636, "grad_norm": 0.4664057939200591, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294246557.53846157, "logits/rejected": -310820001.68421054, "logps/chosen": -240.15384615384616, "logps/rejected": -335.1578947368421, "loss": 0.1598, "rewards/chosen": 0.8859675480769231, "rewards/margins": 7.4385991270242915, "rewards/rejected": -6.552631578947368, "step": 503 }, { "epoch": 0.34556050737058625, "grad_norm": 0.5214077502003301, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283082231.8730159, "logits/rejected": -306635886.27692306, "logps/chosen": -184.0, "logps/rejected": -320.4923076923077, "loss": 0.2266, "rewards/chosen": 0.5843253968253969, "rewards/margins": 6.330479242979243, "rewards/rejected": -5.746153846153846, "step": 504 }, { "epoch": 0.3462461432979088, "grad_norm": 0.45506399349174015, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264354511.56756756, "logits/rejected": -260978915.55555555, "logps/chosen": -262.27027027027026, "logps/rejected": -389.9259259259259, "loss": 0.189, "rewards/chosen": 1.089527027027027, "rewards/margins": 7.941378878878879, "rewards/rejected": -6.851851851851852, "step": 505 }, { "epoch": 0.3469317792252314, "grad_norm": 0.5465648375953563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279760076.8, "logits/rejected": -339738624.0, "logps/chosen": -249.4, "logps/rejected": -344.6666666666667, "loss": 0.222, "rewards/chosen": 1.13359375, "rewards/margins": 6.623177083333333, "rewards/rejected": -5.489583333333333, "step": 506 }, { "epoch": 0.347617415152554, "grad_norm": 0.5518348057553244, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -312213504.0, "logits/rejected": -211288064.0, "logps/chosen": -203.0, "logps/rejected": -318.0, "loss": 0.1703, "rewards/chosen": 1.0927734375, "rewards/margins": 22249775.092773438, "rewards/rejected": -22249774.0, "step": 507 }, { "epoch": 0.3483030510798766, "grad_norm": 0.5405695765510644, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244173020.55384615, "logits/rejected": -248196274.7936508, "logps/chosen": -309.16923076923075, "logps/rejected": -315.42857142857144, "loss": 0.1833, "rewards/chosen": 0.9596153846153846, "rewards/margins": 7.086599511599512, "rewards/rejected": -6.126984126984127, "step": 508 }, { "epoch": 0.34898868700719915, "grad_norm": 0.5330853710598485, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253335961.6, "logits/rejected": -227070940.68965518, "logps/chosen": -284.57142857142856, "logps/rejected": -393.37931034482756, "loss": 0.1968, "rewards/chosen": 1.2196428571428573, "rewards/margins": 6.857573891625616, "rewards/rejected": -5.637931034482759, "step": 509 }, { "epoch": 0.3496743229345218, "grad_norm": 0.5091671323887145, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255732706.74285713, "logits/rejected": -263807258.4827586, "logps/chosen": -292.8, "logps/rejected": -406.0689655172414, "loss": 0.2086, "rewards/chosen": 1.078125, "rewards/margins": 7.776400862068965, "rewards/rejected": -6.698275862068965, "step": 510 }, { "epoch": 0.35035995886184435, "grad_norm": 0.5185177560316803, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281726497.2467533, "logits/rejected": -328965019.60784316, "logps/chosen": -280.31168831168833, "logps/rejected": -349.1764705882353, "loss": 0.2295, "rewards/chosen": 0.9659090909090909, "rewards/margins": 5.686497326203209, "rewards/rejected": -4.720588235294118, "step": 511 }, { "epoch": 0.35104559478916697, "grad_norm": 0.4356272234500494, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228017617.45454547, "logits/rejected": -248411036.9032258, "logps/chosen": -236.6060606060606, "logps/rejected": -337.03225806451616, "loss": 0.1909, "rewards/chosen": 1.3276515151515151, "rewards/margins": 8.230877321603128, "rewards/rejected": -6.903225806451613, "step": 512 }, { "epoch": 0.35173123071648954, "grad_norm": 0.4657327527945003, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238288896.0, "logits/rejected": -299892736.0, "logps/chosen": -301.0, "logps/rejected": -344.0, "loss": 0.174, "rewards/chosen": 1.41015625, "rewards/margins": 6.79296875, "rewards/rejected": -5.3828125, "step": 513 }, { "epoch": 0.3524168666438121, "grad_norm": 0.5017906071864644, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274514157.4492754, "logits/rejected": -256776712.6779661, "logps/chosen": -222.14492753623188, "logps/rejected": -378.3050847457627, "loss": 0.2097, "rewards/chosen": 0.8120471014492754, "rewards/margins": 5.727301338737411, "rewards/rejected": -4.915254237288136, "step": 514 }, { "epoch": 0.35310250257113474, "grad_norm": 0.6765158352325007, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -368645313.7297297, "logits/rejected": -298882996.1481481, "logps/chosen": -283.2432432432432, "logps/rejected": -369.48148148148147, "loss": 0.2184, "rewards/chosen": 0.9856418918918919, "rewards/margins": 8.309715965965966, "rewards/rejected": -7.324074074074074, "step": 515 }, { "epoch": 0.3537881384984573, "grad_norm": 0.585360300691634, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233203302.4, "logits/rejected": -274509965.2413793, "logps/chosen": -246.62857142857143, "logps/rejected": -321.1034482758621, "loss": 0.2197, "rewards/chosen": 1.1633928571428571, "rewards/margins": 7.447875615763547, "rewards/rejected": -6.2844827586206895, "step": 516 }, { "epoch": 0.35447377442577993, "grad_norm": 0.4384294273467161, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -215561805.57575756, "logits/rejected": -248681637.16129032, "logps/chosen": -228.6060606060606, "logps/rejected": -305.03225806451616, "loss": 0.1741, "rewards/chosen": 1.5246212121212122, "rewards/margins": 5.653653470185728, "rewards/rejected": -4.129032258064516, "step": 517 }, { "epoch": 0.3551594103531025, "grad_norm": 0.4798318996458064, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244232259.147541, "logits/rejected": -265180175.2835821, "logps/chosen": -257.57377049180326, "logps/rejected": -387.82089552238807, "loss": 0.1743, "rewards/chosen": 1.4118852459016393, "rewards/margins": 7.7103927085882065, "rewards/rejected": -6.298507462686567, "step": 518 }, { "epoch": 0.35584504628042507, "grad_norm": 0.47240070270060974, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232405697.04918033, "logits/rejected": -251908646.20895523, "logps/chosen": -255.7377049180328, "logps/rejected": -328.5970149253731, "loss": 0.1816, "rewards/chosen": 0.9682377049180327, "rewards/margins": 7.85629740641057, "rewards/rejected": -6.888059701492537, "step": 519 }, { "epoch": 0.3565306822077477, "grad_norm": 0.4654332595193545, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -311505323.9402985, "logits/rejected": -347095845.7704918, "logps/chosen": -282.74626865671644, "logps/rejected": -333.6393442622951, "loss": 0.2051, "rewards/chosen": 1.1604477611940298, "rewards/margins": 7.578480548079276, "rewards/rejected": -6.418032786885246, "step": 520 }, { "epoch": 0.35721631813507027, "grad_norm": 0.4843566694024411, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -323701579.2941176, "logits/rejected": -269089113.76623374, "logps/chosen": -323.7647058823529, "logps/rejected": -372.3636363636364, "loss": 0.1271, "rewards/chosen": 1.786764705882353, "rewards/margins": 8.228323147440795, "rewards/rejected": -6.441558441558442, "step": 521 }, { "epoch": 0.3579019540623929, "grad_norm": 0.6719894815941665, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248250368.0, "logits/rejected": -216793088.0, "logps/chosen": -217.0, "logps/rejected": -354.0, "loss": 0.1847, "rewards/chosen": 1.3447265625, "rewards/margins": 7.9306640625, "rewards/rejected": -6.5859375, "step": 522 }, { "epoch": 0.35858758998971546, "grad_norm": 0.48714676294545894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253522375.1111111, "logits/rejected": -290633003.3230769, "logps/chosen": -255.23809523809524, "logps/rejected": -289.4769230769231, "loss": 0.1832, "rewards/chosen": 0.9861111111111112, "rewards/margins": 7.793803418803419, "rewards/rejected": -6.8076923076923075, "step": 523 }, { "epoch": 0.35927322591703803, "grad_norm": 0.48853532884081274, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283603897.8630137, "logits/rejected": -275451382.6909091, "logps/chosen": -246.35616438356163, "logps/rejected": -325.8181818181818, "loss": 0.2147, "rewards/chosen": 1.0325342465753424, "rewards/margins": 30009007.577988792, "rewards/rejected": -30009006.545454547, "step": 524 }, { "epoch": 0.35995886184436066, "grad_norm": 0.6086579901749656, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286095683.3684211, "logits/rejected": -276824064.0, "logps/chosen": -335.1578947368421, "logps/rejected": -352.9230769230769, "loss": 0.1917, "rewards/chosen": 1.7351973684210527, "rewards/margins": 8.028466599190283, "rewards/rejected": -6.293269230769231, "step": 525 }, { "epoch": 0.3606444977716832, "grad_norm": 0.5628214128087161, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239987133.2173913, "logits/rejected": -238008979.52542374, "logps/chosen": -179.59420289855072, "logps/rejected": -369.35593220338984, "loss": 0.2068, "rewards/chosen": 1.0009057971014492, "rewards/margins": 6.8653125767624665, "rewards/rejected": -5.864406779661017, "step": 526 }, { "epoch": 0.36133013369900585, "grad_norm": 0.5992109763291023, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250656615.16417912, "logits/rejected": -303365069.6393443, "logps/chosen": -382.089552238806, "logps/rejected": -327.8688524590164, "loss": 0.2014, "rewards/chosen": 1.1940298507462686, "rewards/margins": 6.62025935894299, "rewards/rejected": -5.426229508196721, "step": 527 }, { "epoch": 0.3620157696263284, "grad_norm": 0.5521018864746977, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269380651.26760566, "logits/rejected": -270201478.7368421, "logps/chosen": -225.80281690140845, "logps/rejected": -317.7543859649123, "loss": 0.1874, "rewards/chosen": 1.3089788732394365, "rewards/margins": 7.852838522362244, "rewards/rejected": -6.543859649122807, "step": 528 }, { "epoch": 0.362701405553651, "grad_norm": 0.46022079301749597, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294175842.1917808, "logits/rejected": -358117301.5272727, "logps/chosen": -216.1095890410959, "logps/rejected": -453.8181818181818, "loss": 0.1953, "rewards/chosen": 1.50513698630137, "rewards/margins": 8.009682440846825, "rewards/rejected": -6.504545454545455, "step": 529 }, { "epoch": 0.3633870414809736, "grad_norm": 0.43908772158571385, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266637897.14285713, "logits/rejected": -240007395.55555555, "logps/chosen": -232.71428571428572, "logps/rejected": -320.44444444444446, "loss": 0.1516, "rewards/chosen": 1.2008928571428572, "rewards/margins": 7.693948412698413, "rewards/rejected": -6.493055555555555, "step": 530 }, { "epoch": 0.3640726774082962, "grad_norm": 0.44738342090682576, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -345212546.1694915, "logits/rejected": -370800788.4057971, "logps/chosen": -298.3050847457627, "logps/rejected": -339.4782608695652, "loss": 0.1797, "rewards/chosen": 1.13135593220339, "rewards/margins": 7.740051584377303, "rewards/rejected": -6.608695652173913, "step": 531 }, { "epoch": 0.3647583133356188, "grad_norm": 0.4887774160630892, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297271296.0, "logits/rejected": -220463104.0, "logps/chosen": -195.5, "logps/rejected": -324.5, "loss": 0.1991, "rewards/chosen": 0.8466796875, "rewards/margins": 7.2294921875, "rewards/rejected": -6.3828125, "step": 532 }, { "epoch": 0.3654439492629414, "grad_norm": 0.48376923533508115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272396743.1111111, "logits/rejected": -252257426.2857143, "logps/chosen": -253.33333333333334, "logps/rejected": -390.0, "loss": 0.2032, "rewards/chosen": 1.0507269965277777, "rewards/margins": 8.291798425099206, "rewards/rejected": -7.241071428571429, "step": 533 }, { "epoch": 0.36612958519026395, "grad_norm": 0.6286428363271878, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -322761679.2380952, "logits/rejected": -299666888.86153847, "logps/chosen": -344.3809523809524, "logps/rejected": -362.33846153846156, "loss": 0.1738, "rewards/chosen": 1.5, "rewards/margins": 7.753846153846154, "rewards/rejected": -6.253846153846154, "step": 534 }, { "epoch": 0.3668152211175866, "grad_norm": 0.5049071380916423, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250279290.73972604, "logits/rejected": -286127792.8727273, "logps/chosen": -276.16438356164383, "logps/rejected": -320.8727272727273, "loss": 0.2025, "rewards/chosen": 1.452054794520548, "rewards/margins": 6.279327521793276, "rewards/rejected": -4.827272727272727, "step": 535 }, { "epoch": 0.36750085704490915, "grad_norm": 0.5630510221696051, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -324661801.5135135, "logits/rejected": -317835036.4444444, "logps/chosen": -271.13513513513516, "logps/rejected": -295.4074074074074, "loss": 0.2171, "rewards/chosen": 1.2609797297297298, "rewards/margins": 7.177646396396397, "rewards/rejected": -5.916666666666667, "step": 536 }, { "epoch": 0.36818649297223177, "grad_norm": 0.5507034551298338, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -217870791.1111111, "logits/rejected": -242370852.57142857, "logps/chosen": -231.55555555555554, "logps/rejected": -340.2857142857143, "loss": 0.2231, "rewards/chosen": 0.9001736111111112, "rewards/margins": 6.926959325396826, "rewards/rejected": -6.026785714285714, "step": 537 }, { "epoch": 0.36887212889955434, "grad_norm": 0.9584700704842978, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228065280.0, "logits/rejected": -299368448.0, "logps/chosen": -250.0, "logps/rejected": -373.5, "loss": 0.1832, "rewards/chosen": 1.0576171875, "rewards/margins": 7.7919921875, "rewards/rejected": -6.734375, "step": 538 }, { "epoch": 0.3695577648268769, "grad_norm": 0.5202315748499737, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253385306.3529412, "logits/rejected": -237397606.4, "logps/chosen": -287.52941176470586, "logps/rejected": -362.93333333333334, "loss": 0.2024, "rewards/chosen": 1.2417279411764706, "rewards/margins": 6.083394607843138, "rewards/rejected": -4.841666666666667, "step": 539 }, { "epoch": 0.37024340075419954, "grad_norm": 0.5610688861441208, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -242507031.27272728, "logits/rejected": -292107418.30136985, "logps/chosen": -234.1818181818182, "logps/rejected": -329.86301369863014, "loss": 0.1799, "rewards/chosen": 0.7636363636363637, "rewards/margins": 6.325280199252802, "rewards/rejected": -5.561643835616438, "step": 540 }, { "epoch": 0.3709290366815221, "grad_norm": 0.5325693595285383, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292718268.6315789, "logits/rejected": -294891835.0769231, "logps/chosen": -218.52631578947367, "logps/rejected": -283.0769230769231, "loss": 0.2264, "rewards/chosen": 0.8618421052631579, "rewards/margins": 4.323380566801619, "rewards/rejected": -3.4615384615384617, "step": 541 }, { "epoch": 0.3716146726088447, "grad_norm": 0.7170475241424197, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -222611119.76119402, "logits/rejected": -362772916.4590164, "logps/chosen": -242.62686567164178, "logps/rejected": -343.344262295082, "loss": 0.1756, "rewards/chosen": 1.767723880597015, "rewards/margins": 8.423461585515048, "rewards/rejected": -6.655737704918033, "step": 542 }, { "epoch": 0.3723003085361673, "grad_norm": 0.503937202169234, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272509922.74285716, "logits/rejected": -342197353.9310345, "logps/chosen": -235.88571428571427, "logps/rejected": -372.9655172413793, "loss": 0.1909, "rewards/chosen": 1.2883928571428571, "rewards/margins": 6.745289408866995, "rewards/rejected": -5.456896551724138, "step": 543 }, { "epoch": 0.37298594446348987, "grad_norm": 0.4711678638099667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253335961.6, "logits/rejected": -252815289.37931034, "logps/chosen": -258.74285714285713, "logps/rejected": -286.62068965517244, "loss": 0.1789, "rewards/chosen": 1.2741071428571429, "rewards/margins": 8.808589901477832, "rewards/rejected": -7.5344827586206895, "step": 544 }, { "epoch": 0.3736715803908125, "grad_norm": 0.5064601277498223, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240931823.21311477, "logits/rejected": -278451704.35820895, "logps/chosen": -243.40983606557376, "logps/rejected": -339.5820895522388, "loss": 0.1904, "rewards/chosen": 0.9918032786885246, "rewards/margins": 7.924639099584047, "rewards/rejected": -6.932835820895522, "step": 545 }, { "epoch": 0.37435721631813507, "grad_norm": 0.6696985959140651, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -308395734.1090909, "logits/rejected": -349103987.72602737, "logps/chosen": -255.2, "logps/rejected": -370.4109589041096, "loss": 0.1669, "rewards/chosen": 1.238583096590909, "rewards/margins": 7.731733781522416, "rewards/rejected": -6.493150684931507, "step": 546 }, { "epoch": 0.37504285224545764, "grad_norm": 0.6609487858412467, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277543087.54285717, "logits/rejected": -327155712.0, "logps/chosen": -253.02857142857144, "logps/rejected": -436.41379310344826, "loss": 0.2262, "rewards/chosen": 0.9, "rewards/margins": 6.994827586206897, "rewards/rejected": -6.094827586206897, "step": 547 }, { "epoch": 0.37572848817278026, "grad_norm": 0.6695538115277225, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -319592212.9836066, "logits/rejected": -258168801.43283582, "logps/chosen": -292.72131147540983, "logps/rejected": -329.07462686567163, "loss": 0.2044, "rewards/chosen": 0.7751024590163934, "rewards/margins": 5.909430817225348, "rewards/rejected": -5.134328358208955, "step": 548 }, { "epoch": 0.37641412410010283, "grad_norm": 0.5001948023929859, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260858648.77419356, "logits/rejected": -256996445.0909091, "logps/chosen": -263.2258064516129, "logps/rejected": -278.54545454545456, "loss": 0.1904, "rewards/chosen": 0.5645161290322581, "rewards/margins": 7.269061583577712, "rewards/rejected": -6.704545454545454, "step": 549 }, { "epoch": 0.37709976002742546, "grad_norm": 0.5488584750269484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -275667014.62068963, "logits/rejected": -242310933.94285715, "logps/chosen": -235.31034482758622, "logps/rejected": -296.22857142857146, "loss": 0.1789, "rewards/chosen": 0.8857758620689655, "rewards/margins": 7.442918719211822, "rewards/rejected": -6.557142857142857, "step": 550 }, { "epoch": 0.377785395954748, "grad_norm": 0.5320418063659266, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268181255.75757575, "logits/rejected": -278718265.8064516, "logps/chosen": -291.1515151515151, "logps/rejected": -385.03225806451616, "loss": 0.1777, "rewards/chosen": 1.2869318181818181, "rewards/margins": 8.069189882697946, "rewards/rejected": -6.782258064516129, "step": 551 }, { "epoch": 0.3784710318820706, "grad_norm": 0.49938643348263834, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266076160.0, "logits/rejected": -238288896.0, "logps/chosen": -228.0, "logps/rejected": -345.5, "loss": 0.1629, "rewards/chosen": 1.55908203125, "rewards/margins": 8.06689453125, "rewards/rejected": -6.5078125, "step": 552 }, { "epoch": 0.3791566678093932, "grad_norm": 0.5745982059359382, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -215127205.16129032, "logits/rejected": -279366066.42424244, "logps/chosen": -258.7096774193548, "logps/rejected": -331.1515151515151, "loss": 0.1605, "rewards/chosen": 1.6653225806451613, "rewards/margins": 7.5630498533724335, "rewards/rejected": -5.8977272727272725, "step": 553 }, { "epoch": 0.3798423037367158, "grad_norm": 0.43957452561058913, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300679168.0, "logits/rejected": -264503296.0, "logps/chosen": -250.0, "logps/rejected": -355.5, "loss": 0.1774, "rewards/chosen": 1.2119140625, "rewards/margins": 8.2041015625, "rewards/rejected": -6.9921875, "step": 554 }, { "epoch": 0.3805279396640384, "grad_norm": 0.46166289700384494, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237577362.2857143, "logits/rejected": -274027861.3333333, "logps/chosen": -234.14285714285714, "logps/rejected": -315.3333333333333, "loss": 0.1664, "rewards/chosen": 0.8024553571428571, "rewards/margins": 8.219122023809524, "rewards/rejected": -7.416666666666667, "step": 555 }, { "epoch": 0.381213575591361, "grad_norm": 0.4894511233842729, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260512881.7777778, "logits/rejected": -289706569.14285713, "logps/chosen": -204.22222222222223, "logps/rejected": -369.14285714285717, "loss": 0.2003, "rewards/chosen": 1.1258680555555556, "rewards/margins": 8.65265376984127, "rewards/rejected": -7.526785714285714, "step": 556 }, { "epoch": 0.38189921151868356, "grad_norm": 0.5566100523066168, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258582170.41269842, "logits/rejected": -260175903.5076923, "logps/chosen": -295.36507936507934, "logps/rejected": -273.4769230769231, "loss": 0.1885, "rewards/chosen": 1.3015873015873016, "rewards/margins": 6.732356532356532, "rewards/rejected": -5.430769230769231, "step": 557 }, { "epoch": 0.3825848474460062, "grad_norm": 0.6208245995191221, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -331909256.53333336, "logits/rejected": -298535755.2941176, "logps/chosen": -272.53333333333336, "logps/rejected": -437.1764705882353, "loss": 0.17, "rewards/chosen": 1.40625, "rewards/margins": 8.082720588235293, "rewards/rejected": -6.676470588235294, "step": 558 }, { "epoch": 0.38327048337332875, "grad_norm": 0.5390368794940827, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258693846.70967743, "logits/rejected": -272756860.1212121, "logps/chosen": -232.7741935483871, "logps/rejected": -355.3939393939394, "loss": 0.1762, "rewards/chosen": 1.25, "rewards/margins": 8.136363636363637, "rewards/rejected": -6.886363636363637, "step": 559 }, { "epoch": 0.3839561193006514, "grad_norm": 0.6388303235535974, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253335961.6, "logits/rejected": -273208284.6896552, "logps/chosen": -313.14285714285717, "logps/rejected": -307.58620689655174, "loss": 0.1789, "rewards/chosen": 1.5267857142857142, "rewards/margins": 7.604371921182266, "rewards/rejected": -6.077586206896552, "step": 560 }, { "epoch": 0.38464175522797395, "grad_norm": 0.6175553612931349, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267726559.54929578, "logits/rejected": -277265569.68421054, "logps/chosen": -237.74647887323943, "logps/rejected": -399.1578947368421, "loss": 0.179, "rewards/chosen": 1.3943661971830985, "rewards/margins": 7.911910056832221, "rewards/rejected": -6.517543859649122, "step": 561 }, { "epoch": 0.3853273911552965, "grad_norm": 0.5951383380071551, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253160677.25373134, "logits/rejected": -253308457.96721312, "logps/chosen": -228.53731343283582, "logps/rejected": -306.3606557377049, "loss": 0.1756, "rewards/chosen": 1.4309701492537314, "rewards/margins": 7.20146195253242, "rewards/rejected": -5.770491803278689, "step": 562 }, { "epoch": 0.38601302708261914, "grad_norm": 0.596931753387662, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -195238086.19354838, "logits/rejected": -232593221.8181818, "logps/chosen": -308.38709677419354, "logps/rejected": -334.54545454545456, "loss": 0.1722, "rewards/chosen": 1.7580645161290323, "rewards/margins": 8.568670576735093, "rewards/rejected": -6.8106060606060606, "step": 563 }, { "epoch": 0.3866986630099417, "grad_norm": 0.6400612311172608, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233528022.70967743, "logits/rejected": -233355822.54545453, "logps/chosen": -227.09677419354838, "logps/rejected": -320.0, "loss": 0.1912, "rewards/chosen": 0.8251008064516129, "rewards/margins": 5.673585654936462, "rewards/rejected": -4.848484848484849, "step": 564 }, { "epoch": 0.38738429893726434, "grad_norm": 0.5809257703014877, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258510905.69014084, "logits/rejected": -367332729.2631579, "logps/chosen": -223.32394366197184, "logps/rejected": -356.49122807017545, "loss": 0.2129, "rewards/chosen": 0.9709507042253521, "rewards/margins": 6.032354212997282, "rewards/rejected": -5.06140350877193, "step": 565 }, { "epoch": 0.3880699348645869, "grad_norm": 0.5011179007915688, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267201837.17647058, "logits/rejected": -372174574.93333334, "logps/chosen": -260.47058823529414, "logps/rejected": -334.6666666666667, "loss": 0.1765, "rewards/chosen": 1.53125, "rewards/margins": 8.772916666666667, "rewards/rejected": -7.241666666666666, "step": 566 }, { "epoch": 0.3887555707919095, "grad_norm": 0.46398753970367057, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287130068.1142857, "logits/rejected": -310378496.0, "logps/chosen": -252.8, "logps/rejected": -364.9655172413793, "loss": 0.1704, "rewards/chosen": 1.5125, "rewards/margins": 7.779741379310345, "rewards/rejected": -6.267241379310345, "step": 567 }, { "epoch": 0.3894412067192321, "grad_norm": 0.5403182845484281, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232463967.45762712, "logits/rejected": -234637875.942029, "logps/chosen": -321.35593220338984, "logps/rejected": -312.1159420289855, "loss": 0.1647, "rewards/chosen": 1.2605932203389831, "rewards/margins": 8.724361336281012, "rewards/rejected": -7.463768115942029, "step": 568 }, { "epoch": 0.3901268426465547, "grad_norm": 0.49332947380725367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290805077.3333333, "logits/rejected": -297795584.0, "logps/chosen": -232.0, "logps/rejected": -364.85714285714283, "loss": 0.1986, "rewards/chosen": 1.1532118055555556, "rewards/margins": 9.313926091269842, "rewards/rejected": -8.160714285714286, "step": 569 }, { "epoch": 0.3908124785738773, "grad_norm": 0.6147275035406001, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253522375.1111111, "logits/rejected": -264821901.7846154, "logps/chosen": -289.6507936507937, "logps/rejected": -354.2153846153846, "loss": 0.1983, "rewards/chosen": 1.0992063492063493, "rewards/margins": 8.053052503052504, "rewards/rejected": -6.953846153846154, "step": 570 }, { "epoch": 0.39149811450119987, "grad_norm": 0.5263749859400585, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -215883294.11764705, "logits/rejected": -243269632.0, "logps/chosen": -285.4117647058824, "logps/rejected": -314.1333333333333, "loss": 0.1876, "rewards/chosen": 1.40625, "rewards/margins": 8.047916666666666, "rewards/rejected": -6.641666666666667, "step": 571 }, { "epoch": 0.39218375042852244, "grad_norm": 0.553899197982938, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268185049.79104477, "logits/rejected": -281637199.73770493, "logps/chosen": -288.7164179104478, "logps/rejected": -282.4918032786885, "loss": 0.195, "rewards/chosen": 1.0792910447761195, "rewards/margins": 7.677651700513824, "rewards/rejected": -6.598360655737705, "step": 572 }, { "epoch": 0.39286938635584506, "grad_norm": 1.5893634001836798, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239743985.15942028, "logits/rejected": -265591860.06779662, "logps/chosen": -260.17391304347825, "logps/rejected": -323.79661016949154, "loss": 0.199, "rewards/chosen": 1.1933876811594204, "rewards/margins": 8.879828359125522, "rewards/rejected": -7.686440677966102, "step": 573 }, { "epoch": 0.39355502228316763, "grad_norm": 0.44157086901243425, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -179443266.7826087, "logits/rejected": -215544571.66101694, "logps/chosen": -232.34782608695653, "logps/rejected": -399.45762711864404, "loss": 0.162, "rewards/chosen": 1.4365942028985508, "rewards/margins": 9.004390813068042, "rewards/rejected": -7.567796610169491, "step": 574 }, { "epoch": 0.3942406582104902, "grad_norm": 0.6017387551766551, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259114780.44444445, "logits/rejected": -295020890.5846154, "logps/chosen": -357.58730158730157, "logps/rejected": -348.0615384615385, "loss": 0.1947, "rewards/chosen": 1.4732142857142858, "rewards/margins": 7.55782967032967, "rewards/rejected": -6.084615384615384, "step": 575 }, { "epoch": 0.39492629413781283, "grad_norm": 0.5739714731487394, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270201478.7368421, "logits/rejected": -258510905.69014084, "logps/chosen": -261.3333333333333, "logps/rejected": -352.90140845070425, "loss": 0.1736, "rewards/chosen": 0.8103070175438597, "rewards/margins": 7.5004478626142825, "rewards/rejected": -6.690140845070423, "step": 576 }, { "epoch": 0.3956119300651354, "grad_norm": 0.5300494776515232, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246795278.84057972, "logits/rejected": -295733976.9491525, "logps/chosen": -225.85507246376812, "logps/rejected": -364.47457627118644, "loss": 0.2109, "rewards/chosen": 1.1965579710144927, "rewards/margins": 7.2728291574551704, "rewards/rejected": -6.076271186440678, "step": 577 }, { "epoch": 0.396297565992458, "grad_norm": 0.4996576345922773, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280942107.92727274, "logits/rejected": -271423179.39726025, "logps/chosen": -246.4, "logps/rejected": -352.43835616438355, "loss": 0.1733, "rewards/chosen": 1.3386363636363636, "rewards/margins": 7.6742528019925285, "rewards/rejected": -6.335616438356165, "step": 578 }, { "epoch": 0.3969832019197806, "grad_norm": 0.4487589685911126, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -308805632.0, "logits/rejected": -316932096.0, "logps/chosen": -337.375, "logps/rejected": -376.25, "loss": 0.1757, "rewards/chosen": 1.3232421875, "rewards/margins": 16695529.323242188, "rewards/rejected": -16695528.0, "step": 579 }, { "epoch": 0.39766883784710316, "grad_norm": 0.44552226271653034, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -322681787.73333335, "logits/rejected": -276083892.7058824, "logps/chosen": -252.0, "logps/rejected": -335.05882352941177, "loss": 0.1792, "rewards/chosen": 1.2947916666666666, "rewards/margins": 8.280085784313725, "rewards/rejected": -6.985294117647059, "step": 580 }, { "epoch": 0.3983544737744258, "grad_norm": 0.5130100613509561, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -207837153.43283582, "logits/rejected": -210402790.81967214, "logps/chosen": -227.1044776119403, "logps/rejected": -358.8196721311475, "loss": 0.1903, "rewards/chosen": 0.976679104477612, "rewards/margins": 7.099629924149744, "rewards/rejected": -6.122950819672131, "step": 581 }, { "epoch": 0.39904010970174836, "grad_norm": 0.4504949141405405, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218692482.24561402, "logits/rejected": -254021228.1690141, "logps/chosen": -289.12280701754383, "logps/rejected": -347.943661971831, "loss": 0.1818, "rewards/chosen": 0.8322368421052632, "rewards/margins": 5.550546701260193, "rewards/rejected": -4.71830985915493, "step": 582 }, { "epoch": 0.399725745629071, "grad_norm": 0.520334728863551, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -322463778.7118644, "logits/rejected": -343811353.9710145, "logps/chosen": -215.32203389830508, "logps/rejected": -366.1449275362319, "loss": 0.1871, "rewards/chosen": 0.6477754237288136, "rewards/margins": 6.850673974453452, "rewards/rejected": -6.202898550724638, "step": 583 }, { "epoch": 0.40041138155639355, "grad_norm": 0.5109936884593255, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -344320094.52307695, "logits/rejected": -276690911.49206346, "logps/chosen": -199.87692307692308, "logps/rejected": -397.7142857142857, "loss": 0.2005, "rewards/chosen": 0.8185096153846154, "rewards/margins": 7.485176282051283, "rewards/rejected": -6.666666666666667, "step": 584 }, { "epoch": 0.4010970174837161, "grad_norm": 0.6625228476417558, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264929769.07462686, "logits/rejected": -285487708.3278689, "logps/chosen": -252.65671641791045, "logps/rejected": -364.59016393442624, "loss": 0.2061, "rewards/chosen": 1.025186567164179, "rewards/margins": 7.582563616344507, "rewards/rejected": -6.557377049180328, "step": 585 }, { "epoch": 0.40178265341103875, "grad_norm": 0.4830921644153327, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241202873.50724638, "logits/rejected": -325023015.0508475, "logps/chosen": -327.6521739130435, "logps/rejected": -309.6949152542373, "loss": 0.1748, "rewards/chosen": 1.9076086956521738, "rewards/margins": 8.882184966838615, "rewards/rejected": -6.97457627118644, "step": 586 }, { "epoch": 0.4024682893383613, "grad_norm": 0.4664905857593518, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287237508.4137931, "logits/rejected": -319965476.5714286, "logps/chosen": -326.62068965517244, "logps/rejected": -355.2, "loss": 0.1523, "rewards/chosen": 1.831896551724138, "rewards/margins": 7.1176108374384235, "rewards/rejected": -5.285714285714286, "step": 587 }, { "epoch": 0.40315392526568394, "grad_norm": 0.6479013824700225, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -222491695.26153848, "logits/rejected": -298527922.7936508, "logps/chosen": -239.5076923076923, "logps/rejected": -384.0, "loss": 0.1876, "rewards/chosen": 1.1846153846153846, "rewards/margins": 7.561599511599512, "rewards/rejected": -6.376984126984127, "step": 588 }, { "epoch": 0.4038395611930065, "grad_norm": 0.537706551975951, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279309577.4814815, "logits/rejected": -270249209.0810811, "logps/chosen": -221.92592592592592, "logps/rejected": -334.7027027027027, "loss": 0.1862, "rewards/chosen": 0.8171296296296297, "rewards/margins": 7.729291791791792, "rewards/rejected": -6.912162162162162, "step": 589 }, { "epoch": 0.4045251971203291, "grad_norm": 0.5946525284045527, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -202899456.0, "logits/rejected": -218365952.0, "logps/chosen": -218.25, "logps/rejected": -350.0, "loss": 0.1905, "rewards/chosen": 1.013671875, "rewards/margins": 7.224609375, "rewards/rejected": -6.2109375, "step": 590 }, { "epoch": 0.4052108330476517, "grad_norm": 0.5279473175080351, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260712610.53968254, "logits/rejected": -188162930.2153846, "logps/chosen": -252.1904761904762, "logps/rejected": -304.0, "loss": 0.1695, "rewards/chosen": 1.4990079365079365, "rewards/margins": 9.060546398046398, "rewards/rejected": -7.561538461538461, "step": 591 }, { "epoch": 0.4058964689749743, "grad_norm": 0.5689165987728922, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280756224.0, "logits/rejected": -266076160.0, "logps/chosen": -314.25, "logps/rejected": -351.5, "loss": 0.1616, "rewards/chosen": 1.744140625, "rewards/margins": 7.298828125, "rewards/rejected": -5.5546875, "step": 592 }, { "epoch": 0.4065821049022969, "grad_norm": 0.5872038821480982, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -315771172.5714286, "logits/rejected": -310611512.8888889, "logps/chosen": -331.42857142857144, "logps/rejected": -335.77777777777777, "loss": 0.195, "rewards/chosen": 0.6149553571428571, "rewards/margins": 6.531622023809524, "rewards/rejected": -5.916666666666667, "step": 593 }, { "epoch": 0.4072677408296195, "grad_norm": 0.4706114597982538, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261527190.5882353, "logits/rejected": -244667733.33333334, "logps/chosen": -225.2941176470588, "logps/rejected": -379.73333333333335, "loss": 0.1853, "rewards/chosen": 1.1075367647058822, "rewards/margins": 8.390870098039215, "rewards/rejected": -7.283333333333333, "step": 594 }, { "epoch": 0.40795337675694204, "grad_norm": 0.6029621149074397, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -182007373.57575756, "logits/rejected": -184278775.7419355, "logps/chosen": -191.03030303030303, "logps/rejected": -307.61290322580646, "loss": 0.1814, "rewards/chosen": 1.1609848484848484, "rewards/margins": 7.943242913000978, "rewards/rejected": -6.782258064516129, "step": 595 }, { "epoch": 0.40863901268426467, "grad_norm": 0.4889249079432626, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256350173.2881356, "logits/rejected": -306609701.10144925, "logps/chosen": -217.08474576271186, "logps/rejected": -361.27536231884056, "loss": 0.1694, "rewards/chosen": 1.1207627118644068, "rewards/margins": 7.048298943748464, "rewards/rejected": -5.927536231884058, "step": 596 }, { "epoch": 0.40932464861158724, "grad_norm": 0.48622951770910156, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277211230.52307695, "logits/rejected": -292935517.46031743, "logps/chosen": -264.8615384615385, "logps/rejected": -322.2857142857143, "loss": 0.1676, "rewards/chosen": 1.0134615384615384, "rewards/margins": 7.902350427350427, "rewards/rejected": -6.888888888888889, "step": 597 }, { "epoch": 0.41001028453890986, "grad_norm": 0.49943488531402996, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281706985.07462686, "logits/rejected": -273661146.2295082, "logps/chosen": -230.56716417910448, "logps/rejected": -396.59016393442624, "loss": 0.2135, "rewards/chosen": 0.8031716417910447, "rewards/margins": 5.647433936873012, "rewards/rejected": -4.844262295081967, "step": 598 }, { "epoch": 0.41069592046623243, "grad_norm": 0.5104233864441208, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -275471062.7096774, "logits/rejected": -249878838.3030303, "logps/chosen": -266.5806451612903, "logps/rejected": -359.75757575757575, "loss": 0.1852, "rewards/chosen": 0.9949596774193549, "rewards/margins": 8.116171798631475, "rewards/rejected": -7.121212121212121, "step": 599 }, { "epoch": 0.411381556393555, "grad_norm": 0.41309877129713723, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240106714.2295082, "logits/rejected": -229872899.82089552, "logps/chosen": -266.4918032786885, "logps/rejected": -304.95522388059703, "loss": 0.1573, "rewards/chosen": 1.2346311475409837, "rewards/margins": 8.003287863958894, "rewards/rejected": -6.768656716417911, "step": 600 }, { "epoch": 0.41206719232087763, "grad_norm": 0.5167711447175244, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272248459.6363636, "logits/rejected": -255987844.12903225, "logps/chosen": -279.27272727272725, "logps/rejected": -382.4516129032258, "loss": 0.1902, "rewards/chosen": 1.3446969696969697, "rewards/margins": 8.675342130987293, "rewards/rejected": -7.330645161290323, "step": 601 }, { "epoch": 0.4127528282482002, "grad_norm": 0.453936740057143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266837625.9047619, "logits/rejected": -240043244.30769232, "logps/chosen": -203.93650793650792, "logps/rejected": -374.6461538461538, "loss": 0.178, "rewards/chosen": 0.8888888888888888, "rewards/margins": 8.01965811965812, "rewards/rejected": -7.130769230769231, "step": 602 }, { "epoch": 0.4134384641755228, "grad_norm": 0.4835073530877723, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244232259.147541, "logits/rejected": -255163926.92537314, "logps/chosen": -230.81967213114754, "logps/rejected": -384.0, "loss": 0.164, "rewards/chosen": 1.0020491803278688, "rewards/margins": 8.711004404208467, "rewards/rejected": -7.708955223880597, "step": 603 }, { "epoch": 0.4141241001028454, "grad_norm": 0.6976127537969471, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238128227.0967742, "logits/rejected": -269960657.45454544, "logps/chosen": -268.38709677419354, "logps/rejected": -336.4848484848485, "loss": 0.1797, "rewards/chosen": 1.4314516129032258, "rewards/margins": 8.35569403714565, "rewards/rejected": -6.924242424242424, "step": 604 }, { "epoch": 0.41480973603016796, "grad_norm": 0.40357722474091695, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264428956.6567164, "logits/rejected": -246707586.09836066, "logps/chosen": -309.97014925373134, "logps/rejected": -294.0327868852459, "loss": 0.1817, "rewards/chosen": 0.9794776119402985, "rewards/margins": 7.618821874235381, "rewards/rejected": -6.639344262295082, "step": 605 }, { "epoch": 0.4154953719574906, "grad_norm": 0.5030642646419462, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -321727789.1764706, "logits/rejected": -311217356.8, "logps/chosen": -217.41176470588235, "logps/rejected": -368.26666666666665, "loss": 0.1636, "rewards/chosen": 1.2141544117647058, "rewards/margins": 23453601.21415441, "rewards/rejected": -23453600.0, "step": 606 }, { "epoch": 0.41618100788481316, "grad_norm": 0.6167267964536253, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -328783352.35820895, "logits/rejected": -396602384.78688526, "logps/chosen": -288.23880597014926, "logps/rejected": -313.44262295081967, "loss": 0.1845, "rewards/chosen": 1.587686567164179, "rewards/margins": 21968647.882768534, "rewards/rejected": -21968646.295081966, "step": 607 }, { "epoch": 0.41686664381213573, "grad_norm": 0.47239561107810596, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -323386917.10144925, "logits/rejected": -311658114.1694915, "logps/chosen": -256.0, "logps/rejected": -363.3898305084746, "loss": 0.1718, "rewards/chosen": 1.7572463768115942, "rewards/margins": 8.435212478506509, "rewards/rejected": -6.677966101694915, "step": 608 }, { "epoch": 0.41755227973945835, "grad_norm": 0.6103588083057598, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254454442.66666666, "logits/rejected": -297645787.4285714, "logps/chosen": -239.11111111111111, "logps/rejected": -264.0, "loss": 0.2201, "rewards/chosen": 1.2552083333333333, "rewards/margins": 7.86235119047619, "rewards/rejected": -6.607142857142857, "step": 609 }, { "epoch": 0.4182379156667809, "grad_norm": 0.6068870326906552, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -314437499.87096775, "logits/rejected": -277586664.72727275, "logps/chosen": -238.70967741935485, "logps/rejected": -386.90909090909093, "loss": 0.1981, "rewards/chosen": 0.2767137096774194, "rewards/margins": 7.685804618768328, "rewards/rejected": -7.409090909090909, "step": 610 }, { "epoch": 0.41892355159410355, "grad_norm": 0.4308900001789938, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271131794.28571427, "logits/rejected": -237910243.55555555, "logps/chosen": -270.57142857142856, "logps/rejected": -370.22222222222223, "loss": 0.173, "rewards/chosen": 1.1473214285714286, "rewards/margins": 8.487599206349206, "rewards/rejected": -7.340277777777778, "step": 611 }, { "epoch": 0.4196091875214261, "grad_norm": 0.4233465887622749, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232484278.85714287, "logits/rejected": -302279150.3448276, "logps/chosen": -274.74285714285713, "logps/rejected": -324.41379310344826, "loss": 0.185, "rewards/chosen": 1.5522321428571428, "rewards/margins": 8.61257697044335, "rewards/rejected": -7.060344827586207, "step": 612 }, { "epoch": 0.4202948234487487, "grad_norm": 0.46959844627088587, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238189207.43661973, "logits/rejected": -211186885.6140351, "logps/chosen": -308.50704225352115, "logps/rejected": -401.96491228070175, "loss": 0.1855, "rewards/chosen": 1.573943661971831, "rewards/margins": 6.582715591796393, "rewards/rejected": -5.008771929824562, "step": 613 }, { "epoch": 0.4209804593760713, "grad_norm": 0.4738155429484321, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -236884273.6716418, "logits/rejected": -247257658.75409836, "logps/chosen": -264.35820895522386, "logps/rejected": -300.0655737704918, "loss": 0.2077, "rewards/chosen": 1.3801305970149254, "rewards/margins": 8.167015842916564, "rewards/rejected": -6.786885245901639, "step": 614 }, { "epoch": 0.4216660953033939, "grad_norm": 0.5278715084339888, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -335544320.0, "logits/rejected": -281617554.28571427, "logps/chosen": -260.0, "logps/rejected": -391.7142857142857, "loss": 0.2021, "rewards/chosen": 1.046875, "rewards/margins": 5.796875, "rewards/rejected": -4.75, "step": 615 }, { "epoch": 0.4223517312307165, "grad_norm": 0.516700992549592, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -236014619.6756757, "logits/rejected": -266571320.8888889, "logps/chosen": -236.97297297297297, "logps/rejected": -350.81481481481484, "loss": 0.2279, "rewards/chosen": 0.777027027027027, "rewards/margins": 8.258508508508509, "rewards/rejected": -7.481481481481482, "step": 616 }, { "epoch": 0.4230373671580391, "grad_norm": 0.5016677717636967, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265551872.0, "logits/rejected": -335544320.0, "logps/chosen": -276.25, "logps/rejected": -333.25, "loss": 0.1609, "rewards/chosen": 0.833984375, "rewards/margins": 6.669921875, "rewards/rejected": -5.8359375, "step": 617 }, { "epoch": 0.42372300308536165, "grad_norm": 0.5037851292197869, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258267446.3030303, "logits/rejected": -216886106.83870968, "logps/chosen": -232.24242424242425, "logps/rejected": -358.4516129032258, "loss": 0.1852, "rewards/chosen": 1.3598484848484849, "rewards/margins": 8.279203323558162, "rewards/rejected": -6.919354838709677, "step": 618 }, { "epoch": 0.4244086390126843, "grad_norm": 0.48738951440921163, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277700485.73134327, "logits/rejected": -360022553.1803279, "logps/chosen": -267.2238805970149, "logps/rejected": -382.95081967213116, "loss": 0.1734, "rewards/chosen": 1.4347014925373134, "rewards/margins": 7.426504771225838, "rewards/rejected": -5.991803278688525, "step": 619 }, { "epoch": 0.42509427494000684, "grad_norm": 0.45526971475070693, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -315354788.88135594, "logits/rejected": -279620266.6666667, "logps/chosen": -202.84745762711864, "logps/rejected": -388.6376811594203, "loss": 0.1796, "rewards/chosen": 0.8511652542372882, "rewards/margins": 7.8656580078604765, "rewards/rejected": -7.0144927536231885, "step": 620 }, { "epoch": 0.42577991086732947, "grad_norm": 0.5084466015400418, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233400681.4117647, "logits/rejected": -222577732.26666668, "logps/chosen": -260.70588235294116, "logps/rejected": -358.4, "loss": 0.177, "rewards/chosen": 1.7169117647058822, "rewards/margins": 8.675245098039216, "rewards/rejected": -6.958333333333333, "step": 621 }, { "epoch": 0.42646554679465204, "grad_norm": 0.5652133110314695, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234200866.5945946, "logits/rejected": -272785104.5925926, "logps/chosen": -227.67567567567568, "logps/rejected": -285.3333333333333, "loss": 0.205, "rewards/chosen": 1.2356418918918919, "rewards/margins": 7.430086336336337, "rewards/rejected": -6.194444444444445, "step": 622 }, { "epoch": 0.4271511827219746, "grad_norm": 0.6550011710597644, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -330246251.7894737, "logits/rejected": -275051822.87323946, "logps/chosen": -237.75438596491227, "logps/rejected": -337.1267605633803, "loss": 0.17, "rewards/chosen": 1.287280701754386, "rewards/margins": 7.406999011613541, "rewards/rejected": -6.119718309859155, "step": 623 }, { "epoch": 0.42783681864929723, "grad_norm": 0.554716459961649, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261208347.56923077, "logits/rejected": -260712610.53968254, "logps/chosen": -391.38461538461536, "logps/rejected": -355.04761904761904, "loss": 0.1854, "rewards/chosen": 1.603846153846154, "rewards/margins": 8.794322344322344, "rewards/rejected": -7.190476190476191, "step": 624 }, { "epoch": 0.4285224545766198, "grad_norm": 0.6578847368995578, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -194136356.57142857, "logits/rejected": -237773647.44827586, "logps/chosen": -201.82857142857142, "logps/rejected": -398.8965517241379, "loss": 0.2142, "rewards/chosen": 1.04375, "rewards/margins": 7.483405172413793, "rewards/rejected": -6.439655172413793, "step": 625 }, { "epoch": 0.42920809050394243, "grad_norm": 0.45366067744522576, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -312272697.8064516, "logits/rejected": -229034418.42424244, "logps/chosen": -214.83870967741936, "logps/rejected": -438.7878787878788, "loss": 0.1716, "rewards/chosen": 1.1179435483870968, "rewards/margins": 8.777034457478006, "rewards/rejected": -7.659090909090909, "step": 626 }, { "epoch": 0.429893726431265, "grad_norm": 0.5105240154282699, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232843790.62857142, "logits/rejected": -222442743.1724138, "logps/chosen": -249.14285714285714, "logps/rejected": -290.48275862068965, "loss": 0.1933, "rewards/chosen": 1.2053571428571428, "rewards/margins": 8.179495073891626, "rewards/rejected": -6.974137931034483, "step": 627 }, { "epoch": 0.43057936235858757, "grad_norm": 0.5024391033162907, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -395535577.2121212, "logits/rejected": -194967485.93548387, "logps/chosen": -300.3636363636364, "logps/rejected": -356.1290322580645, "loss": 0.1709, "rewards/chosen": 1.3390151515151516, "rewards/margins": 23238056.564821605, "rewards/rejected": -23238055.225806452, "step": 628 }, { "epoch": 0.4312649982859102, "grad_norm": 0.46438711714612546, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265307500.47457626, "logits/rejected": -333842283.5942029, "logps/chosen": -224.0, "logps/rejected": -305.15942028985506, "loss": 0.2071, "rewards/chosen": 0.6978283898305084, "rewards/margins": 7.502176215917465, "rewards/rejected": -6.804347826086956, "step": 629 }, { "epoch": 0.43195063421323276, "grad_norm": 0.48447208258331415, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240753049.6, "logits/rejected": -227972758.5882353, "logps/chosen": -264.8, "logps/rejected": -333.6470588235294, "loss": 0.1659, "rewards/chosen": 1.5854166666666667, "rewards/margins": 6.306004901960785, "rewards/rejected": -4.720588235294118, "step": 630 }, { "epoch": 0.4326362701405554, "grad_norm": 0.4534623077424963, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220856320.0, "logits/rejected": -252444672.0, "logps/chosen": -202.75, "logps/rejected": -362.0, "loss": 0.1627, "rewards/chosen": 1.06640625, "rewards/margins": 8.29296875, "rewards/rejected": -7.2265625, "step": 631 }, { "epoch": 0.43332190606787796, "grad_norm": 0.5069498282670581, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -315411660.8, "logits/rejected": -249359991.23287672, "logps/chosen": -258.6181818181818, "logps/rejected": -354.63013698630135, "loss": 0.1566, "rewards/chosen": 0.9954545454545455, "rewards/margins": 7.392714819427148, "rewards/rejected": -6.397260273972603, "step": 632 }, { "epoch": 0.43400754199520053, "grad_norm": 0.5172939799736281, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271156085.6216216, "logits/rejected": -239852050.96296296, "logps/chosen": -240.43243243243242, "logps/rejected": -388.14814814814815, "loss": 0.21, "rewards/chosen": 1.2364864864864864, "rewards/margins": 8.3013013013013, "rewards/rejected": -7.064814814814815, "step": 633 }, { "epoch": 0.43469317792252316, "grad_norm": 0.5175464378229888, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280341867.3548387, "logits/rejected": -271994259.3939394, "logps/chosen": -244.1290322580645, "logps/rejected": -392.72727272727275, "loss": 0.1736, "rewards/chosen": 1.1340725806451613, "rewards/margins": 7.853769550342131, "rewards/rejected": -6.71969696969697, "step": 634 }, { "epoch": 0.4353788138498457, "grad_norm": 0.4685644009355057, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288310004.1846154, "logits/rejected": -242071259.42857143, "logps/chosen": -282.5846153846154, "logps/rejected": -389.07936507936506, "loss": 0.1916, "rewards/chosen": 1.023076923076923, "rewards/margins": 7.344505494505494, "rewards/rejected": -6.321428571428571, "step": 635 }, { "epoch": 0.43606444977716835, "grad_norm": 0.4784299852387135, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -315349522.962963, "logits/rejected": -245310104.2162162, "logps/chosen": -325.6296296296296, "logps/rejected": -356.7567567567568, "loss": 0.1353, "rewards/chosen": 2.0162037037037037, "rewards/margins": 7.1648523523523515, "rewards/rejected": -5.148648648648648, "step": 636 }, { "epoch": 0.4367500857044909, "grad_norm": 0.427562488086772, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254832307.8918919, "logits/rejected": -386186657.1851852, "logps/chosen": -294.05405405405406, "logps/rejected": -342.51851851851853, "loss": 0.1777, "rewards/chosen": 1.8631756756756757, "rewards/margins": 6.853916416416416, "rewards/rejected": -4.9907407407407405, "step": 637 }, { "epoch": 0.4374357216318135, "grad_norm": 0.48669712593182274, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238551040.0, "logits/rejected": -226754560.0, "logps/chosen": -242.0, "logps/rejected": -348.0, "loss": 0.1873, "rewards/chosen": 0.8720703125, "rewards/margins": 7.4658203125, "rewards/rejected": -6.59375, "step": 638 }, { "epoch": 0.4381213575591361, "grad_norm": 0.5483106227232574, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227100286.14492753, "logits/rejected": -245117969.3559322, "logps/chosen": -315.1304347826087, "logps/rejected": -357.1525423728813, "loss": 0.1836, "rewards/chosen": 1.6775362318840579, "rewards/margins": 7.253807418324736, "rewards/rejected": -5.576271186440678, "step": 639 }, { "epoch": 0.4388069934864587, "grad_norm": 0.45768115243520563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -358117301.5272727, "logits/rejected": -312102182.5753425, "logps/chosen": -220.8, "logps/rejected": -349.36986301369865, "loss": 0.1471, "rewards/chosen": 1.2034090909090909, "rewards/margins": 8.456833748443337, "rewards/rejected": -7.2534246575342465, "step": 640 }, { "epoch": 0.43949262941378126, "grad_norm": 0.5368741523157952, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -392476839.86885244, "logits/rejected": -369849970.6268657, "logps/chosen": -216.65573770491804, "logps/rejected": -338.14925373134326, "loss": 0.1737, "rewards/chosen": 0.6987704918032787, "rewards/margins": 7.564442133594324, "rewards/rejected": -6.865671641791045, "step": 641 }, { "epoch": 0.4401782653411039, "grad_norm": 0.51248540962371, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223496484.57142857, "logits/rejected": -228589568.0, "logps/chosen": -203.42857142857142, "logps/rejected": -379.1111111111111, "loss": 0.1621, "rewards/chosen": 0.9765625, "rewards/margins": 8.580729166666668, "rewards/rejected": -7.604166666666667, "step": 642 }, { "epoch": 0.44086390126842645, "grad_norm": 0.5082718439121618, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -282416469.3333333, "logits/rejected": -260434014.52307692, "logps/chosen": -271.36507936507934, "logps/rejected": -325.9076923076923, "loss": 0.1885, "rewards/chosen": 1.2113095238095237, "rewards/margins": 8.1882326007326, "rewards/rejected": -6.976923076923077, "step": 643 }, { "epoch": 0.4415495371957491, "grad_norm": 0.5730320189420638, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -315691281.06666666, "logits/rejected": -326908988.2352941, "logps/chosen": -259.73333333333335, "logps/rejected": -318.11764705882354, "loss": 0.1746, "rewards/chosen": 1.1369791666666667, "rewards/margins": 8.085508578431373, "rewards/rejected": -6.948529411764706, "step": 644 }, { "epoch": 0.44223517312307165, "grad_norm": 0.4774060030402224, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -349175808.0, "logits/rejected": -321388544.0, "logps/chosen": -281.5, "logps/rejected": -392.0, "loss": 0.166, "rewards/chosen": 1.42578125, "rewards/margins": 8.82421875, "rewards/rejected": -7.3984375, "step": 645 }, { "epoch": 0.4429208090503942, "grad_norm": 0.438464921480374, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221751028.86956522, "logits/rejected": -227487674.57627118, "logps/chosen": -288.0, "logps/rejected": -356.06779661016947, "loss": 0.1643, "rewards/chosen": 1.6014492753623188, "rewards/margins": 9.796364529599607, "rewards/rejected": -8.194915254237289, "step": 646 }, { "epoch": 0.44360644497771684, "grad_norm": 0.531446081322958, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260504408.43636364, "logits/rejected": -268895105.75342464, "logps/chosen": -245.38181818181818, "logps/rejected": -328.7671232876712, "loss": 0.1559, "rewards/chosen": 1.3284090909090909, "rewards/margins": 8.472244707347448, "rewards/rejected": -7.1438356164383565, "step": 647 }, { "epoch": 0.4442920809050394, "grad_norm": 0.5603125522112095, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -219456809.29032257, "logits/rejected": -229288618.66666666, "logps/chosen": -192.6451612903226, "logps/rejected": -379.1515151515151, "loss": 0.1829, "rewards/chosen": 1.1693548387096775, "rewards/margins": 7.616324535679375, "rewards/rejected": -6.446969696969697, "step": 648 }, { "epoch": 0.44497771683236204, "grad_norm": 0.5962168733438524, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -236079396.57142857, "logits/rejected": -205376264.8275862, "logps/chosen": -271.54285714285714, "logps/rejected": -360.2758620689655, "loss": 0.1837, "rewards/chosen": 1.4642857142857142, "rewards/margins": 8.800492610837438, "rewards/rejected": -7.336206896551724, "step": 649 }, { "epoch": 0.4456633527596846, "grad_norm": 0.44262754430376444, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278360006.30985916, "logits/rejected": -290216403.0877193, "logps/chosen": -208.90140845070422, "logps/rejected": -316.0701754385965, "loss": 0.2137, "rewards/chosen": 1.1276408450704225, "rewards/margins": 7.531149617000247, "rewards/rejected": -6.4035087719298245, "step": 650 }, { "epoch": 0.4463489886870072, "grad_norm": 0.6842247318952224, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -301510538.9714286, "logits/rejected": -315295955.86206895, "logps/chosen": -294.4, "logps/rejected": -341.51724137931035, "loss": 0.1955, "rewards/chosen": 1.4660714285714285, "rewards/margins": 8.026416256157635, "rewards/rejected": -6.560344827586207, "step": 651 }, { "epoch": 0.4470346246143298, "grad_norm": 0.5355092996611434, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250442499.71014494, "logits/rejected": -381894933.69491524, "logps/chosen": -257.15942028985506, "logps/rejected": -396.47457627118644, "loss": 0.1582, "rewards/chosen": 1.3822463768115942, "rewards/margins": 8.577161631048883, "rewards/rejected": -7.194915254237288, "step": 652 }, { "epoch": 0.44772026054165237, "grad_norm": 0.5195895170629371, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -219076400.23188406, "logits/rejected": -271279051.9322034, "logps/chosen": -318.1449275362319, "logps/rejected": -341.6949152542373, "loss": 0.171, "rewards/chosen": 1.266304347826087, "rewards/margins": 8.240880619012527, "rewards/rejected": -6.97457627118644, "step": 653 }, { "epoch": 0.448405896468975, "grad_norm": 0.5710849694938456, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -301743164.2352941, "logits/rejected": -285771912.53333336, "logps/chosen": -356.0, "logps/rejected": -293.8666666666667, "loss": 0.1905, "rewards/chosen": 1.322610294117647, "rewards/margins": 7.6642769607843135, "rewards/rejected": -6.341666666666667, "step": 654 }, { "epoch": 0.44909153239629757, "grad_norm": 0.49955974103935147, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -231419058.7936508, "logits/rejected": -271016566.15384614, "logps/chosen": -264.3809523809524, "logps/rejected": -334.03076923076924, "loss": 0.2115, "rewards/chosen": 1.0818452380952381, "rewards/margins": 6.6895375457875454, "rewards/rejected": -5.607692307692307, "step": 655 }, { "epoch": 0.44977716832362014, "grad_norm": 0.42327673569877805, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253583494.29508197, "logits/rejected": -335544320.0, "logps/chosen": -247.08196721311475, "logps/rejected": -326.6865671641791, "loss": 0.1735, "rewards/chosen": 1.1075819672131149, "rewards/margins": 8.070268534377295, "rewards/rejected": -6.962686567164179, "step": 656 }, { "epoch": 0.45046280425094276, "grad_norm": 0.5299729674850512, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268746145.1851852, "logits/rejected": -282038604.1081081, "logps/chosen": -317.9259259259259, "logps/rejected": -427.6756756756757, "loss": 0.1534, "rewards/chosen": 1.1516203703703705, "rewards/margins": 8.77999874874875, "rewards/rejected": -7.628378378378378, "step": 657 }, { "epoch": 0.45114844017826533, "grad_norm": 0.5867501274920175, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243516355.7647059, "logits/rejected": -336103560.53333336, "logps/chosen": -253.41176470588235, "logps/rejected": -343.46666666666664, "loss": 0.206, "rewards/chosen": 1.0174632352941178, "rewards/margins": 7.467463235294118, "rewards/rejected": -6.45, "step": 658 }, { "epoch": 0.45183407610558796, "grad_norm": 0.46405309515704296, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -212922608.94117647, "logits/rejected": -216146466.13333333, "logps/chosen": -209.88235294117646, "logps/rejected": -342.4, "loss": 0.1773, "rewards/chosen": 1.2766544117647058, "rewards/margins": 8.18498774509804, "rewards/rejected": -6.908333333333333, "step": 659 }, { "epoch": 0.4525197120329105, "grad_norm": 0.47810797332497257, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262588850.42424244, "logits/rejected": -309837295.483871, "logps/chosen": -248.96969696969697, "logps/rejected": -409.80645161290323, "loss": 0.1761, "rewards/chosen": 1.271780303030303, "rewards/margins": 7.610489980449658, "rewards/rejected": -6.338709677419355, "step": 660 }, { "epoch": 0.4532053479602331, "grad_norm": 0.40977339800646034, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258648746.66666666, "logits/rejected": -259747254.85714287, "logps/chosen": -274.22222222222223, "logps/rejected": -331.7142857142857, "loss": 0.1932, "rewards/chosen": 1.3732638888888888, "rewards/margins": 6.373263888888889, "rewards/rejected": -5.0, "step": 661 }, { "epoch": 0.4538909838875557, "grad_norm": 0.548469319616029, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -347008750.93333334, "logits/rejected": -304950573.1764706, "logps/chosen": -209.86666666666667, "logps/rejected": -425.88235294117646, "loss": 0.171, "rewards/chosen": 1.0454427083333333, "rewards/margins": 8.096913296568628, "rewards/rejected": -7.051470588235294, "step": 662 }, { "epoch": 0.4545766198148783, "grad_norm": 0.5370654734464015, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235459548.68965518, "logits/rejected": -291444209.37142855, "logps/chosen": -269.51724137931035, "logps/rejected": -387.65714285714284, "loss": 0.1809, "rewards/chosen": 0.9547413793103449, "rewards/margins": 7.797598522167488, "rewards/rejected": -6.8428571428571425, "step": 663 }, { "epoch": 0.4552622557422009, "grad_norm": 0.5447897667138676, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -236716032.0, "logits/rejected": -252968960.0, "logps/chosen": -211.75, "logps/rejected": -324.5, "loss": 0.1967, "rewards/chosen": 1.3291015625, "rewards/margins": 6.0712890625, "rewards/rejected": -4.7421875, "step": 664 }, { "epoch": 0.4559478916695235, "grad_norm": 0.5858216843128354, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -219521600.90140846, "logits/rejected": -201620929.12280703, "logps/chosen": -214.08450704225353, "logps/rejected": -264.42105263157896, "loss": 0.2046, "rewards/chosen": 1.022887323943662, "rewards/margins": 7.777273288855943, "rewards/rejected": -6.754385964912281, "step": 665 }, { "epoch": 0.45663352759684606, "grad_norm": 0.4467506751517846, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -230171630.0350877, "logits/rejected": -227319461.85915494, "logps/chosen": -239.57894736842104, "logps/rejected": -356.056338028169, "loss": 0.1496, "rewards/chosen": 1.3969298245614035, "rewards/margins": 6.361718556955769, "rewards/rejected": -4.964788732394366, "step": 666 }, { "epoch": 0.4573191635241687, "grad_norm": 0.5140506852094673, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -208838778.26865673, "logits/rejected": -294013834.4918033, "logps/chosen": -222.56716417910448, "logps/rejected": -376.91803278688525, "loss": 0.1922, "rewards/chosen": 1.1707089552238805, "rewards/margins": 7.900217151945192, "rewards/rejected": -6.729508196721311, "step": 667 }, { "epoch": 0.45800479945149125, "grad_norm": 0.5997084718709048, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276397524.6101695, "logits/rejected": -235124172.057971, "logps/chosen": -289.89830508474574, "logps/rejected": -321.15942028985506, "loss": 0.1805, "rewards/chosen": 1.1430084745762712, "rewards/margins": 6.367646155735692, "rewards/rejected": -5.22463768115942, "step": 668 }, { "epoch": 0.4586904353788139, "grad_norm": 0.4702211910970126, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -307116259.5555556, "logits/rejected": -331050422.85714287, "logps/chosen": -262.6666666666667, "logps/rejected": -331.14285714285717, "loss": 0.1968, "rewards/chosen": 0.984375, "rewards/margins": 6.787946428571429, "rewards/rejected": -5.803571428571429, "step": 669 }, { "epoch": 0.45937607130613645, "grad_norm": 0.5551879068544594, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237965071.05882353, "logits/rejected": -237397606.4, "logps/chosen": -202.11764705882354, "logps/rejected": -368.26666666666665, "loss": 0.1972, "rewards/chosen": 0.9016544117647058, "rewards/margins": 5.772487745098039, "rewards/rejected": -4.870833333333334, "step": 670 }, { "epoch": 0.460061707233459, "grad_norm": 0.5047311486904867, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241990013.83050847, "logits/rejected": -266247123.47826087, "logps/chosen": -294.50847457627117, "logps/rejected": -320.92753623188406, "loss": 0.1909, "rewards/chosen": 1.0307203389830508, "rewards/margins": 7.530720338983051, "rewards/rejected": -6.5, "step": 671 }, { "epoch": 0.46074734316078164, "grad_norm": 0.5581439388440469, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259776247.7419355, "logits/rejected": -283179070.06060606, "logps/chosen": -261.4193548387097, "logps/rejected": -398.54545454545456, "loss": 0.1609, "rewards/chosen": 1.6098790322580645, "rewards/margins": 9.647757820136851, "rewards/rejected": -8.037878787878787, "step": 672 }, { "epoch": 0.4614329790881042, "grad_norm": 0.49047297058638717, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233638267.25925925, "logits/rejected": -221504595.02702704, "logps/chosen": -268.5925925925926, "logps/rejected": -326.9189189189189, "loss": 0.1508, "rewards/chosen": 1.1944444444444444, "rewards/margins": 9.343093093093094, "rewards/rejected": -8.14864864864865, "step": 673 }, { "epoch": 0.4621186150154268, "grad_norm": 0.5179599691573655, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281484401.7777778, "logits/rejected": -273228946.28571427, "logps/chosen": -291.1111111111111, "logps/rejected": -381.42857142857144, "loss": 0.1918, "rewards/chosen": 1.1041666666666667, "rewards/margins": 30521411.389880955, "rewards/rejected": -30521410.285714287, "step": 674 }, { "epoch": 0.4628042509427494, "grad_norm": 0.6692756205451389, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -230686720.0, "logits/rejected": -239634568.53333333, "logps/chosen": -262.11764705882354, "logps/rejected": -362.4, "loss": 0.1848, "rewards/chosen": 1.25, "rewards/margins": 8.216666666666667, "rewards/rejected": -6.966666666666667, "step": 675 }, { "epoch": 0.463489886870072, "grad_norm": 0.42566881346359936, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -298734607.2835821, "logits/rejected": -270085673.9672131, "logps/chosen": -224.3582089552239, "logps/rejected": -313.7049180327869, "loss": 0.1702, "rewards/chosen": 1.0615671641791045, "rewards/margins": 8.020583557621727, "rewards/rejected": -6.959016393442623, "step": 676 }, { "epoch": 0.4641755227973946, "grad_norm": 0.49233067071911685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279087656.63492066, "logits/rejected": -315669771.8153846, "logps/chosen": -312.3809523809524, "logps/rejected": -395.5692307692308, "loss": 0.1838, "rewards/chosen": 1.3234126984126984, "rewards/margins": 8.469566544566545, "rewards/rejected": -7.1461538461538465, "step": 677 }, { "epoch": 0.46486115872471717, "grad_norm": 0.5640509757940456, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248582417.06666666, "logits/rejected": -296315241.4117647, "logps/chosen": -249.06666666666666, "logps/rejected": -378.11764705882354, "loss": 0.1644, "rewards/chosen": 1.0020833333333334, "rewards/margins": 8.112377450980393, "rewards/rejected": -7.110294117647059, "step": 678 }, { "epoch": 0.46554679465203974, "grad_norm": 0.4117696081028903, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297414283.6363636, "logits/rejected": -280156524.7123288, "logps/chosen": -211.34545454545454, "logps/rejected": -391.8904109589041, "loss": 0.1511, "rewards/chosen": 0.9795454545454545, "rewards/margins": 8.431600249066001, "rewards/rejected": -7.4520547945205475, "step": 679 }, { "epoch": 0.46623243057936237, "grad_norm": 0.5970087231695544, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266682612.53731343, "logits/rejected": -274486255.21311474, "logps/chosen": -311.64179104477614, "logps/rejected": -277.24590163934425, "loss": 0.205, "rewards/chosen": 0.9123134328358209, "rewards/margins": 6.027067531196477, "rewards/rejected": -5.114754098360656, "step": 680 }, { "epoch": 0.46691806650668494, "grad_norm": 0.4823352168921985, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250851643.07692307, "logits/rejected": -265786421.89473686, "logps/chosen": -199.69230769230768, "logps/rejected": -371.36842105263156, "loss": 0.1627, "rewards/chosen": 1.0348557692307692, "rewards/margins": 8.53485576923077, "rewards/rejected": -7.5, "step": 681 }, { "epoch": 0.46760370243400756, "grad_norm": 0.5841010540723979, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288474908.4444444, "logits/rejected": -282815926.85714287, "logps/chosen": -233.77777777777777, "logps/rejected": -356.57142857142856, "loss": 0.1995, "rewards/chosen": 1.3246527777777777, "rewards/margins": 8.226438492063492, "rewards/rejected": -6.901785714285714, "step": 682 }, { "epoch": 0.46828933836133013, "grad_norm": 0.4866241845974519, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255605820.2352941, "logits/rejected": -250539758.93333334, "logps/chosen": -325.4117647058824, "logps/rejected": -409.06666666666666, "loss": 0.1745, "rewards/chosen": 1.4577205882352942, "rewards/margins": 9.416053921568627, "rewards/rejected": -7.958333333333333, "step": 683 }, { "epoch": 0.4689749742886527, "grad_norm": 0.5268707543483612, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232783872.0, "logits/rejected": -242570581.33333334, "logps/chosen": -292.57142857142856, "logps/rejected": -373.77777777777777, "loss": 0.145, "rewards/chosen": 1.5329241071428572, "rewards/margins": 9.046812996031747, "rewards/rejected": -7.513888888888889, "step": 684 }, { "epoch": 0.4696606102159753, "grad_norm": 0.39835773162055954, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -216679327.3962264, "logits/rejected": -184773072.21333334, "logps/chosen": -242.8679245283019, "logps/rejected": -323.84, "loss": 0.1392, "rewards/chosen": 1.5896226415094339, "rewards/margins": 9.376289308176101, "rewards/rejected": -7.786666666666667, "step": 685 }, { "epoch": 0.4703462461432979, "grad_norm": 0.5912720783629015, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232660510.11764705, "logits/rejected": -217544567.46666667, "logps/chosen": -246.11764705882354, "logps/rejected": -395.2, "loss": 0.1964, "rewards/chosen": 1.1911764705882353, "rewards/margins": 8.016176470588235, "rewards/rejected": -6.825, "step": 686 }, { "epoch": 0.4710318820706205, "grad_norm": 0.49652583958479196, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -190141781.33333334, "logits/rejected": -232660510.11764705, "logps/chosen": -236.0, "logps/rejected": -358.5882352941176, "loss": 0.1976, "rewards/chosen": 1.0427083333333333, "rewards/margins": 8.028002450980392, "rewards/rejected": -6.985294117647059, "step": 687 }, { "epoch": 0.4717175179979431, "grad_norm": 0.4664478260816547, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274338550.5185185, "logits/rejected": -299495977.5135135, "logps/chosen": -234.37037037037038, "logps/rejected": -436.3243243243243, "loss": 0.1341, "rewards/chosen": 1.4722222222222223, "rewards/margins": 8.654654654654655, "rewards/rejected": -7.1824324324324325, "step": 688 }, { "epoch": 0.47240315392526566, "grad_norm": 0.4759282504567211, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269260564.9836066, "logits/rejected": -250907021.37313432, "logps/chosen": -276.1967213114754, "logps/rejected": -361.55223880597015, "loss": 0.1409, "rewards/chosen": 1.4938524590163935, "rewards/margins": 9.464001712747738, "rewards/rejected": -7.970149253731344, "step": 689 }, { "epoch": 0.4730887898525883, "grad_norm": 0.5179300170332058, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261424082.14925373, "logits/rejected": -224154607.21311477, "logps/chosen": -244.29850746268656, "logps/rejected": -344.91803278688525, "loss": 0.1747, "rewards/chosen": 1.1865671641791045, "rewards/margins": 5.293124541228284, "rewards/rejected": -4.10655737704918, "step": 690 }, { "epoch": 0.47377442577991086, "grad_norm": 0.5337786110513396, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -320375878.1369863, "logits/rejected": -294668921.0181818, "logps/chosen": -313.4246575342466, "logps/rejected": -400.2909090909091, "loss": 0.1955, "rewards/chosen": 1.3013698630136987, "rewards/margins": 5.474097135740972, "rewards/rejected": -4.172727272727273, "step": 691 }, { "epoch": 0.4744600617072335, "grad_norm": 0.6806719350763459, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -203656760.8888889, "logits/rejected": -330151643.4285714, "logps/chosen": -269.3333333333333, "logps/rejected": -381.14285714285717, "loss": 0.2112, "rewards/chosen": 1.2808159722222223, "rewards/margins": 9.490637400793652, "rewards/rejected": -8.209821428571429, "step": 692 }, { "epoch": 0.47514569763455605, "grad_norm": 0.5773133800722844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243711137.68421054, "logits/rejected": -231572840.56338027, "logps/chosen": -280.140350877193, "logps/rejected": -441.23943661971833, "loss": 0.1699, "rewards/chosen": 1.0548245614035088, "rewards/margins": 9.104120336051396, "rewards/rejected": -8.049295774647888, "step": 693 }, { "epoch": 0.4758313335618786, "grad_norm": 0.4534879370438901, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250008022.03278688, "logits/rejected": -292224045.8507463, "logps/chosen": -220.85245901639345, "logps/rejected": -379.7014925373134, "loss": 0.1686, "rewards/chosen": 1.2105532786885247, "rewards/margins": 9.038911487643748, "rewards/rejected": -7.8283582089552235, "step": 694 }, { "epoch": 0.47651696948920125, "grad_norm": 0.6817519492073819, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -321606325.16923076, "logits/rejected": -294533347.5555556, "logps/chosen": -256.4923076923077, "logps/rejected": -433.015873015873, "loss": 0.1978, "rewards/chosen": 0.9557692307692308, "rewards/margins": 8.130372405372405, "rewards/rejected": -7.174603174603175, "step": 695 }, { "epoch": 0.4772026054165238, "grad_norm": 0.4425301606739517, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279914603.7894737, "logits/rejected": -276705914.5915493, "logps/chosen": -189.19298245614036, "logps/rejected": -418.2535211267606, "loss": 0.1805, "rewards/chosen": 0.9528508771929824, "rewards/margins": 4.213414257474673, "rewards/rejected": -3.26056338028169, "step": 696 }, { "epoch": 0.47788824134384644, "grad_norm": 0.4478000389499797, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292838679.27272725, "logits/rejected": -270600258.0645161, "logps/chosen": -236.6060606060606, "logps/rejected": -386.06451612903226, "loss": 0.1822, "rewards/chosen": 1.081439393939394, "rewards/margins": 7.3959555229716525, "rewards/rejected": -6.314516129032258, "step": 697 }, { "epoch": 0.478573877271169, "grad_norm": 0.4689336462551415, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -275712320.7710843, "logits/rejected": -218663048.53333333, "logps/chosen": -227.0843373493976, "logps/rejected": -349.8666666666667, "loss": 0.1879, "rewards/chosen": 1.6024096385542168, "rewards/margins": 8.019076305220883, "rewards/rejected": -6.416666666666667, "step": 698 }, { "epoch": 0.4792595131984916, "grad_norm": 0.5292190425443927, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241641991.64179105, "logits/rejected": -216453590.03278688, "logps/chosen": -312.35820895522386, "logps/rejected": -332.59016393442624, "loss": 0.1594, "rewards/chosen": 1.6287313432835822, "rewards/margins": 8.038567408857354, "rewards/rejected": -6.409836065573771, "step": 699 }, { "epoch": 0.4799451491258142, "grad_norm": 0.5297415211793217, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -315806418.8235294, "logits/rejected": -416634197.3333333, "logps/chosen": -304.2352941176471, "logps/rejected": -355.73333333333335, "loss": 0.1782, "rewards/chosen": 1.443014705882353, "rewards/margins": 6.118014705882353, "rewards/rejected": -4.675, "step": 700 }, { "epoch": 0.4806307850531368, "grad_norm": 0.48120731317799087, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248082767.73770493, "logits/rejected": -228620868.7761194, "logps/chosen": -259.672131147541, "logps/rejected": -364.4179104477612, "loss": 0.1607, "rewards/chosen": 1.8073770491803278, "rewards/margins": 8.411854661120627, "rewards/rejected": -6.604477611940299, "step": 701 }, { "epoch": 0.4813164209804594, "grad_norm": 0.5092148017919162, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304116577.35211265, "logits/rejected": -282857975.01754385, "logps/chosen": -238.42253521126761, "logps/rejected": -308.0701754385965, "loss": 0.1974, "rewards/chosen": 1.3485915492957747, "rewards/margins": 7.795959970348407, "rewards/rejected": -6.447368421052632, "step": 702 }, { "epoch": 0.482002056907782, "grad_norm": 0.49831551891401893, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299702085.8181818, "logits/rejected": -191449682.58064517, "logps/chosen": -185.45454545454547, "logps/rejected": -353.5483870967742, "loss": 0.1859, "rewards/chosen": 1.0672348484848484, "rewards/margins": 8.07529936461388, "rewards/rejected": -7.008064516129032, "step": 703 }, { "epoch": 0.48268769283510454, "grad_norm": 0.8042564381949061, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237750810.9473684, "logits/rejected": -252303517.53846154, "logps/chosen": -269.89473684210526, "logps/rejected": -345.53846153846155, "loss": 0.1879, "rewards/chosen": 1.5707236842105263, "rewards/margins": 7.532262145748988, "rewards/rejected": -5.961538461538462, "step": 704 }, { "epoch": 0.48337332876242717, "grad_norm": 0.4767232868809925, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287170013.8666667, "logits/rejected": -277810959.0588235, "logps/chosen": -252.26666666666668, "logps/rejected": -353.88235294117646, "loss": 0.1702, "rewards/chosen": 1.1333333333333333, "rewards/margins": 7.677450980392157, "rewards/rejected": -6.544117647058823, "step": 705 }, { "epoch": 0.48405896468974974, "grad_norm": 0.5387664392020065, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241845151.3962264, "logits/rejected": -251881936.21333334, "logps/chosen": -211.62264150943398, "logps/rejected": -386.9866666666667, "loss": 0.1649, "rewards/chosen": 1.1126179245283019, "rewards/margins": 7.339284591194969, "rewards/rejected": -6.226666666666667, "step": 706 }, { "epoch": 0.4847446006170723, "grad_norm": 0.4515747832334138, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269212178.962963, "logits/rejected": -313552563.8918919, "logps/chosen": -266.6666666666667, "logps/rejected": -356.7567567567568, "loss": 0.1438, "rewards/chosen": 1.4768518518518519, "rewards/margins": 8.760635635635635, "rewards/rejected": -7.283783783783784, "step": 707 }, { "epoch": 0.48543023654439493, "grad_norm": 0.6592219157598536, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270242233.1076923, "logits/rejected": -242337564.44444445, "logps/chosen": -225.23076923076923, "logps/rejected": -435.8095238095238, "loss": 0.1927, "rewards/chosen": 0.9644230769230769, "rewards/margins": 7.64696275946276, "rewards/rejected": -6.682539682539683, "step": 708 }, { "epoch": 0.4861158724717175, "grad_norm": 0.5281925891709622, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252948795.07692307, "logits/rejected": -282815926.85714287, "logps/chosen": -308.18461538461537, "logps/rejected": -402.7936507936508, "loss": 0.193, "rewards/chosen": 1.3384615384615384, "rewards/margins": 7.941636141636141, "rewards/rejected": -6.603174603174603, "step": 709 }, { "epoch": 0.48680150839904013, "grad_norm": 0.49590311631359046, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -225609404.63157895, "logits/rejected": -328219056.6760563, "logps/chosen": -295.29824561403507, "logps/rejected": -331.71830985915494, "loss": 0.1621, "rewards/chosen": 1.3958333333333333, "rewards/margins": 8.416960093896714, "rewards/rejected": -7.02112676056338, "step": 710 }, { "epoch": 0.4874871443263627, "grad_norm": 0.5216770206760418, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -323826899.3015873, "logits/rejected": -300699332.9230769, "logps/chosen": -272.76190476190476, "logps/rejected": -350.2769230769231, "loss": 0.17, "rewards/chosen": 1.1468253968253967, "rewards/margins": 7.9852869352869345, "rewards/rejected": -6.838461538461538, "step": 711 }, { "epoch": 0.48817278025368527, "grad_norm": 0.48184337052434467, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -308606254.87323946, "logits/rejected": -318472766.877193, "logps/chosen": -257.5774647887324, "logps/rejected": -367.43859649122805, "loss": 0.182, "rewards/chosen": 1.5422535211267605, "rewards/margins": 8.840499135161847, "rewards/rejected": -7.298245614035087, "step": 712 }, { "epoch": 0.4888584161810079, "grad_norm": 0.4997144849112379, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -275404453.4153846, "logits/rejected": -271098506.15873015, "logps/chosen": -235.56923076923076, "logps/rejected": -294.3492063492063, "loss": 0.1703, "rewards/chosen": 1.3528846153846155, "rewards/margins": 8.440186202686203, "rewards/rejected": -7.087301587301587, "step": 713 }, { "epoch": 0.48954405210833046, "grad_norm": 0.5228396594729003, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -322311586.25352114, "logits/rejected": -373219471.71929824, "logps/chosen": -266.8169014084507, "logps/rejected": -295.29824561403507, "loss": 0.1917, "rewards/chosen": 1.3362676056338028, "rewards/margins": 8.44153076352854, "rewards/rejected": -7.105263157894737, "step": 714 }, { "epoch": 0.4902296880356531, "grad_norm": 0.47256932424203085, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221635853.47368422, "logits/rejected": -254493825.8028169, "logps/chosen": -217.12280701754386, "logps/rejected": -396.16901408450707, "loss": 0.168, "rewards/chosen": 1.1962719298245614, "rewards/margins": 7.104722634049914, "rewards/rejected": -5.908450704225352, "step": 715 }, { "epoch": 0.49091532396297566, "grad_norm": 0.5178212272385004, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235139135.0153846, "logits/rejected": -284413756.95238096, "logps/chosen": -230.15384615384616, "logps/rejected": -364.6984126984127, "loss": 0.1873, "rewards/chosen": 0.9783653846153846, "rewards/margins": 7.42280982905983, "rewards/rejected": -6.444444444444445, "step": 716 }, { "epoch": 0.49160095989029823, "grad_norm": 0.6547250114428167, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257802009.23943663, "logits/rejected": -308759641.8245614, "logps/chosen": -243.6056338028169, "logps/rejected": -417.12280701754383, "loss": 0.213, "rewards/chosen": 1.1681338028169015, "rewards/margins": 8.747081171237953, "rewards/rejected": -7.578947368421052, "step": 717 }, { "epoch": 0.49228659581762085, "grad_norm": 0.4943669981643289, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -208553700.43076923, "logits/rejected": -218636418.03174603, "logps/chosen": -302.03076923076924, "logps/rejected": -332.1904761904762, "loss": 0.1626, "rewards/chosen": 1.6807692307692308, "rewards/margins": 9.688705738705739, "rewards/rejected": -8.007936507936508, "step": 718 }, { "epoch": 0.4929722317449434, "grad_norm": 0.4498063106879301, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -324835458.7234042, "logits/rejected": -317731473.38271606, "logps/chosen": -302.97872340425533, "logps/rejected": -403.358024691358, "loss": 0.1197, "rewards/chosen": 1.5252659574468086, "rewards/margins": 9.290698056212241, "rewards/rejected": -7.765432098765432, "step": 719 }, { "epoch": 0.49365786767226605, "grad_norm": 0.45369765914652715, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228847112.98245615, "logits/rejected": -252839734.08450705, "logps/chosen": -213.89473684210526, "logps/rejected": -348.84507042253523, "loss": 0.1618, "rewards/chosen": 1.069078947368421, "rewards/margins": 6.259219792438844, "rewards/rejected": -5.190140845070423, "step": 720 }, { "epoch": 0.4943435035995886, "grad_norm": 0.5607952381734975, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223021267.86206895, "logits/rejected": -277063738.51428574, "logps/chosen": -331.0344827586207, "logps/rejected": -323.65714285714284, "loss": 0.1659, "rewards/chosen": 0.9488146551724138, "rewards/margins": 8.377386083743843, "rewards/rejected": -7.428571428571429, "step": 721 }, { "epoch": 0.4950291395269112, "grad_norm": 0.5356875976970674, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304744356.29850745, "logits/rejected": -278886836.4590164, "logps/chosen": -297.07462686567163, "logps/rejected": -393.9672131147541, "loss": 0.1838, "rewards/chosen": 1.3264925373134329, "rewards/margins": 6.77731220944458, "rewards/rejected": -5.450819672131147, "step": 722 }, { "epoch": 0.4957147754542338, "grad_norm": 0.6220791249030264, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265596234.83076924, "logits/rejected": -348326960.7619048, "logps/chosen": -287.75384615384615, "logps/rejected": -375.87301587301585, "loss": 0.1835, "rewards/chosen": 1.4903846153846154, "rewards/margins": 7.791971916971917, "rewards/rejected": -6.301587301587301, "step": 723 }, { "epoch": 0.4964004113815564, "grad_norm": 0.6537202367395412, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -230373712.23880598, "logits/rejected": -289750771.40983605, "logps/chosen": -276.2985074626866, "logps/rejected": -364.8524590163934, "loss": 0.2148, "rewards/chosen": 1.0051305970149254, "rewards/margins": 7.234638793736236, "rewards/rejected": -6.229508196721311, "step": 724 }, { "epoch": 0.497086047308879, "grad_norm": 0.44571190739812466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223416593.06666666, "logits/rejected": -223778454.5882353, "logps/chosen": -270.8, "logps/rejected": -362.3529411764706, "loss": 0.1667, "rewards/chosen": 1.5208333333333333, "rewards/margins": 5.366421568627451, "rewards/rejected": -3.8455882352941178, "step": 725 }, { "epoch": 0.4977716832362016, "grad_norm": 0.49317508878938154, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259335949.01694915, "logits/rejected": -235610468.17391303, "logps/chosen": -263.864406779661, "logps/rejected": -315.82608695652175, "loss": 0.1606, "rewards/chosen": 1.4226694915254237, "rewards/margins": 8.234263694423975, "rewards/rejected": -6.811594202898551, "step": 726 }, { "epoch": 0.49845731916352415, "grad_norm": 0.5281781010276467, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -326885111.7419355, "logits/rejected": -297922684.1212121, "logps/chosen": -268.9032258064516, "logps/rejected": -358.54545454545456, "loss": 0.1823, "rewards/chosen": 1.2207188760080645, "rewards/margins": 8.402537057826246, "rewards/rejected": -7.181818181818182, "step": 727 }, { "epoch": 0.4991429550908468, "grad_norm": 0.6531864951098769, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283979053.1764706, "logits/rejected": -366022929.06666666, "logps/chosen": -288.2352941176471, "logps/rejected": -336.53333333333336, "loss": 0.1857, "rewards/chosen": 1.2058823529411764, "rewards/margins": 8.46421568627451, "rewards/rejected": -7.258333333333334, "step": 728 }, { "epoch": 0.49982859101816934, "grad_norm": 0.43622565637366884, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -175898624.0, "logits/rejected": -233308160.0, "logps/chosen": -229.0, "logps/rejected": -376.0, "loss": 0.1878, "rewards/chosen": 1.4296875, "rewards/margins": 9.0625, "rewards/rejected": -7.6328125, "step": 729 }, { "epoch": 0.500514226945492, "grad_norm": 0.7156552001113178, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -225705984.0, "logits/rejected": -247726080.0, "logps/chosen": -328.0, "logps/rejected": -296.0, "loss": 0.1643, "rewards/chosen": 1.5673828125, "rewards/margins": 9.3720703125, "rewards/rejected": -7.8046875, "step": 730 }, { "epoch": 0.5011998628728145, "grad_norm": 0.6896622231261965, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -212977436.44444445, "logits/rejected": -314872393.14285713, "logps/chosen": -264.6666666666667, "logps/rejected": -393.7142857142857, "loss": 0.2092, "rewards/chosen": 1.2152777777777777, "rewards/margins": 7.9875992063492065, "rewards/rejected": -6.772321428571429, "step": 731 }, { "epoch": 0.5018854988001371, "grad_norm": 0.6241588657440187, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248063122.2857143, "logits/rejected": -230919736.8888889, "logps/chosen": -259.7142857142857, "logps/rejected": -368.44444444444446, "loss": 0.159, "rewards/chosen": 0.8895089285714286, "rewards/margins": 7.3756200396825395, "rewards/rejected": -6.486111111111111, "step": 732 }, { "epoch": 0.5025711347274597, "grad_norm": 0.6107656424762778, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -295328346.35294116, "logits/rejected": -241312290.13333333, "logps/chosen": -318.3529411764706, "logps/rejected": -374.4, "loss": 0.1823, "rewards/chosen": 1.6571691176470589, "rewards/margins": 8.640502450980392, "rewards/rejected": -6.983333333333333, "step": 733 }, { "epoch": 0.5032567706547824, "grad_norm": 0.5087283242259366, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228116017.5483871, "logits/rejected": -248353636.84848484, "logps/chosen": -266.83870967741933, "logps/rejected": -414.06060606060606, "loss": 0.1699, "rewards/chosen": 1.2046370967741935, "rewards/margins": 8.363728005865102, "rewards/rejected": -7.159090909090909, "step": 734 }, { "epoch": 0.5039424065821049, "grad_norm": 0.5612729460263627, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -301260443.82608694, "logits/rejected": -352037176.40677965, "logps/chosen": -303.07246376811594, "logps/rejected": -388.06779661016947, "loss": 0.1875, "rewards/chosen": 1.2210144927536233, "rewards/margins": 9.017624662245149, "rewards/rejected": -7.796610169491525, "step": 735 }, { "epoch": 0.5046280425094275, "grad_norm": 0.5090219005374454, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -323363601.53424656, "logits/rejected": -311751177.3090909, "logps/chosen": -191.34246575342465, "logps/rejected": -385.1636363636364, "loss": 0.184, "rewards/chosen": 1.365582191780822, "rewards/margins": 8.901945828144457, "rewards/rejected": -7.536363636363636, "step": 736 }, { "epoch": 0.5053136784367501, "grad_norm": 0.5057272945928586, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218802858.66666666, "logits/rejected": -278322029.71428573, "logps/chosen": -262.44444444444446, "logps/rejected": -348.2857142857143, "loss": 0.1932, "rewards/chosen": 1.3871527777777777, "rewards/margins": 6.994295634920634, "rewards/rejected": -5.607142857142857, "step": 737 }, { "epoch": 0.5059993143640726, "grad_norm": 0.4909430557618629, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -224395264.0, "logits/rejected": -289406976.0, "logps/chosen": -246.57142857142858, "logps/rejected": -386.6666666666667, "loss": 0.1517, "rewards/chosen": 1.5680803571428572, "rewards/margins": 9.450024801587302, "rewards/rejected": -7.881944444444445, "step": 738 }, { "epoch": 0.5066849502913953, "grad_norm": 0.5258140695959852, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257250645.33333334, "logits/rejected": -241104829.93548387, "logps/chosen": -285.3333333333333, "logps/rejected": -393.80645161290323, "loss": 0.1831, "rewards/chosen": 1.2684659090909092, "rewards/margins": 8.050723973607038, "rewards/rejected": -6.782258064516129, "step": 739 }, { "epoch": 0.5073705862187179, "grad_norm": 0.5174381789039353, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278956760.9491525, "logits/rejected": -250442499.71014494, "logps/chosen": -239.79661016949152, "logps/rejected": -400.69565217391306, "loss": 0.1644, "rewards/chosen": 0.9661016949152542, "rewards/margins": 8.176246622451485, "rewards/rejected": -7.2101449275362315, "step": 740 }, { "epoch": 0.5080562221460404, "grad_norm": 0.5691294246239457, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -205587472.25396827, "logits/rejected": -305345331.2, "logps/chosen": -243.93650793650792, "logps/rejected": -322.46153846153845, "loss": 0.1726, "rewards/chosen": 1.5158730158730158, "rewards/margins": 8.623565323565323, "rewards/rejected": -7.107692307692307, "step": 741 }, { "epoch": 0.508741858073363, "grad_norm": 0.638358589846272, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -183737575.22580644, "logits/rejected": -215053405.0909091, "logps/chosen": -253.03225806451613, "logps/rejected": -341.3333333333333, "loss": 0.1451, "rewards/chosen": 1.7096774193548387, "rewards/margins": 8.914222873900293, "rewards/rejected": -7.204545454545454, "step": 742 }, { "epoch": 0.5094274940006857, "grad_norm": 0.5270037624863775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -356515840.0, "logits/rejected": -369331768.8888889, "logps/chosen": -286.85714285714283, "logps/rejected": -388.8888888888889, "loss": 0.1512, "rewards/chosen": 1.0669642857142858, "rewards/margins": 8.803075396825397, "rewards/rejected": -7.736111111111111, "step": 743 }, { "epoch": 0.5101131299280083, "grad_norm": 0.5615981917523659, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255852544.0, "logits/rejected": -291803721.14285713, "logps/chosen": -211.11111111111111, "logps/rejected": -356.0, "loss": 0.1682, "rewards/chosen": 1.578125, "rewards/margins": 7.515625, "rewards/rejected": -5.9375, "step": 744 }, { "epoch": 0.5107987658553308, "grad_norm": 0.5687623565868154, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287309824.0, "logits/rejected": -281018368.0, "logps/chosen": -248.0, "logps/rejected": -398.0, "loss": 0.1917, "rewards/chosen": 0.99853515625, "rewards/margins": 8.39697265625, "rewards/rejected": -7.3984375, "step": 745 }, { "epoch": 0.5114844017826534, "grad_norm": 0.5041635518217356, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237582440.1355932, "logits/rejected": -291777669.5652174, "logps/chosen": -254.64406779661016, "logps/rejected": -322.0869565217391, "loss": 0.1409, "rewards/chosen": 1.896186440677966, "rewards/margins": 9.533867600098256, "rewards/rejected": -7.63768115942029, "step": 746 }, { "epoch": 0.512170037709976, "grad_norm": 0.6441230779192002, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283901952.0, "logits/rejected": -277348352.0, "logps/chosen": -257.75, "logps/rejected": -308.5, "loss": 0.1818, "rewards/chosen": 1.125, "rewards/margins": 5.765625, "rewards/rejected": -4.640625, "step": 747 }, { "epoch": 0.5128556736372986, "grad_norm": 0.4755196213742783, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260172051.1044776, "logits/rejected": -237631387.27868852, "logps/chosen": -228.29850746268656, "logps/rejected": -350.95081967213116, "loss": 0.1642, "rewards/chosen": 1.7555970149253732, "rewards/margins": 9.427728162466357, "rewards/rejected": -7.672131147540983, "step": 748 }, { "epoch": 0.5135413095646212, "grad_norm": 0.48600170299039386, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -305955411.7818182, "logits/rejected": -235800323.50684932, "logps/chosen": -338.4727272727273, "logps/rejected": -389.26027397260276, "loss": 0.1521, "rewards/chosen": 0.825, "rewards/margins": 8.742808219178082, "rewards/rejected": -7.917808219178082, "step": 749 }, { "epoch": 0.5142269454919438, "grad_norm": 0.4768703278904184, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237183779.1372549, "logits/rejected": -252965555.53246754, "logps/chosen": -213.01960784313727, "logps/rejected": -384.0, "loss": 0.1513, "rewards/chosen": 1.0870098039215685, "rewards/margins": 9.015581232492998, "rewards/rejected": -7.928571428571429, "step": 750 }, { "epoch": 0.5149125814192663, "grad_norm": 0.44333797274189346, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261399849.29032257, "logits/rejected": -354863538.42424244, "logps/chosen": -313.5483870967742, "logps/rejected": -388.3636363636364, "loss": 0.1389, "rewards/chosen": 2.4193548387096775, "rewards/margins": 8.00268817204301, "rewards/rejected": -5.583333333333333, "step": 751 }, { "epoch": 0.515598217346589, "grad_norm": 0.5252395679737261, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -225221414.78787878, "logits/rejected": -232986822.19354838, "logps/chosen": -282.42424242424244, "logps/rejected": -421.16129032258067, "loss": 0.187, "rewards/chosen": 0.7580492424242424, "rewards/margins": 9.532242790811338, "rewards/rejected": -8.774193548387096, "step": 752 }, { "epoch": 0.5162838532739116, "grad_norm": 0.6582867034089855, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -312791657.20547944, "logits/rejected": -291160957.6727273, "logps/chosen": -312.54794520547944, "logps/rejected": -324.94545454545454, "loss": 0.1812, "rewards/chosen": 1.5565068493150684, "rewards/margins": 9.429234122042342, "rewards/rejected": -7.872727272727273, "step": 753 }, { "epoch": 0.5169694892012341, "grad_norm": 0.4798608226281609, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251347550.8148148, "logits/rejected": -258006375.7837838, "logps/chosen": -258.0740740740741, "logps/rejected": -354.5945945945946, "loss": 0.172, "rewards/chosen": 0.8049768518518519, "rewards/margins": 7.43335523023023, "rewards/rejected": -6.628378378378378, "step": 754 }, { "epoch": 0.5176551251285567, "grad_norm": 0.5943600532454335, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250406208.9552239, "logits/rejected": -298414415.73770493, "logps/chosen": -321.1940298507463, "logps/rejected": -364.59016393442624, "loss": 0.1767, "rewards/chosen": 1.3097014925373134, "rewards/margins": 8.465439197455346, "rewards/rejected": -7.155737704918033, "step": 755 }, { "epoch": 0.5183407610558793, "grad_norm": 0.5917890301450913, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280494080.0, "logits/rejected": -283639808.0, "logps/chosen": -209.5, "logps/rejected": -343.0, "loss": 0.1539, "rewards/chosen": 1.55078125, "rewards/margins": 10.83203125, "rewards/rejected": -9.28125, "step": 756 }, { "epoch": 0.519026396983202, "grad_norm": 0.4753841619135645, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255129388.13793105, "logits/rejected": -270592526.62857145, "logps/chosen": -210.75862068965517, "logps/rejected": -367.0857142857143, "loss": 0.182, "rewards/chosen": 0.8782327586206896, "rewards/margins": 7.7496613300492605, "rewards/rejected": -6.871428571428571, "step": 757 }, { "epoch": 0.5197120329105245, "grad_norm": 0.49684207551282467, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260168422.0289855, "logits/rejected": -294596538.5762712, "logps/chosen": -307.4782608695652, "logps/rejected": -334.64406779661016, "loss": 0.1599, "rewards/chosen": 2.0670289855072466, "rewards/margins": 10.261944239744535, "rewards/rejected": -8.194915254237289, "step": 758 }, { "epoch": 0.5203976688378471, "grad_norm": 0.565226909303693, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -210778544.67605633, "logits/rejected": -257544982.45614034, "logps/chosen": -244.73239436619718, "logps/rejected": -389.05263157894734, "loss": 0.197, "rewards/chosen": 1.4190140845070423, "rewards/margins": 9.59445268099827, "rewards/rejected": -8.175438596491228, "step": 759 }, { "epoch": 0.5210833047651697, "grad_norm": 0.48737187182976505, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292762419.2, "logits/rejected": -289840869.51724136, "logps/chosen": -208.0, "logps/rejected": -360.55172413793105, "loss": 0.1823, "rewards/chosen": 1.3973214285714286, "rewards/margins": 8.910252463054187, "rewards/rejected": -7.512931034482759, "step": 760 }, { "epoch": 0.5217689406924922, "grad_norm": 0.5561323908064862, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -247217212.2352941, "logits/rejected": -251238809.6, "logps/chosen": -254.35294117647058, "logps/rejected": -346.6666666666667, "loss": 0.1799, "rewards/chosen": 1.2509191176470589, "rewards/margins": 8.034252450980393, "rewards/rejected": -6.783333333333333, "step": 761 }, { "epoch": 0.5224545766198149, "grad_norm": 0.44932468944274956, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257336682.33846155, "logits/rejected": -296663787.6825397, "logps/chosen": -256.9846153846154, "logps/rejected": -415.4920634920635, "loss": 0.1733, "rewards/chosen": 1.5144230769230769, "rewards/margins": 9.442994505494505, "rewards/rejected": -7.928571428571429, "step": 762 }, { "epoch": 0.5231402125471375, "grad_norm": 0.5819618820370793, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -242310933.94285715, "logits/rejected": -275956276.9655172, "logps/chosen": -191.0857142857143, "logps/rejected": -337.6551724137931, "loss": 0.1804, "rewards/chosen": 1.1526785714285714, "rewards/margins": 8.376816502463054, "rewards/rejected": -7.224137931034483, "step": 763 }, { "epoch": 0.52382584847446, "grad_norm": 0.4567658693618427, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274445205.01492536, "logits/rejected": -334719211.0163934, "logps/chosen": -200.11940298507463, "logps/rejected": -375.08196721311475, "loss": 0.1607, "rewards/chosen": 1.7145522388059702, "rewards/margins": 9.59160141913384, "rewards/rejected": -7.877049180327869, "step": 764 }, { "epoch": 0.5245114844017826, "grad_norm": 0.5358821445356877, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250480891.50877193, "logits/rejected": -323729379.1549296, "logps/chosen": -265.82456140350877, "logps/rejected": -293.40845070422534, "loss": 0.1744, "rewards/chosen": 1.0021929824561404, "rewards/margins": 7.910643686681492, "rewards/rejected": -6.908450704225352, "step": 765 }, { "epoch": 0.5251971203291053, "grad_norm": 0.522783462923949, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300283730.44067794, "logits/rejected": -263329346.7826087, "logps/chosen": -199.59322033898306, "logps/rejected": -328.81159420289856, "loss": 0.1591, "rewards/chosen": 1.0858050847457628, "rewards/margins": 8.585805084745763, "rewards/rejected": -7.5, "step": 766 }, { "epoch": 0.5258827562564279, "grad_norm": 0.5208303275419396, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261077027.92982456, "logits/rejected": -237244012.1690141, "logps/chosen": -261.89473684210526, "logps/rejected": -376.7887323943662, "loss": 0.1436, "rewards/chosen": 1.7324561403508771, "rewards/margins": 9.147949098097357, "rewards/rejected": -7.415492957746479, "step": 767 }, { "epoch": 0.5265683921837504, "grad_norm": 0.5044405299580116, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -331527240.1126761, "logits/rejected": -305816270.5964912, "logps/chosen": -287.77464788732397, "logps/rejected": -379.2280701754386, "loss": 0.1764, "rewards/chosen": 1.4815140845070423, "rewards/margins": 27484125.551689524, "rewards/rejected": -27484124.07017544, "step": 768 }, { "epoch": 0.527254028111073, "grad_norm": 0.5257306459657117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221404243.93442622, "logits/rejected": -304493950.0895522, "logps/chosen": -258.3606557377049, "logps/rejected": -352.0, "loss": 0.1858, "rewards/chosen": 0.8688524590163934, "rewards/margins": 6.839001712747737, "rewards/rejected": -5.970149253731344, "step": 769 }, { "epoch": 0.5279396640383957, "grad_norm": 0.4730604294615649, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220168696.12307692, "logits/rejected": -252190850.03174603, "logps/chosen": -273.2307692307692, "logps/rejected": -337.015873015873, "loss": 0.1771, "rewards/chosen": 1.4432692307692307, "rewards/margins": 8.498824786324786, "rewards/rejected": -7.055555555555555, "step": 770 }, { "epoch": 0.5286252999657182, "grad_norm": 0.46897424457031844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -311543580.4444444, "logits/rejected": -334945133.71428573, "logps/chosen": -213.55555555555554, "logps/rejected": -314.0, "loss": 0.1878, "rewards/chosen": 1.2552083333333333, "rewards/margins": 9.219494047619047, "rewards/rejected": -7.964285714285714, "step": 771 }, { "epoch": 0.5293109358930408, "grad_norm": 0.4695188910239558, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221717362.2153846, "logits/rejected": -252190850.03174603, "logps/chosen": -304.73846153846154, "logps/rejected": -318.22222222222223, "loss": 0.1559, "rewards/chosen": 1.2788461538461537, "rewards/margins": 9.60424297924298, "rewards/rejected": -8.325396825396826, "step": 772 }, { "epoch": 0.5299965718203634, "grad_norm": 0.48091794372137575, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299924999.8769231, "logits/rejected": -276424606.47619045, "logps/chosen": -301.53846153846155, "logps/rejected": -343.36507936507934, "loss": 0.1832, "rewards/chosen": 1.2774038461538462, "rewards/margins": 7.769467338217338, "rewards/rejected": -6.492063492063492, "step": 773 }, { "epoch": 0.5306822077476859, "grad_norm": 0.5897211146767871, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -213310317.7142857, "logits/rejected": -239219959.1724138, "logps/chosen": -243.88571428571427, "logps/rejected": -361.6551724137931, "loss": 0.1866, "rewards/chosen": 1.3160714285714286, "rewards/margins": 6.540209359605912, "rewards/rejected": -5.224137931034483, "step": 774 }, { "epoch": 0.5313678436750086, "grad_norm": 0.46377187011578236, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246477040.94117647, "logits/rejected": -267037354.66666666, "logps/chosen": -240.23529411764707, "logps/rejected": -360.53333333333336, "loss": 0.1648, "rewards/chosen": 1.8051470588235294, "rewards/margins": 5.146813725490196, "rewards/rejected": -3.341666666666667, "step": 775 }, { "epoch": 0.5320534796023312, "grad_norm": 0.5179746957300094, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261203897.37931034, "logits/rejected": -280898530.74285716, "logps/chosen": -238.06896551724137, "logps/rejected": -365.9428571428571, "loss": 0.1716, "rewards/chosen": 1.0447198275862069, "rewards/margins": 8.973291256157635, "rewards/rejected": -7.928571428571429, "step": 776 }, { "epoch": 0.5327391155296538, "grad_norm": 0.5011893987339849, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264047568.73846152, "logits/rejected": -309979038.47619045, "logps/chosen": -231.3846153846154, "logps/rejected": -402.7936507936508, "loss": 0.1593, "rewards/chosen": 0.7629807692307692, "rewards/margins": 7.921710927960929, "rewards/rejected": -7.158730158730159, "step": 777 }, { "epoch": 0.5334247514569763, "grad_norm": 0.5015589985487767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299702085.8181818, "logits/rejected": -321743706.83870965, "logps/chosen": -232.4848484848485, "logps/rejected": -287.48387096774195, "loss": 0.1786, "rewards/chosen": 0.946969696969697, "rewards/margins": 8.035679374389051, "rewards/rejected": -7.088709677419355, "step": 778 }, { "epoch": 0.534110387384299, "grad_norm": 0.6296372569218618, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249713055.53623188, "logits/rejected": -220947403.93220338, "logps/chosen": -225.3913043478261, "logps/rejected": -341.4237288135593, "loss": 0.1819, "rewards/chosen": 1.2735507246376812, "rewards/margins": 8.315923605993614, "rewards/rejected": -7.0423728813559325, "step": 779 }, { "epoch": 0.5347960233116216, "grad_norm": 0.5744849065357989, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -385875968.0, "logits/rejected": -343023319.90361446, "logps/chosen": -289.6, "logps/rejected": -402.50602409638554, "loss": 0.1317, "rewards/chosen": 1.3215277777777779, "rewards/margins": 8.839600066934405, "rewards/rejected": -7.518072289156627, "step": 780 }, { "epoch": 0.5354816592389441, "grad_norm": 0.5956915155151383, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257329411.6056338, "logits/rejected": -234145181.19298247, "logps/chosen": -280.7887323943662, "logps/rejected": -426.10526315789474, "loss": 0.189, "rewards/chosen": 1.528169014084507, "rewards/margins": 9.436063750926612, "rewards/rejected": -7.907894736842105, "step": 781 }, { "epoch": 0.5361672951662667, "grad_norm": 0.8052966077710151, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294109680.4848485, "logits/rejected": -329049913.8064516, "logps/chosen": -262.3030303030303, "logps/rejected": -389.6774193548387, "loss": 0.1794, "rewards/chosen": 1.5047348484848484, "rewards/margins": 9.19021871945259, "rewards/rejected": -7.685483870967742, "step": 782 }, { "epoch": 0.5368529310935893, "grad_norm": 0.5811653039613405, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -205287879.1111111, "logits/rejected": -197581677.7142857, "logps/chosen": -267.3333333333333, "logps/rejected": -377.14285714285717, "loss": 0.2015, "rewards/chosen": 1.5668402777777777, "rewards/margins": 6.7454117063492065, "rewards/rejected": -5.178571428571429, "step": 783 }, { "epoch": 0.5375385670209119, "grad_norm": 0.5086335090971612, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -216922313.91549295, "logits/rejected": -205447311.71929824, "logps/chosen": -260.50704225352115, "logps/rejected": -383.1578947368421, "loss": 0.1819, "rewards/chosen": 1.7491197183098592, "rewards/margins": 7.626312700765999, "rewards/rejected": -5.87719298245614, "step": 784 }, { "epoch": 0.5382242029482345, "grad_norm": 0.577698328153664, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -355559244.35087717, "logits/rejected": -250004148.28169015, "logps/chosen": -271.719298245614, "logps/rejected": -454.3098591549296, "loss": 0.1733, "rewards/chosen": 1.1929824561403508, "rewards/margins": 9.038052878675561, "rewards/rejected": -7.845070422535211, "step": 785 }, { "epoch": 0.5389098388755571, "grad_norm": 0.5809010868266312, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -303205628.28985506, "logits/rejected": -348340501.69491524, "logps/chosen": -312.81159420289856, "logps/rejected": -343.3220338983051, "loss": 0.1866, "rewards/chosen": 1.4719202898550725, "rewards/margins": 9.599038933922868, "rewards/rejected": -8.127118644067796, "step": 786 }, { "epoch": 0.5395954748028796, "grad_norm": 0.5469027339961821, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240716577.39130434, "logits/rejected": -244833609.76271185, "logps/chosen": -222.14492753623188, "logps/rejected": -375.3220338983051, "loss": 0.2056, "rewards/chosen": 1.0625, "rewards/margins": 7.045550847457627, "rewards/rejected": -5.983050847457627, "step": 787 }, { "epoch": 0.5402811107302022, "grad_norm": 0.5540444808284434, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227730079.47540984, "logits/rejected": -244646866.14925373, "logps/chosen": -204.59016393442624, "logps/rejected": -368.23880597014926, "loss": 0.1691, "rewards/chosen": 1.0973360655737705, "rewards/margins": 8.373455468558845, "rewards/rejected": -7.276119402985074, "step": 788 }, { "epoch": 0.5409667466575249, "grad_norm": 0.4363665887258003, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261910983.1111111, "logits/rejected": -277504221.4054054, "logps/chosen": -284.44444444444446, "logps/rejected": -391.7837837837838, "loss": 0.1537, "rewards/chosen": 1.509837962962963, "rewards/margins": 8.739567692692694, "rewards/rejected": -7.22972972972973, "step": 789 }, { "epoch": 0.5416523825848475, "grad_norm": 0.5151682933279702, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304475401.4814815, "logits/rejected": -323301486.7027027, "logps/chosen": -296.0, "logps/rejected": -363.2432432432432, "loss": 0.1596, "rewards/chosen": 1.5648148148148149, "rewards/margins": 9.476976976976978, "rewards/rejected": -7.912162162162162, "step": 790 }, { "epoch": 0.54233801851217, "grad_norm": 0.58974574598046, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -282162269.09090906, "logits/rejected": -259505647.48387095, "logps/chosen": -213.57575757575756, "logps/rejected": -382.96774193548384, "loss": 0.1707, "rewards/chosen": 1.365530303030303, "rewards/margins": 9.244562561094819, "rewards/rejected": -7.879032258064516, "step": 791 }, { "epoch": 0.5430236544394926, "grad_norm": 0.5373117844209637, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -199390759.3846154, "logits/rejected": -229362202.9473684, "logps/chosen": -220.30769230769232, "logps/rejected": -347.36842105263156, "loss": 0.1638, "rewards/chosen": 1.2403846153846154, "rewards/margins": 8.543016194331983, "rewards/rejected": -7.302631578947368, "step": 792 }, { "epoch": 0.5437092903668153, "grad_norm": 0.4730712149283638, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278336763.8032787, "logits/rejected": -176160768.0, "logps/chosen": -207.47540983606558, "logps/rejected": -276.05970149253733, "loss": 0.1536, "rewards/chosen": 1.2305327868852458, "rewards/margins": 7.939488010765843, "rewards/rejected": -6.708955223880597, "step": 793 }, { "epoch": 0.5443949262941378, "grad_norm": 0.47630476432427665, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268647825.8227848, "logits/rejected": -290006162.28571427, "logps/chosen": -296.50632911392404, "logps/rejected": -305.6326530612245, "loss": 0.2094, "rewards/chosen": 1.303006329113924, "rewards/margins": 9.180557349522088, "rewards/rejected": -7.877551020408164, "step": 794 }, { "epoch": 0.5450805622214604, "grad_norm": 0.5010034452136448, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253632030.11764705, "logits/rejected": -245226973.86666667, "logps/chosen": -210.35294117647058, "logps/rejected": -381.3333333333333, "loss": 0.1783, "rewards/chosen": 0.8547794117647058, "rewards/margins": 8.804779411764706, "rewards/rejected": -7.95, "step": 795 }, { "epoch": 0.545766198148783, "grad_norm": 0.4813276711388075, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283910991.44827586, "logits/rejected": -337222041.6, "logps/chosen": -252.41379310344828, "logps/rejected": -294.4, "loss": 0.1617, "rewards/chosen": 1.800646551724138, "rewards/margins": 6.886360837438423, "rewards/rejected": -5.085714285714285, "step": 796 }, { "epoch": 0.5464518340761055, "grad_norm": 0.5631400689083942, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243793920.0, "logits/rejected": -274726912.0, "logps/chosen": -237.75, "logps/rejected": -357.0, "loss": 0.1624, "rewards/chosen": 1.5478515625, "rewards/margins": 9.1962890625, "rewards/rejected": -7.6484375, "step": 797 }, { "epoch": 0.5471374700034282, "grad_norm": 0.554797931999966, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -318767104.0, "logits/rejected": -340253713.9649123, "logps/chosen": -232.67605633802816, "logps/rejected": -385.12280701754383, "loss": 0.2009, "rewards/chosen": 0.8142605633802817, "rewards/margins": 8.323032493204844, "rewards/rejected": -7.508771929824562, "step": 798 }, { "epoch": 0.5478231059307508, "grad_norm": 0.6076782131055856, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286918829.55932206, "logits/rejected": -240230281.2753623, "logps/chosen": -223.45762711864407, "logps/rejected": -386.3188405797101, "loss": 0.1677, "rewards/chosen": 1.2478813559322033, "rewards/margins": 8.617446573323509, "rewards/rejected": -7.369565217391305, "step": 799 }, { "epoch": 0.5485087418580734, "grad_norm": 0.5192482504941652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -301002992.9411765, "logits/rejected": -285492292.26666665, "logps/chosen": -231.05882352941177, "logps/rejected": -348.26666666666665, "loss": 0.1614, "rewards/chosen": 1.7463235294117647, "rewards/margins": 12216109.479656862, "rewards/rejected": -12216107.733333332, "step": 800 }, { "epoch": 0.5491943777853959, "grad_norm": 0.48952524069146813, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286432833.1636364, "logits/rejected": -244763493.69863012, "logps/chosen": -257.74545454545455, "logps/rejected": -327.67123287671234, "loss": 0.1581, "rewards/chosen": 1.4710227272727272, "rewards/margins": 9.08746108343711, "rewards/rejected": -7.616438356164384, "step": 801 }, { "epoch": 0.5498800137127186, "grad_norm": 0.5287014229683193, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227610897.06666666, "logits/rejected": -286939738.35294116, "logps/chosen": -229.86666666666667, "logps/rejected": -356.2352941176471, "loss": 0.1598, "rewards/chosen": 1.453125, "rewards/margins": 8.886948529411764, "rewards/rejected": -7.4338235294117645, "step": 802 }, { "epoch": 0.5505656496400412, "grad_norm": 0.8527813003015253, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254021228.1690141, "logits/rejected": -217220796.63157895, "logps/chosen": -317.7464788732394, "logps/rejected": -349.1929824561403, "loss": 0.2341, "rewards/chosen": 0.8974471830985915, "rewards/margins": 8.406219112923154, "rewards/rejected": -7.508771929824562, "step": 803 }, { "epoch": 0.5512512855673637, "grad_norm": 0.5771237074059088, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260672863.52238807, "logits/rejected": -316841849.704918, "logps/chosen": -270.089552238806, "logps/rejected": -388.72131147540983, "loss": 0.1708, "rewards/chosen": 1.2985074626865671, "rewards/margins": 9.585392708588207, "rewards/rejected": -8.28688524590164, "step": 804 }, { "epoch": 0.5519369214946863, "grad_norm": 0.604096615170768, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254021228.1690141, "logits/rejected": -214277425.40350878, "logps/chosen": -307.83098591549293, "logps/rejected": -356.7719298245614, "loss": 0.1701, "rewards/chosen": 1.4964788732394365, "rewards/margins": 9.303496417099085, "rewards/rejected": -7.807017543859649, "step": 805 }, { "epoch": 0.5526225574220089, "grad_norm": 0.5743538029895071, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -326689678.2222222, "logits/rejected": -322362221.71428573, "logps/chosen": -286.6666666666667, "logps/rejected": -384.85714285714283, "loss": 0.1944, "rewards/chosen": 1.5026041666666667, "rewards/margins": 9.556175595238095, "rewards/rejected": -8.053571428571429, "step": 806 }, { "epoch": 0.5533081933493315, "grad_norm": 0.48325445271655987, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269545712.9411765, "logits/rejected": -285492292.26666665, "logps/chosen": -235.2941176470588, "logps/rejected": -331.73333333333335, "loss": 0.1804, "rewards/chosen": 1.380514705882353, "rewards/margins": 9.67218137254902, "rewards/rejected": -8.291666666666666, "step": 807 }, { "epoch": 0.5539938292766541, "grad_norm": 0.4919981033854038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -199951410.36065573, "logits/rejected": -209589996.8955224, "logps/chosen": -282.4918032786885, "logps/rejected": -308.05970149253733, "loss": 0.1469, "rewards/chosen": 1.6214139344262295, "rewards/margins": 8.822906471739662, "rewards/rejected": -7.201492537313433, "step": 808 }, { "epoch": 0.5546794652039767, "grad_norm": 0.5413131925836557, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271532788.1846154, "logits/rejected": -328087779.5555556, "logps/chosen": -344.3692307692308, "logps/rejected": -353.015873015873, "loss": 0.1865, "rewards/chosen": 1.466826923076923, "rewards/margins": 8.347779304029304, "rewards/rejected": -6.880952380952381, "step": 809 }, { "epoch": 0.5553651011312993, "grad_norm": 0.5990800451619118, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -307911258.35294116, "logits/rejected": -315132040.53333336, "logps/chosen": -309.1764705882353, "logps/rejected": -409.6, "loss": 0.2006, "rewards/chosen": 1.2040441176470589, "rewards/margins": 8.062377450980392, "rewards/rejected": -6.858333333333333, "step": 810 }, { "epoch": 0.5560507370586218, "grad_norm": 0.6345971692924078, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254687459.55555555, "logits/rejected": -230986313.14285713, "logps/chosen": -267.3333333333333, "logps/rejected": -333.7142857142857, "loss": 0.2005, "rewards/chosen": 1.1710069444444444, "rewards/margins": 6.7245783730158735, "rewards/rejected": -5.553571428571429, "step": 811 }, { "epoch": 0.5567363729859445, "grad_norm": 0.5322925119160019, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278886836.4590164, "logits/rejected": -235632242.62686568, "logps/chosen": -253.9016393442623, "logps/rejected": -389.0149253731343, "loss": 0.1534, "rewards/chosen": 1.7151639344262295, "rewards/margins": 9.78979080009787, "rewards/rejected": -8.074626865671641, "step": 812 }, { "epoch": 0.5574220089132671, "grad_norm": 0.5316132370583484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271653004.27397263, "logits/rejected": -257759045.8181818, "logps/chosen": -224.21917808219177, "logps/rejected": -348.5090909090909, "loss": 0.1833, "rewards/chosen": 1.529109589041096, "rewards/margins": 6.592745952677459, "rewards/rejected": -5.0636363636363635, "step": 813 }, { "epoch": 0.5581076448405896, "grad_norm": 0.6134926677023489, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -296103145.54385966, "logits/rejected": -248822654.1971831, "logps/chosen": -300.0701754385965, "logps/rejected": -387.6056338028169, "loss": 0.1547, "rewards/chosen": 2.1447368421052633, "rewards/margins": 6.116567828020756, "rewards/rejected": -3.971830985915493, "step": 814 }, { "epoch": 0.5587932807679122, "grad_norm": 0.6416510028434421, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269350576.8727273, "logits/rejected": -334165370.739726, "logps/chosen": -419.4909090909091, "logps/rejected": -353.3150684931507, "loss": 0.1574, "rewards/chosen": 1.5852272727272727, "rewards/margins": -3357671.1818960146, "rewards/rejected": 3357672.7671232875, "step": 815 }, { "epoch": 0.5594789166952349, "grad_norm": 0.43877609023864317, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253364397.55932203, "logits/rejected": -252630832.23188406, "logps/chosen": -325.96610169491527, "logps/rejected": -344.57971014492756, "loss": 0.1329, "rewards/chosen": 1.9173728813559323, "rewards/margins": 9.330416359616802, "rewards/rejected": -7.413043478260869, "step": 816 }, { "epoch": 0.5601645526225574, "grad_norm": 0.5424002625100303, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297414283.6363636, "logits/rejected": -266541254.19354838, "logps/chosen": -236.6060606060606, "logps/rejected": -336.51612903225805, "loss": 0.1582, "rewards/chosen": 1.6003787878787878, "rewards/margins": 8.334249755620723, "rewards/rejected": -6.733870967741935, "step": 817 }, { "epoch": 0.56085018854988, "grad_norm": 0.47656289583614725, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -189756098.20689654, "logits/rejected": -235120698.5142857, "logps/chosen": -272.55172413793105, "logps/rejected": -325.9428571428571, "loss": 0.1519, "rewards/chosen": 1.644396551724138, "rewards/margins": 9.73725369458128, "rewards/rejected": -8.092857142857143, "step": 818 }, { "epoch": 0.5615358244772026, "grad_norm": 0.714782874534997, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -354838118.4, "logits/rejected": -264981323.29411766, "logps/chosen": -215.2, "logps/rejected": -379.7647058823529, "loss": 0.1931, "rewards/chosen": 0.953125, "rewards/margins": 7.556066176470588, "rewards/rejected": -6.602941176470588, "step": 819 }, { "epoch": 0.5622214604045251, "grad_norm": 0.5507049463965186, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -303716954.35294116, "logits/rejected": -308281344.0, "logps/chosen": -271.52941176470586, "logps/rejected": -343.2, "loss": 0.1857, "rewards/chosen": 1.2123161764705883, "rewards/margins": 7.295649509803921, "rewards/rejected": -6.083333333333333, "step": 820 }, { "epoch": 0.5629070963318478, "grad_norm": 0.5423564155671337, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246211091.94805196, "logits/rejected": -192444536.47058824, "logps/chosen": -233.97402597402598, "logps/rejected": -349.96078431372547, "loss": 0.1854, "rewards/chosen": 1.7483766233766234, "rewards/margins": 9.10131779984721, "rewards/rejected": -7.352941176470588, "step": 821 }, { "epoch": 0.5635927322591704, "grad_norm": 0.3619379297005749, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -335244726.85714287, "logits/rejected": -339738624.0, "logps/chosen": -395.7142857142857, "logps/rejected": -363.1111111111111, "loss": 0.1113, "rewards/chosen": 2.497767857142857, "rewards/margins": 10.518601190476192, "rewards/rejected": -8.020833333333334, "step": 822 }, { "epoch": 0.564278368186493, "grad_norm": 0.6741265633737633, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -222237324.98550725, "logits/rejected": -233032686.6440678, "logps/chosen": -213.1014492753623, "logps/rejected": -379.1186440677966, "loss": 0.2162, "rewards/chosen": 0.6639492753623188, "rewards/margins": 6.9054746990911315, "rewards/rejected": -6.241525423728813, "step": 823 }, { "epoch": 0.5649640041138155, "grad_norm": 0.6020690549986396, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278201298.1492537, "logits/rejected": -305015287.60655737, "logps/chosen": -238.32835820895522, "logps/rejected": -404.4590163934426, "loss": 0.174, "rewards/chosen": 1.4580223880597014, "rewards/margins": 9.417038781502324, "rewards/rejected": -7.959016393442623, "step": 824 }, { "epoch": 0.5656496400411382, "grad_norm": 0.5688946031462113, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252483348.98360655, "logits/rejected": -287466327.880597, "logps/chosen": -252.327868852459, "logps/rejected": -361.07462686567163, "loss": 0.1645, "rewards/chosen": 1.2346311475409837, "rewards/margins": 8.898810252018595, "rewards/rejected": -7.664179104477612, "step": 825 }, { "epoch": 0.5663352759684608, "grad_norm": 0.6790609575294936, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -319553536.0, "logits/rejected": -268173312.0, "logps/chosen": -259.0, "logps/rejected": -440.0, "loss": 0.1638, "rewards/chosen": 1.32421875, "rewards/margins": 7.19140625, "rewards/rejected": -5.8671875, "step": 826 }, { "epoch": 0.5670209118957833, "grad_norm": 0.4318346663062415, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -307668330.3384615, "logits/rejected": -330750829.71428573, "logps/chosen": -233.6, "logps/rejected": -399.74603174603175, "loss": 0.153, "rewards/chosen": 1.65, "rewards/margins": 8.554761904761905, "rewards/rejected": -6.904761904761905, "step": 827 }, { "epoch": 0.5677065478231059, "grad_norm": 0.47554144616991606, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -344641824.4507042, "logits/rejected": -246360171.78947368, "logps/chosen": -217.2394366197183, "logps/rejected": -405.89473684210526, "loss": 0.1716, "rewards/chosen": 1.3028169014084507, "rewards/margins": 8.899308129478626, "rewards/rejected": -7.5964912280701755, "step": 828 }, { "epoch": 0.5683921837504285, "grad_norm": 0.47780717541994083, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276824064.0, "logits/rejected": -263309084.44444445, "logps/chosen": -289.42857142857144, "logps/rejected": -405.77777777777777, "loss": 0.1353, "rewards/chosen": 1.9174107142857142, "rewards/margins": 8.347966269841269, "rewards/rejected": -6.430555555555555, "step": 829 }, { "epoch": 0.5690778196777511, "grad_norm": 0.5139586609007648, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252989765.07936507, "logits/rejected": -251400128.9846154, "logps/chosen": -242.53968253968253, "logps/rejected": -344.8615384615385, "loss": 0.1677, "rewards/chosen": 1.3482142857142858, "rewards/margins": 8.77129120879121, "rewards/rejected": -7.423076923076923, "step": 830 }, { "epoch": 0.5697634556050737, "grad_norm": 0.4659569218888288, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288513107.93442625, "logits/rejected": -255414333.13432837, "logps/chosen": -211.672131147541, "logps/rejected": -345.7910447761194, "loss": 0.1614, "rewards/chosen": 1.0860655737704918, "rewards/margins": 8.668155126009298, "rewards/rejected": -7.582089552238806, "step": 831 }, { "epoch": 0.5704490915323963, "grad_norm": 0.4788173717980313, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290647544.7887324, "logits/rejected": -234586686.87719297, "logps/chosen": -221.07042253521126, "logps/rejected": -285.4736842105263, "loss": 0.1866, "rewards/chosen": 1.3257042253521127, "rewards/margins": 4.878335804299481, "rewards/rejected": -3.5526315789473686, "step": 832 }, { "epoch": 0.5711347274597189, "grad_norm": 0.6076189589705031, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -319283326.0307692, "logits/rejected": -339272590.2222222, "logps/chosen": -230.15384615384616, "logps/rejected": -359.1111111111111, "loss": 0.1868, "rewards/chosen": 1.1045673076923077, "rewards/margins": 8.922027625152625, "rewards/rejected": -7.817460317460317, "step": 833 }, { "epoch": 0.5718203633870415, "grad_norm": 0.6162740580609478, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262175300.7761194, "logits/rejected": -330043593.44262296, "logps/chosen": -316.4179104477612, "logps/rejected": -384.5245901639344, "loss": 0.2023, "rewards/chosen": 1.2490671641791045, "rewards/margins": 7.986772082211891, "rewards/rejected": -6.737704918032787, "step": 834 }, { "epoch": 0.5725059993143641, "grad_norm": 0.5790339393911969, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297414283.6363636, "logits/rejected": -235963425.03225806, "logps/chosen": -190.78787878787878, "logps/rejected": -299.8709677419355, "loss": 0.1906, "rewards/chosen": 1.1103219696969697, "rewards/margins": 8.892580034213099, "rewards/rejected": -7.782258064516129, "step": 835 }, { "epoch": 0.5731916352416867, "grad_norm": 0.5356773304863297, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262482250.32258064, "logits/rejected": -205647996.12121212, "logps/chosen": -251.09677419354838, "logps/rejected": -444.6060606060606, "loss": 0.1771, "rewards/chosen": 1.0040322580645162, "rewards/margins": 8.541911045943305, "rewards/rejected": -7.537878787878788, "step": 836 }, { "epoch": 0.5738772711690092, "grad_norm": 0.5653078113294845, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -293880900.26666665, "logits/rejected": -314819523.7647059, "logps/chosen": -278.4, "logps/rejected": -365.6470588235294, "loss": 0.1543, "rewards/chosen": 1.0875, "rewards/margins": 9.2125, "rewards/rejected": -8.125, "step": 837 }, { "epoch": 0.5745629070963318, "grad_norm": 0.5624172017532441, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260205123.6226415, "logits/rejected": -279396570.4533333, "logps/chosen": -315.47169811320754, "logps/rejected": -348.16, "loss": 0.1375, "rewards/chosen": 1.5188679245283019, "rewards/margins": 9.205534591194969, "rewards/rejected": -7.6866666666666665, "step": 838 }, { "epoch": 0.5752485430236545, "grad_norm": 0.5138144699619918, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273855787.3230769, "logits/rejected": -323560594.28571427, "logps/chosen": -317.7846153846154, "logps/rejected": -389.3333333333333, "loss": 0.1549, "rewards/chosen": 1.5865384615384615, "rewards/margins": 9.991300366300367, "rewards/rejected": -8.404761904761905, "step": 839 }, { "epoch": 0.575934178950977, "grad_norm": 0.671022946461998, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -229034418.42424244, "logits/rejected": -197808788.6451613, "logps/chosen": -219.63636363636363, "logps/rejected": -384.51612903225805, "loss": 0.1647, "rewards/chosen": 1.415719696969697, "rewards/margins": 9.891526148582601, "rewards/rejected": -8.475806451612904, "step": 840 }, { "epoch": 0.5766198148782996, "grad_norm": 0.5847234393364223, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -316263041.9104478, "logits/rejected": -279986981.7704918, "logps/chosen": -311.1641791044776, "logps/rejected": -348.327868852459, "loss": 0.1881, "rewards/chosen": 1.017723880597015, "rewards/margins": 8.88657633961341, "rewards/rejected": -7.868852459016393, "step": 841 }, { "epoch": 0.5773054508056222, "grad_norm": 0.5266267863124968, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260180000.5079365, "logits/rejected": -256820460.30769232, "logps/chosen": -351.74603174603175, "logps/rejected": -333.7846153846154, "loss": 0.1664, "rewards/chosen": 1.3601190476190477, "rewards/margins": 9.537042124542124, "rewards/rejected": -8.176923076923076, "step": 842 }, { "epoch": 0.5779910867329449, "grad_norm": 0.614917455289551, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260046848.0, "logits/rejected": -274027861.3333333, "logps/chosen": -255.48387096774192, "logps/rejected": -365.57575757575756, "loss": 0.1602, "rewards/chosen": 1.5080645161290323, "rewards/margins": 9.098973607038124, "rewards/rejected": -7.590909090909091, "step": 843 }, { "epoch": 0.5786767226602674, "grad_norm": 0.4902114184312914, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -214109232.76190478, "logits/rejected": -195390038.64615384, "logps/chosen": -208.0, "logps/rejected": -304.0, "loss": 0.1807, "rewards/chosen": 0.7609126984126984, "rewards/margins": 7.814758852258852, "rewards/rejected": -7.053846153846154, "step": 844 }, { "epoch": 0.57936235858759, "grad_norm": 0.7096993977445784, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -295324966.5753425, "logits/rejected": -219781529.6, "logps/chosen": -240.21917808219177, "logps/rejected": -341.8181818181818, "loss": 0.2119, "rewards/chosen": 1.1549657534246576, "rewards/margins": 9.764056662515568, "rewards/rejected": -8.60909090909091, "step": 845 }, { "epoch": 0.5800479945149126, "grad_norm": 0.5912257783846198, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265439524.57142857, "logits/rejected": -287542840.8888889, "logps/chosen": -283.7142857142857, "logps/rejected": -368.8888888888889, "loss": 0.1615, "rewards/chosen": 1.2092633928571428, "rewards/margins": 9.396763392857142, "rewards/rejected": -8.1875, "step": 846 }, { "epoch": 0.5807336304422351, "grad_norm": 1.0166832642376697, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249335240.86153847, "logits/rejected": -227957093.58730158, "logps/chosen": -220.55384615384617, "logps/rejected": -330.1587301587302, "loss": 0.1887, "rewards/chosen": 1.228846153846154, "rewards/margins": 6.2129731379731385, "rewards/rejected": -4.984126984126984, "step": 847 }, { "epoch": 0.5814192663695578, "grad_norm": 0.5068951273264348, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -210239488.0, "logits/rejected": -241172480.0, "logps/chosen": -294.25, "logps/rejected": -334.5, "loss": 0.1541, "rewards/chosen": 1.443359375, "rewards/margins": 10.232421875, "rewards/rejected": -8.7890625, "step": 848 }, { "epoch": 0.5821049022968804, "grad_norm": 0.5397363094667057, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281018368.0, "logits/rejected": -269175627.2941176, "logps/chosen": -246.13333333333333, "logps/rejected": -377.4117647058824, "loss": 0.1519, "rewards/chosen": 1.4479166666666667, "rewards/margins": 9.374387254901961, "rewards/rejected": -7.926470588235294, "step": 849 }, { "epoch": 0.5827905382242029, "grad_norm": 0.5461159241038349, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -322122547.2, "logits/rejected": -352321536.0, "logps/chosen": -251.88571428571427, "logps/rejected": -406.62068965517244, "loss": 0.1884, "rewards/chosen": 1.2133928571428572, "rewards/margins": 16976370.868565273, "rewards/rejected": -16976369.655172415, "step": 850 }, { "epoch": 0.5834761741515255, "grad_norm": 0.5872168246864953, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277383304.53333336, "logits/rejected": -280278196.7058824, "logps/chosen": -307.06666666666666, "logps/rejected": -393.4117647058824, "loss": 0.158, "rewards/chosen": 2.1083333333333334, "rewards/margins": 7.18921568627451, "rewards/rejected": -5.080882352941177, "step": 851 }, { "epoch": 0.5841618100788482, "grad_norm": 0.5198053918137457, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -284400871.2258065, "logits/rejected": -352829936.4848485, "logps/chosen": -365.93548387096774, "logps/rejected": -350.54545454545456, "loss": 0.183, "rewards/chosen": 1.4233870967741935, "rewards/margins": 9.006720430107526, "rewards/rejected": -7.583333333333333, "step": 852 }, { "epoch": 0.5848474460061707, "grad_norm": 0.4968492127850972, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269452256.969697, "logits/rejected": -265458853.16129032, "logps/chosen": -271.5151515151515, "logps/rejected": -393.2903225806452, "loss": 0.1837, "rewards/chosen": 1.496212121212121, "rewards/margins": 8.584921798631477, "rewards/rejected": -7.088709677419355, "step": 853 }, { "epoch": 0.5855330819334933, "grad_norm": 0.5579705564936895, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251440354.07792208, "logits/rejected": -280442679.21568626, "logps/chosen": -222.12987012987014, "logps/rejected": -339.1372549019608, "loss": 0.1876, "rewards/chosen": 1.3944805194805194, "rewards/margins": 8.355264833206009, "rewards/rejected": -6.96078431372549, "step": 854 }, { "epoch": 0.5862187178608159, "grad_norm": 0.5482290822108243, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263137387.78947368, "logits/rejected": -230863944.11267605, "logps/chosen": -285.7543859649123, "logps/rejected": -305.1267605633803, "loss": 0.1425, "rewards/chosen": 1.9890350877192982, "rewards/margins": 8.89748579194465, "rewards/rejected": -6.908450704225352, "step": 855 }, { "epoch": 0.5869043537881385, "grad_norm": 0.6482334513350014, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280389222.4, "logits/rejected": -318417578.6666667, "logps/chosen": -271.0, "logps/rejected": -346.3333333333333, "loss": 0.209, "rewards/chosen": 1.8109375, "rewards/margins": 8.972395833333334, "rewards/rejected": -7.161458333333333, "step": 856 }, { "epoch": 0.5875899897154611, "grad_norm": 0.4855388106459078, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286331153.06666666, "logits/rejected": -287433185.88235295, "logps/chosen": -256.26666666666665, "logps/rejected": -376.0, "loss": 0.1639, "rewards/chosen": 1.3291666666666666, "rewards/margins": 8.424754901960785, "rewards/rejected": -7.095588235294118, "step": 857 }, { "epoch": 0.5882756256427837, "grad_norm": 0.5757730910178326, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270723258.1818182, "logits/rejected": -276012263.2258065, "logps/chosen": -268.6060606060606, "logps/rejected": -410.3225806451613, "loss": 0.1764, "rewards/chosen": 1.4900568181818181, "rewards/margins": 9.344895527859236, "rewards/rejected": -7.854838709677419, "step": 858 }, { "epoch": 0.5889612615701063, "grad_norm": 0.4883385212592391, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255911618.70422536, "logits/rejected": -281091952.28070176, "logps/chosen": -283.2676056338028, "logps/rejected": -325.6140350877193, "loss": 0.1587, "rewards/chosen": 1.6443661971830985, "rewards/margins": 9.394366197183098, "rewards/rejected": -7.75, "step": 859 }, { "epoch": 0.5896468974974288, "grad_norm": 0.48360804242127936, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218553197.7142857, "logits/rejected": -332748117.3333333, "logps/chosen": -240.14285714285714, "logps/rejected": -353.3333333333333, "loss": 0.1237, "rewards/chosen": 1.4151785714285714, "rewards/margins": 9.17906746031746, "rewards/rejected": -7.763888888888889, "step": 860 }, { "epoch": 0.5903325334247514, "grad_norm": 0.7545707621570729, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -296484864.0, "logits/rejected": -253231104.0, "logps/chosen": -240.375, "logps/rejected": -322.0, "loss": 0.1937, "rewards/chosen": 1.24609375, "rewards/margins": 7.74609375, "rewards/rejected": -6.5, "step": 861 }, { "epoch": 0.5910181693520741, "grad_norm": 0.5571633277436157, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -344203528.2580645, "logits/rejected": -260301048.24242425, "logps/chosen": -352.0, "logps/rejected": -365.57575757575756, "loss": 0.1723, "rewards/chosen": 1.7429435483870968, "rewards/margins": 8.947489002932551, "rewards/rejected": -7.204545454545454, "step": 862 }, { "epoch": 0.5917038052793966, "grad_norm": 0.5777838430948689, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -391905280.0, "logits/rejected": -314310656.0, "logps/chosen": -236.75, "logps/rejected": -336.5, "loss": 0.1718, "rewards/chosen": 1.4716796875, "rewards/margins": 8.4560546875, "rewards/rejected": -6.984375, "step": 863 }, { "epoch": 0.5923894412067192, "grad_norm": 0.5626553881200758, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281456578.86567163, "logits/rejected": -289613253.24590164, "logps/chosen": -237.61194029850745, "logps/rejected": -354.62295081967216, "loss": 0.1682, "rewards/chosen": 1.5205223880597014, "rewards/margins": 8.053309273305603, "rewards/rejected": -6.532786885245901, "step": 864 }, { "epoch": 0.5930750771340418, "grad_norm": 0.5713949946265888, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297613222.95652175, "logits/rejected": -338103556.33898306, "logps/chosen": -263.42028985507244, "logps/rejected": -369.35593220338984, "loss": 0.1957, "rewards/chosen": 1.082427536231884, "rewards/margins": 7.82819024809629, "rewards/rejected": -6.745762711864407, "step": 865 }, { "epoch": 0.5937607130613645, "grad_norm": 0.5659103652309576, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240739734.34920636, "logits/rejected": -224814694.4, "logps/chosen": -211.3015873015873, "logps/rejected": -330.33846153846156, "loss": 0.1895, "rewards/chosen": 1.1721230158730158, "rewards/margins": 7.910584554334554, "rewards/rejected": -6.7384615384615385, "step": 866 }, { "epoch": 0.594446348988687, "grad_norm": 0.5157268054740686, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -315519900.9032258, "logits/rejected": -289279875.8787879, "logps/chosen": -341.16129032258067, "logps/rejected": -352.4848484848485, "loss": 0.1616, "rewards/chosen": 1.344758064516129, "rewards/margins": 6.897788367546432, "rewards/rejected": -5.553030303030303, "step": 867 }, { "epoch": 0.5951319849160096, "grad_norm": 0.5673241546841367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -291263471.21311474, "logits/rejected": -291222421.01492536, "logps/chosen": -263.4754098360656, "logps/rejected": -387.34328358208955, "loss": 0.1686, "rewards/chosen": 1.194672131147541, "rewards/margins": 6.440940787863958, "rewards/rejected": -5.246268656716418, "step": 868 }, { "epoch": 0.5958176208433322, "grad_norm": 0.5589974688831655, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -275386016.9142857, "logits/rejected": -355792684.13793105, "logps/chosen": -360.22857142857146, "logps/rejected": -290.2068965517241, "loss": 0.1714, "rewards/chosen": 1.8089285714285714, "rewards/margins": 9.214100985221675, "rewards/rejected": -7.405172413793103, "step": 869 }, { "epoch": 0.5965032567706547, "grad_norm": 0.5388974204524178, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297109243.3454546, "logits/rejected": -277398626.1917808, "logps/chosen": -306.90909090909093, "logps/rejected": -328.7671232876712, "loss": 0.1642, "rewards/chosen": 1.415909090909091, "rewards/margins": 7.813169364881694, "rewards/rejected": -6.397260273972603, "step": 870 }, { "epoch": 0.5971888926979774, "grad_norm": 0.4721193836081574, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -302888667.4285714, "logits/rejected": -259347797.33333334, "logps/chosen": -217.71428571428572, "logps/rejected": -431.55555555555554, "loss": 0.1515, "rewards/chosen": 1.1662946428571428, "rewards/margins": 8.930183531746032, "rewards/rejected": -7.763888888888889, "step": 871 }, { "epoch": 0.5978745286253, "grad_norm": 0.5464943326202067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234881024.0, "logits/rejected": -268164855.7419355, "logps/chosen": -229.57575757575756, "logps/rejected": -376.7741935483871, "loss": 0.1554, "rewards/chosen": 1.4734848484848484, "rewards/margins": 9.247678396871946, "rewards/rejected": -7.774193548387097, "step": 872 }, { "epoch": 0.5985601645526225, "grad_norm": 0.5393478156315662, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -236182704.55172414, "logits/rejected": -299832817.37142855, "logps/chosen": -302.62068965517244, "logps/rejected": -361.14285714285717, "loss": 0.1596, "rewards/chosen": 1.478448275862069, "rewards/margins": 8.371305418719212, "rewards/rejected": -6.892857142857143, "step": 873 }, { "epoch": 0.5992458004799451, "grad_norm": 0.6594648582546379, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228428248.6153846, "logits/rejected": -273761556.3174603, "logps/chosen": -275.2, "logps/rejected": -350.984126984127, "loss": 0.1762, "rewards/chosen": 1.5259615384615384, "rewards/margins": 8.94659645909646, "rewards/rejected": -7.420634920634921, "step": 874 }, { "epoch": 0.5999314364072678, "grad_norm": 0.6219459905578674, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -282224948.60273975, "logits/rejected": -311903697.45454544, "logps/chosen": -342.35616438356163, "logps/rejected": -342.4, "loss": 0.1877, "rewards/chosen": 1.5376712328767124, "rewards/margins": 9.67403486924035, "rewards/rejected": -8.136363636363637, "step": 875 }, { "epoch": 0.6006170723345904, "grad_norm": 0.4359024933385692, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241696768.0, "logits/rejected": -207355904.0, "logps/chosen": -247.25, "logps/rejected": -308.5, "loss": 0.1405, "rewards/chosen": 1.431640625, "rewards/margins": 9.001953125, "rewards/rejected": -7.5703125, "step": 876 }, { "epoch": 0.6013027082619129, "grad_norm": 0.6048959508849369, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234614718.98412699, "logits/rejected": -281599117.7846154, "logps/chosen": -330.6666666666667, "logps/rejected": -381.53846153846155, "loss": 0.1739, "rewards/chosen": 1.2857142857142858, "rewards/margins": 8.639560439560439, "rewards/rejected": -7.3538461538461535, "step": 877 }, { "epoch": 0.6019883441892355, "grad_norm": 0.47635005167087624, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290216403.0877193, "logits/rejected": -306243266.70422536, "logps/chosen": -345.5438596491228, "logps/rejected": -354.2535211267606, "loss": 0.1367, "rewards/chosen": 2.2280701754385963, "rewards/margins": 9.375957499382258, "rewards/rejected": -7.147887323943662, "step": 878 }, { "epoch": 0.6026739801165582, "grad_norm": 0.48388044627491805, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -230280819.61290324, "logits/rejected": -245303233.93939394, "logps/chosen": -204.7741935483871, "logps/rejected": -364.1212121212121, "loss": 0.1807, "rewards/chosen": 1.2439516129032258, "rewards/margins": 9.554557673509287, "rewards/rejected": -8.31060606060606, "step": 879 }, { "epoch": 0.6033596160438807, "grad_norm": 0.49717941460962223, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260858648.77419356, "logits/rejected": -287754674.42424244, "logps/chosen": -270.19354838709677, "logps/rejected": -426.6666666666667, "loss": 0.1674, "rewards/chosen": 1.6834677419354838, "rewards/margins": 8.903164711632453, "rewards/rejected": -7.21969696969697, "step": 880 }, { "epoch": 0.6040452519712033, "grad_norm": 0.5526804335289597, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271790899.2, "logits/rejected": -269234371.04761904, "logps/chosen": -305.96923076923076, "logps/rejected": -385.015873015873, "loss": 0.1646, "rewards/chosen": 1.6129807692307692, "rewards/margins": 8.628853785103786, "rewards/rejected": -7.015873015873016, "step": 881 }, { "epoch": 0.6047308878985259, "grad_norm": 0.602132474571508, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300568090.0338983, "logits/rejected": -257007497.2753623, "logps/chosen": -250.84745762711864, "logps/rejected": -394.6666666666667, "loss": 0.1823, "rewards/chosen": 1.0953389830508475, "rewards/margins": 7.899686809137804, "rewards/rejected": -6.804347826086956, "step": 882 }, { "epoch": 0.6054165238258484, "grad_norm": 0.611506983867467, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251425223.1111111, "logits/rejected": -215107876.57142857, "logps/chosen": -186.22222222222223, "logps/rejected": -349.42857142857144, "loss": 0.1963, "rewards/chosen": 1.1167534722222223, "rewards/margins": 8.688182043650794, "rewards/rejected": -7.571428571428571, "step": 883 }, { "epoch": 0.606102159753171, "grad_norm": 0.6875130086478961, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259784704.0, "logits/rejected": -262144000.0, "logps/chosen": -225.125, "logps/rejected": -369.0, "loss": 0.1978, "rewards/chosen": 0.802734375, "rewards/margins": 8.740234375, "rewards/rejected": -7.9375, "step": 884 }, { "epoch": 0.6067877956804937, "grad_norm": 0.5585384752828613, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253129925.6140351, "logits/rejected": -233935828.73239437, "logps/chosen": -261.89473684210526, "logps/rejected": -335.77464788732397, "loss": 0.1388, "rewards/chosen": 1.2346491228070176, "rewards/margins": 9.086761798863355, "rewards/rejected": -7.852112676056338, "step": 885 }, { "epoch": 0.6074734316078162, "grad_norm": 0.6720168217141558, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -314646384.28070176, "logits/rejected": -253548630.53521127, "logps/chosen": -324.2105263157895, "logps/rejected": -353.80281690140845, "loss": 0.1486, "rewards/chosen": 1.2171052631578947, "rewards/margins": 8.787527798369162, "rewards/rejected": -7.570422535211268, "step": 886 }, { "epoch": 0.6081590675351388, "grad_norm": 0.5922857735308841, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244081432.77419356, "logits/rejected": -305802891.6363636, "logps/chosen": -301.6774193548387, "logps/rejected": -340.8484848484849, "loss": 0.174, "rewards/chosen": 1.3921370967741935, "rewards/margins": 7.361834066471163, "rewards/rejected": -5.96969696969697, "step": 887 }, { "epoch": 0.6088447034624614, "grad_norm": 0.6242545814086675, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -312475648.0, "logits/rejected": -280494080.0, "logps/chosen": -239.5, "logps/rejected": -389.0, "loss": 0.1975, "rewards/chosen": 1.431640625, "rewards/margins": 8.478515625, "rewards/rejected": -7.046875, "step": 888 }, { "epoch": 0.6095303393897841, "grad_norm": 0.6048512988670267, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -333570529.88235295, "logits/rejected": -234042163.2, "logps/chosen": -272.94117647058823, "logps/rejected": -362.4, "loss": 0.1758, "rewards/chosen": 1.6884191176470589, "rewards/margins": 9.69675245098039, "rewards/rejected": -8.008333333333333, "step": 889 }, { "epoch": 0.6102159753171066, "grad_norm": 0.6095572470282362, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276136473.1803279, "logits/rejected": -262175300.7761194, "logps/chosen": -265.8360655737705, "logps/rejected": -359.1641791044776, "loss": 0.1654, "rewards/chosen": 1.2899590163934427, "rewards/margins": 8.581003792512846, "rewards/rejected": -7.291044776119403, "step": 890 }, { "epoch": 0.6109016112444292, "grad_norm": 0.6082627201126684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278921216.0, "logits/rejected": -264765440.0, "logps/chosen": -281.75, "logps/rejected": -355.5, "loss": 0.1681, "rewards/chosen": 1.26171875, "rewards/margins": 7.68359375, "rewards/rejected": -6.421875, "step": 891 }, { "epoch": 0.6115872471717518, "grad_norm": 0.5273434242222449, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271581184.0, "logits/rejected": -355729408.0, "logps/chosen": -242.25, "logps/rejected": -312.75, "loss": 0.1642, "rewards/chosen": 0.818359375, "rewards/margins": 7.318359375, "rewards/rejected": -6.5, "step": 892 }, { "epoch": 0.6122728830990743, "grad_norm": 0.5706252139137314, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -216106520.3809524, "logits/rejected": -239656077.7846154, "logps/chosen": -208.0, "logps/rejected": -321.96923076923076, "loss": 0.166, "rewards/chosen": 1.2083333333333333, "rewards/margins": 8.28525641025641, "rewards/rejected": -7.076923076923077, "step": 893 }, { "epoch": 0.612958519026397, "grad_norm": 0.632840223829732, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -222597705.14285713, "logits/rejected": -205520896.0, "logps/chosen": -291.14285714285717, "logps/rejected": -333.3333333333333, "loss": 0.1509, "rewards/chosen": 1.5435267857142858, "rewards/margins": 9.731026785714286, "rewards/rejected": -8.1875, "step": 894 }, { "epoch": 0.6136441549537196, "grad_norm": 0.5593777303478087, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -336681758.37288135, "logits/rejected": -283996931.71014494, "logps/chosen": -317.2881355932203, "logps/rejected": -356.17391304347825, "loss": 0.1475, "rewards/chosen": 1.4364406779661016, "rewards/margins": 7.77702038811103, "rewards/rejected": -6.340579710144928, "step": 895 }, { "epoch": 0.6143297908810421, "grad_norm": 0.6021485035702526, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300464686.54545456, "logits/rejected": -314437499.87096775, "logps/chosen": -238.54545454545453, "logps/rejected": -348.9032258064516, "loss": 0.1865, "rewards/chosen": 1.2575757575757576, "rewards/margins": 8.451124144672532, "rewards/rejected": -7.193548387096774, "step": 896 }, { "epoch": 0.6150154268083647, "grad_norm": 0.6473581635016031, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253615581.86666667, "logits/rejected": -283238881.88235295, "logps/chosen": -262.93333333333334, "logps/rejected": -332.2352941176471, "loss": 0.1818, "rewards/chosen": 1.35888671875, "rewards/margins": 8.145651424632353, "rewards/rejected": -6.786764705882353, "step": 897 }, { "epoch": 0.6157010627356874, "grad_norm": 0.42958112372304447, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -338540251.4285714, "logits/rejected": -301290837.3333333, "logps/chosen": -271.14285714285717, "logps/rejected": -359.1111111111111, "loss": 0.127, "rewards/chosen": 1.7555803571428572, "rewards/margins": 10.679191468253968, "rewards/rejected": -8.92361111111111, "step": 898 }, { "epoch": 0.61638669866301, "grad_norm": 0.4980624668201034, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -275404453.4153846, "logits/rejected": -295864872.63492066, "logps/chosen": -328.12307692307695, "logps/rejected": -336.25396825396825, "loss": 0.1557, "rewards/chosen": 1.8413461538461537, "rewards/margins": 8.642933455433456, "rewards/rejected": -6.801587301587301, "step": 899 }, { "epoch": 0.6170723345903325, "grad_norm": 0.49658661010232635, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253465017.1076923, "logits/rejected": -237810379.17460316, "logps/chosen": -211.3230769230769, "logps/rejected": -282.6666666666667, "loss": 0.1756, "rewards/chosen": 1.4230769230769231, "rewards/margins": 8.423076923076923, "rewards/rejected": -7.0, "step": 900 }, { "epoch": 0.6177579705176551, "grad_norm": 0.42833918009136857, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261284511.47540984, "logits/rejected": -357705269.4925373, "logps/chosen": -289.04918032786884, "logps/rejected": -388.2985074626866, "loss": 0.1726, "rewards/chosen": 1.0855532786885247, "rewards/margins": 9.316896562270614, "rewards/rejected": -8.23134328358209, "step": 901 }, { "epoch": 0.6184436064449778, "grad_norm": 0.5953180302930915, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234753923.87878788, "logits/rejected": -275741662.9677419, "logps/chosen": -329.45454545454544, "logps/rejected": -292.38709677419354, "loss": 0.2046, "rewards/chosen": 0.865530303030303, "rewards/margins": 7.913917399804497, "rewards/rejected": -7.048387096774194, "step": 902 }, { "epoch": 0.6191292423723003, "grad_norm": 0.5160380150763663, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279707648.0, "logits/rejected": -233570304.0, "logps/chosen": -282.5, "logps/rejected": -338.5, "loss": 0.1584, "rewards/chosen": 1.44921875, "rewards/margins": 9.82421875, "rewards/rejected": -8.375, "step": 903 }, { "epoch": 0.6198148782996229, "grad_norm": 0.5284540149031726, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234733855.4385965, "logits/rejected": -236298816.90140846, "logps/chosen": -216.7017543859649, "logps/rejected": -306.92957746478874, "loss": 0.1595, "rewards/chosen": 1.0866228070175439, "rewards/margins": 6.938735483073882, "rewards/rejected": -5.852112676056338, "step": 904 }, { "epoch": 0.6205005142269455, "grad_norm": 0.629796292472369, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -314048512.0, "logits/rejected": -308019200.0, "logps/chosen": -266.25, "logps/rejected": -383.5, "loss": 0.171, "rewards/chosen": 1.4609375, "rewards/margins": 8.46875, "rewards/rejected": -7.0078125, "step": 905 }, { "epoch": 0.621186150154268, "grad_norm": 0.49319691223868534, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294563907.147541, "logits/rejected": -308750855.64179105, "logps/chosen": -236.59016393442624, "logps/rejected": -375.4029850746269, "loss": 0.1263, "rewards/chosen": 2.1290983606557377, "rewards/margins": 10.248501345730364, "rewards/rejected": -8.119402985074627, "step": 906 }, { "epoch": 0.6218717860815907, "grad_norm": 0.49356664241309717, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -206827318.55737704, "logits/rejected": -253411083.46268657, "logps/chosen": -322.62295081967216, "logps/rejected": -325.7313432835821, "loss": 0.1581, "rewards/chosen": 1.326844262295082, "rewards/margins": 9.125351724981648, "rewards/rejected": -7.798507462686567, "step": 907 }, { "epoch": 0.6225574220089133, "grad_norm": 0.6738706622987903, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260625372.68965518, "logits/rejected": -326197013.94285715, "logps/chosen": -372.13793103448273, "logps/rejected": -319.54285714285714, "loss": 0.1581, "rewards/chosen": 1.456896551724138, "rewards/margins": 6.6854679802955665, "rewards/rejected": -5.228571428571429, "step": 908 }, { "epoch": 0.6232430579362359, "grad_norm": 0.5896466122887951, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -310013773.9130435, "logits/rejected": -277534962.9830508, "logps/chosen": -241.6231884057971, "logps/rejected": -322.4406779661017, "loss": 0.2005, "rewards/chosen": 0.9963768115942029, "rewards/margins": 9.106546303119627, "rewards/rejected": -8.110169491525424, "step": 909 }, { "epoch": 0.6239286938635584, "grad_norm": 0.5403174606801165, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276957216.50793654, "logits/rejected": -298376333.7846154, "logps/chosen": -263.1111111111111, "logps/rejected": -334.7692307692308, "loss": 0.1548, "rewards/chosen": 1.7003968253968254, "rewards/margins": 7.6696275946275945, "rewards/rejected": -5.969230769230769, "step": 910 }, { "epoch": 0.624614329790881, "grad_norm": 0.7388483517645336, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232244604.34285715, "logits/rejected": -269013980.6896552, "logps/chosen": -266.0571428571429, "logps/rejected": -393.9310344827586, "loss": 0.1785, "rewards/chosen": 1.5982142857142858, "rewards/margins": 9.75338669950739, "rewards/rejected": -8.155172413793103, "step": 911 }, { "epoch": 0.6252999657182037, "grad_norm": 0.7774977310005123, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233327578.07407406, "logits/rejected": -267075141.1891892, "logps/chosen": -258.0740740740741, "logps/rejected": -298.81081081081084, "loss": 0.1615, "rewards/chosen": 1.3773148148148149, "rewards/margins": 8.985422922922922, "rewards/rejected": -7.608108108108108, "step": 912 }, { "epoch": 0.6259856016455262, "grad_norm": 0.4962799538654354, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266785238.03278688, "logits/rejected": -241141179.2238806, "logps/chosen": -255.86885245901638, "logps/rejected": -309.97014925373134, "loss": 0.1438, "rewards/chosen": 1.6086065573770492, "rewards/margins": 9.27278566185466, "rewards/rejected": -7.664179104477612, "step": 913 }, { "epoch": 0.6266712375728488, "grad_norm": 0.6629322081996697, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -318527429.48571426, "logits/rejected": -276245539.3103448, "logps/chosen": -256.45714285714286, "logps/rejected": -307.86206896551727, "loss": 0.1908, "rewards/chosen": 1.0035714285714286, "rewards/margins": 6.960467980295567, "rewards/rejected": -5.956896551724138, "step": 914 }, { "epoch": 0.6273568735001714, "grad_norm": 0.6233554910283965, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300262821.64705884, "logits/rejected": -292203178.6666667, "logps/chosen": -273.1764705882353, "logps/rejected": -344.26666666666665, "loss": 0.1512, "rewards/chosen": 1.5827205882352942, "rewards/margins": 9.916053921568627, "rewards/rejected": -8.333333333333334, "step": 915 }, { "epoch": 0.628042509427494, "grad_norm": 0.5870398240344626, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251164792.47058824, "logits/rejected": -293042039.46666664, "logps/chosen": -243.05882352941177, "logps/rejected": -326.93333333333334, "loss": 0.1698, "rewards/chosen": 1.1911764705882353, "rewards/margins": 4.591176470588235, "rewards/rejected": -3.4, "step": 916 }, { "epoch": 0.6287281453548166, "grad_norm": 0.6274661103513373, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276824064.0, "logits/rejected": -398738500.26666665, "logps/chosen": -273.1764705882353, "logps/rejected": -389.3333333333333, "loss": 0.1753, "rewards/chosen": 1.525735294117647, "rewards/margins": 7.95906862745098, "rewards/rejected": -6.433333333333334, "step": 917 }, { "epoch": 0.6294137812821392, "grad_norm": 0.6968567703989679, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245332404.45901638, "logits/rejected": -276698860.8955224, "logps/chosen": -258.88524590163934, "logps/rejected": -362.9850746268657, "loss": 0.1551, "rewards/chosen": 1.7397540983606556, "rewards/margins": 9.918858575972596, "rewards/rejected": -8.17910447761194, "step": 918 }, { "epoch": 0.6300994172094617, "grad_norm": 0.4716944091203337, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223453065.2753623, "logits/rejected": -292606021.4237288, "logps/chosen": -207.53623188405797, "logps/rejected": -367.1864406779661, "loss": 0.1681, "rewards/chosen": 1.6666666666666667, "rewards/margins": 9.878531073446327, "rewards/rejected": -8.211864406779661, "step": 919 }, { "epoch": 0.6307850531367843, "grad_norm": 0.5609098287562035, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304822404.987013, "logits/rejected": -308569188.39215684, "logps/chosen": -237.4025974025974, "logps/rejected": -382.11764705882354, "loss": 0.1909, "rewards/chosen": 1.6574675324675325, "rewards/margins": 7.16727145403616, "rewards/rejected": -5.509803921568627, "step": 920 }, { "epoch": 0.631470689064107, "grad_norm": 0.4840109179180199, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235913468.06153846, "logits/rejected": -286810502.0952381, "logps/chosen": -261.16923076923075, "logps/rejected": -301.968253968254, "loss": 0.169, "rewards/chosen": 1.5355769230769232, "rewards/margins": 8.39271978021978, "rewards/rejected": -6.857142857142857, "step": 921 }, { "epoch": 0.6321563249914296, "grad_norm": 0.6573684705341287, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -230744176.21917808, "logits/rejected": -258979206.98181817, "logps/chosen": -271.3424657534247, "logps/rejected": -427.6363636363636, "loss": 0.1967, "rewards/chosen": 1.2268835616438356, "rewards/margins": 5.426883561643836, "rewards/rejected": -4.2, "step": 922 }, { "epoch": 0.6328419609187521, "grad_norm": 0.5508198005397512, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283996931.71014494, "logits/rejected": -332700724.0677966, "logps/chosen": -235.1304347826087, "logps/rejected": -372.6101694915254, "loss": 0.1667, "rewards/chosen": 1.568840579710145, "rewards/margins": 7.619688037337264, "rewards/rejected": -6.0508474576271185, "step": 923 }, { "epoch": 0.6335275968460747, "grad_norm": 0.484685659952093, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -200309791.03030303, "logits/rejected": -223245212.9032258, "logps/chosen": -251.3939393939394, "logps/rejected": -346.3225806451613, "loss": 0.1558, "rewards/chosen": 1.759469696969697, "rewards/margins": 9.09011485826002, "rewards/rejected": -7.330645161290323, "step": 924 }, { "epoch": 0.6342132327733974, "grad_norm": 0.5831642311484844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -363602767.44827586, "logits/rejected": -286650719.0857143, "logps/chosen": -243.31034482758622, "logps/rejected": -432.9142857142857, "loss": 0.1605, "rewards/chosen": 1.709051724137931, "rewards/margins": 8.609051724137931, "rewards/rejected": -6.9, "step": 925 }, { "epoch": 0.6348988687007199, "grad_norm": 0.5843832589899232, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253846572.52173913, "logits/rejected": -245402328.94915253, "logps/chosen": -259.2463768115942, "logps/rejected": -333.5593220338983, "loss": 0.1936, "rewards/chosen": 1.3233695652173914, "rewards/margins": 22428634.27252211, "rewards/rejected": -22428632.949152544, "step": 926 }, { "epoch": 0.6355845046280425, "grad_norm": 0.5645679081852435, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286503227.0769231, "logits/rejected": -303587718.0952381, "logps/chosen": -261.4153846153846, "logps/rejected": -332.8253968253968, "loss": 0.1698, "rewards/chosen": 1.4019230769230768, "rewards/margins": 7.727319902319902, "rewards/rejected": -6.325396825396825, "step": 927 }, { "epoch": 0.6362701405553651, "grad_norm": 0.5555644714914474, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -346928859.4285714, "logits/rejected": -331583032.8888889, "logps/chosen": -218.85714285714286, "logps/rejected": -320.0, "loss": 0.1559, "rewards/chosen": 0.9634486607142857, "rewards/margins": 7.671781994047619, "rewards/rejected": -6.708333333333333, "step": 928 }, { "epoch": 0.6369557764826876, "grad_norm": 0.5083716652652136, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241591910.4, "logits/rejected": -237011464.12698412, "logps/chosen": -320.0, "logps/rejected": -334.22222222222223, "loss": 0.1472, "rewards/chosen": 1.8788461538461538, "rewards/margins": 9.1248778998779, "rewards/rejected": -7.246031746031746, "step": 929 }, { "epoch": 0.6376414124100103, "grad_norm": 0.5986790783140775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -247401334.4477612, "logits/rejected": -233643360.52459016, "logps/chosen": -217.07462686567163, "logps/rejected": -322.62295081967216, "loss": 0.1801, "rewards/chosen": 1.330223880597015, "rewards/margins": 8.264650110105212, "rewards/rejected": -6.934426229508197, "step": 930 }, { "epoch": 0.6383270483373329, "grad_norm": 0.5426770748348924, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261549285.25373134, "logits/rejected": -295939088.78688526, "logps/chosen": -302.56716417910445, "logps/rejected": -357.24590163934425, "loss": 0.1627, "rewards/chosen": 1.3451492537313432, "rewards/margins": 8.140231220944457, "rewards/rejected": -6.795081967213115, "step": 931 }, { "epoch": 0.6390126842646555, "grad_norm": 0.6341919677012824, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271231658.6666667, "logits/rejected": -243269632.0, "logps/chosen": -220.4848484848485, "logps/rejected": -316.9032258064516, "loss": 0.1807, "rewards/chosen": 1.3409090909090908, "rewards/margins": 8.502199413489736, "rewards/rejected": -7.161290322580645, "step": 932 }, { "epoch": 0.639698320191978, "grad_norm": 0.5654133295565476, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277223521.52380955, "logits/rejected": -304829109.16923076, "logps/chosen": -326.85714285714283, "logps/rejected": -388.4307692307692, "loss": 0.1795, "rewards/chosen": 1.4821428571428572, "rewards/margins": 8.620604395604396, "rewards/rejected": -7.138461538461539, "step": 933 }, { "epoch": 0.6403839561193007, "grad_norm": 0.5875666104633893, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240293029.16129032, "logits/rejected": -277586664.72727275, "logps/chosen": -266.3225806451613, "logps/rejected": -398.54545454545456, "loss": 0.1634, "rewards/chosen": 1.279233870967742, "rewards/margins": 8.892870234604105, "rewards/rejected": -7.613636363636363, "step": 934 }, { "epoch": 0.6410695920466233, "grad_norm": 0.7960738276262953, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272082676.8695652, "logits/rejected": -277250603.3898305, "logps/chosen": -333.4492753623188, "logps/rejected": -367.1864406779661, "loss": 0.1629, "rewards/chosen": 1.693840579710145, "rewards/margins": 7.37180668140506, "rewards/rejected": -5.677966101694915, "step": 935 }, { "epoch": 0.6417552279739458, "grad_norm": 0.5423439537786158, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -284985952.8648649, "logits/rejected": -324048820.1481481, "logps/chosen": -332.97297297297297, "logps/rejected": -458.6666666666667, "loss": 0.1728, "rewards/chosen": 1.8986486486486487, "rewards/margins": 8.083833833833834, "rewards/rejected": -6.185185185185185, "step": 936 }, { "epoch": 0.6424408639012684, "grad_norm": 0.5296799072570657, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265474770.82352942, "logits/rejected": -250539758.93333334, "logps/chosen": -246.35294117647058, "logps/rejected": -297.06666666666666, "loss": 0.1924, "rewards/chosen": 1.0606617647058822, "rewards/margins": 8.518995098039216, "rewards/rejected": -7.458333333333333, "step": 937 }, { "epoch": 0.643126499828591, "grad_norm": 0.5073404943794365, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -301989888.0, "logits/rejected": -266729298.44067797, "logps/chosen": -303.768115942029, "logps/rejected": -403.79661016949154, "loss": 0.1581, "rewards/chosen": 1.7780797101449275, "rewards/margins": 8.727232252517808, "rewards/rejected": -6.9491525423728815, "step": 938 }, { "epoch": 0.6438121357559136, "grad_norm": 1.745384804131027, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268985528.6557377, "logits/rejected": -260046848.0, "logps/chosen": -331.0163934426229, "logps/rejected": -338.14925373134326, "loss": 0.1417, "rewards/chosen": 2.1004098360655736, "rewards/margins": 9.96608147785662, "rewards/rejected": -7.865671641791045, "step": 939 }, { "epoch": 0.6444977716832362, "grad_norm": 0.4992821437314696, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -275705582.93333334, "logits/rejected": -286446290.8235294, "logps/chosen": -225.6, "logps/rejected": -342.11764705882354, "loss": 0.1725, "rewards/chosen": 1.3979166666666667, "rewards/margins": 8.258210784313725, "rewards/rejected": -6.860294117647059, "step": 940 }, { "epoch": 0.6451834076105588, "grad_norm": 0.6278898781705485, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237144616.63492063, "logits/rejected": -250109573.9076923, "logps/chosen": -288.76190476190476, "logps/rejected": -372.18461538461537, "loss": 0.1832, "rewards/chosen": 1.7043650793650793, "rewards/margins": 8.796672771672771, "rewards/rejected": -7.092307692307692, "step": 941 }, { "epoch": 0.6458690435378814, "grad_norm": 0.5590229451898068, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288164219.2592593, "logits/rejected": -323528205.8378378, "logps/chosen": -189.33333333333334, "logps/rejected": -369.2972972972973, "loss": 0.1375, "rewards/chosen": 1.2893518518518519, "rewards/margins": 8.904216716716716, "rewards/rejected": -7.614864864864865, "step": 942 }, { "epoch": 0.646554679465204, "grad_norm": 0.6035724209272231, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -210745379.92982456, "logits/rejected": -311441840.6760563, "logps/chosen": -235.3684210526316, "logps/rejected": -379.49295774647885, "loss": 0.1722, "rewards/chosen": 0.9364035087719298, "rewards/margins": 7.851896466518409, "rewards/rejected": -6.915492957746479, "step": 943 }, { "epoch": 0.6472403153925266, "grad_norm": 0.659193714330683, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260286522.5142857, "logits/rejected": -384718918.62068963, "logps/chosen": -264.9142857142857, "logps/rejected": -360.82758620689657, "loss": 0.1944, "rewards/chosen": 1.6267857142857143, "rewards/margins": 8.281958128078818, "rewards/rejected": -6.655172413793103, "step": 944 }, { "epoch": 0.6479259513198492, "grad_norm": 0.5371441951099766, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -296036682.32258064, "logits/rejected": -280382867.3939394, "logps/chosen": -334.19354838709677, "logps/rejected": -390.7878787878788, "loss": 0.1912, "rewards/chosen": 2.086693548387097, "rewards/margins": 8.692754154447703, "rewards/rejected": -6.606060606060606, "step": 945 }, { "epoch": 0.6486115872471717, "grad_norm": 0.4956811078843694, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279620266.6666667, "logits/rejected": -249335240.86153847, "logps/chosen": -276.3174603174603, "logps/rejected": -298.83076923076925, "loss": 0.1711, "rewards/chosen": 1.8363095238095237, "rewards/margins": 8.8132326007326, "rewards/rejected": -6.976923076923077, "step": 946 }, { "epoch": 0.6492972231744943, "grad_norm": 0.6353851182600256, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258267446.3030303, "logits/rejected": -271141458.58064514, "logps/chosen": -233.6969696969697, "logps/rejected": -407.741935483871, "loss": 0.1785, "rewards/chosen": 1.3058712121212122, "rewards/margins": 6.733290566959922, "rewards/rejected": -5.42741935483871, "step": 947 }, { "epoch": 0.649982859101817, "grad_norm": 0.45983027403105975, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -217264947.2, "logits/rejected": -231180167.52941176, "logps/chosen": -280.26666666666665, "logps/rejected": -358.11764705882354, "loss": 0.1265, "rewards/chosen": 1.9520833333333334, "rewards/margins": 10.121200980392157, "rewards/rejected": -8.169117647058824, "step": 948 }, { "epoch": 0.6506684950291395, "grad_norm": 0.552959777388709, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292439780.4307692, "logits/rejected": -293734432.50793654, "logps/chosen": -253.04615384615386, "logps/rejected": -373.8412698412698, "loss": 0.158, "rewards/chosen": 1.4990384615384615, "rewards/margins": 9.562530525030525, "rewards/rejected": -8.063492063492063, "step": 949 }, { "epoch": 0.6513541309564621, "grad_norm": 0.7034309992223903, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -306311292.1212121, "logits/rejected": -312272697.8064516, "logps/chosen": -324.6060606060606, "logps/rejected": -407.741935483871, "loss": 0.1779, "rewards/chosen": 1.7954545454545454, "rewards/margins": 6.827712609970675, "rewards/rejected": -5.032258064516129, "step": 950 }, { "epoch": 0.6520397668837847, "grad_norm": 0.5678944944645993, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269678212.7407407, "logits/rejected": -261407162.8108108, "logps/chosen": -319.7037037037037, "logps/rejected": -347.2432432432432, "loss": 0.1498, "rewards/chosen": 1.3877314814814814, "rewards/margins": 7.955299049049049, "rewards/rejected": -6.5675675675675675, "step": 951 }, { "epoch": 0.6527254028111072, "grad_norm": 0.5485861350814598, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250279290.73972604, "logits/rejected": -269045536.58181816, "logps/chosen": -200.1095890410959, "logps/rejected": -333.3818181818182, "loss": 0.1933, "rewards/chosen": 1.2003424657534247, "rewards/margins": 8.491251556662515, "rewards/rejected": -7.290909090909091, "step": 952 }, { "epoch": 0.6534110387384299, "grad_norm": 0.6361435465922618, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257070245.16129032, "logits/rejected": -301481487.5151515, "logps/chosen": -264.51612903225805, "logps/rejected": -448.969696969697, "loss": 0.1836, "rewards/chosen": 1.971774193548387, "rewards/margins": 8.729349951124146, "rewards/rejected": -6.757575757575758, "step": 953 }, { "epoch": 0.6540966746657525, "grad_norm": 0.569015371896042, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283534950.4, "logits/rejected": -278551130.35294116, "logps/chosen": -242.0, "logps/rejected": -340.2352941176471, "loss": 0.1498, "rewards/chosen": 1.7208333333333334, "rewards/margins": 8.54436274509804, "rewards/rejected": -6.823529411764706, "step": 954 }, { "epoch": 0.6547823105930751, "grad_norm": 0.6321415358894784, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246415360.0, "logits/rejected": -240123904.0, "logps/chosen": -218.0, "logps/rejected": -370.0, "loss": 0.1522, "rewards/chosen": 1.60546875, "rewards/margins": 9.49609375, "rewards/rejected": -7.890625, "step": 955 }, { "epoch": 0.6554679465203976, "grad_norm": 0.43780614237742843, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -187485388.8, "logits/rejected": -226985863.52941176, "logps/chosen": -197.2, "logps/rejected": -288.94117647058823, "loss": 0.1621, "rewards/chosen": 1.390625, "rewards/margins": 8.574448529411764, "rewards/rejected": -7.1838235294117645, "step": 956 }, { "epoch": 0.6561535824477203, "grad_norm": 0.5938986161767819, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259311005.19298247, "logits/rejected": -243387781.4084507, "logps/chosen": -273.4035087719298, "logps/rejected": -361.46478873239437, "loss": 0.1659, "rewards/chosen": 1.0921052631578947, "rewards/margins": 8.31745737583395, "rewards/rejected": -7.225352112676056, "step": 957 }, { "epoch": 0.6568392183750429, "grad_norm": 0.6490239507666563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272909380.26666665, "logits/rejected": -241542565.6470588, "logps/chosen": -313.8666666666667, "logps/rejected": -362.3529411764706, "loss": 0.1547, "rewards/chosen": 1.025, "rewards/margins": 6.238235294117647, "rewards/rejected": -5.213235294117647, "step": 958 }, { "epoch": 0.6575248543023654, "grad_norm": 0.5113155744695688, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -341416345.6, "logits/rejected": -252151687.52941176, "logps/chosen": -308.26666666666665, "logps/rejected": -406.11764705882354, "loss": 0.1412, "rewards/chosen": 1.3927083333333334, "rewards/margins": 10.157414215686275, "rewards/rejected": -8.764705882352942, "step": 959 }, { "epoch": 0.658210490229688, "grad_norm": 0.4857487287851687, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234493857.47692308, "logits/rejected": -259647390.47619048, "logps/chosen": -206.4, "logps/rejected": -347.93650793650795, "loss": 0.1672, "rewards/chosen": 1.2153846153846153, "rewards/margins": 8.687606837606838, "rewards/rejected": -7.472222222222222, "step": 960 }, { "epoch": 0.6588961261570107, "grad_norm": 0.5372992987146991, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220571045.6470588, "logits/rejected": -278501785.6, "logps/chosen": -331.7647058823529, "logps/rejected": -334.4, "loss": 0.2015, "rewards/chosen": 1.6158088235294117, "rewards/margins": 8.024142156862744, "rewards/rejected": -6.408333333333333, "step": 961 }, { "epoch": 0.6595817620843332, "grad_norm": 0.5688415825128512, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -308498290.7586207, "logits/rejected": -275386016.9142857, "logps/chosen": -251.0344827586207, "logps/rejected": -358.85714285714283, "loss": 0.1451, "rewards/chosen": 1.8232758620689655, "rewards/margins": 9.344704433497537, "rewards/rejected": -7.521428571428571, "step": 962 }, { "epoch": 0.6602673980116558, "grad_norm": 0.474590250324121, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -320740894.11764705, "logits/rejected": -251876125.92207792, "logps/chosen": -297.0980392156863, "logps/rejected": -394.38961038961037, "loss": 0.1618, "rewards/chosen": 0.8063725490196079, "rewards/margins": 8.520658263305322, "rewards/rejected": -7.714285714285714, "step": 963 }, { "epoch": 0.6609530339389784, "grad_norm": 0.5657780637992601, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -306113102.10169494, "logits/rejected": -298585815.1884058, "logps/chosen": -279.3220338983051, "logps/rejected": -345.5072463768116, "loss": 0.1367, "rewards/chosen": 1.7891949152542372, "rewards/margins": 10.67325288626873, "rewards/rejected": -8.884057971014492, "step": 964 }, { "epoch": 0.661638669866301, "grad_norm": 0.5899318618580274, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279986981.7704918, "logits/rejected": -288217546.5074627, "logps/chosen": -297.44262295081967, "logps/rejected": -335.2835820895522, "loss": 0.1566, "rewards/chosen": 1.6270491803278688, "rewards/margins": 9.462870075850256, "rewards/rejected": -7.835820895522388, "step": 965 }, { "epoch": 0.6623243057936236, "grad_norm": 0.6659035973174601, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227455043.147541, "logits/rejected": -242894022.68656716, "logps/chosen": -205.9016393442623, "logps/rejected": -408.8358208955224, "loss": 0.1816, "rewards/chosen": 1.2868852459016393, "rewards/margins": 9.436138977244923, "rewards/rejected": -8.149253731343284, "step": 966 }, { "epoch": 0.6630099417209462, "grad_norm": 0.5025144790905164, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220133309.93548387, "logits/rejected": -248607837.0909091, "logps/chosen": -308.9032258064516, "logps/rejected": -330.6666666666667, "loss": 0.1493, "rewards/chosen": 1.7137096774193548, "rewards/margins": 9.084921798631475, "rewards/rejected": -7.371212121212121, "step": 967 }, { "epoch": 0.6636955776482688, "grad_norm": 0.6503393555121671, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -242036013.17647058, "logits/rejected": -339738624.0, "logps/chosen": -298.3529411764706, "logps/rejected": -373.06666666666666, "loss": 0.153, "rewards/chosen": 2.25, "rewards/margins": 10.116666666666667, "rewards/rejected": -7.866666666666666, "step": 968 }, { "epoch": 0.6643812135755913, "grad_norm": 0.5682589314868159, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249917208.1509434, "logits/rejected": -222130339.84, "logps/chosen": -206.79245283018867, "logps/rejected": -321.28, "loss": 0.1318, "rewards/chosen": 1.3242924528301887, "rewards/margins": 8.964292452830188, "rewards/rejected": -7.64, "step": 969 }, { "epoch": 0.665066849502914, "grad_norm": 0.542441326430267, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -224154607.21311477, "logits/rejected": -269437080.8358209, "logps/chosen": -206.68852459016392, "logps/rejected": -400.23880597014926, "loss": 0.1461, "rewards/chosen": 1.4559426229508197, "rewards/margins": 9.493256055786642, "rewards/rejected": -8.037313432835822, "step": 970 }, { "epoch": 0.6657524854302366, "grad_norm": 0.4471714829101077, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279620266.6666667, "logits/rejected": -263901073.2972973, "logps/chosen": -260.14814814814815, "logps/rejected": -324.7567567567568, "loss": 0.1305, "rewards/chosen": 1.6793981481481481, "rewards/margins": 9.976695445445445, "rewards/rejected": -8.297297297297296, "step": 971 }, { "epoch": 0.6664381213575591, "grad_norm": 0.4618224079168187, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -374217224.6779661, "logits/rejected": -323873213.2173913, "logps/chosen": -354.8474576271187, "logps/rejected": -360.3478260869565, "loss": 0.1474, "rewards/chosen": 1.2955508474576272, "rewards/margins": 7.686855195283714, "rewards/rejected": -6.391304347826087, "step": 972 }, { "epoch": 0.6671237572848817, "grad_norm": 0.5918354969663359, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -336503018.05714285, "logits/rejected": -259757585.6551724, "logps/chosen": -250.74285714285713, "logps/rejected": -395.0344827586207, "loss": 0.1801, "rewards/chosen": 1.4160714285714286, "rewards/margins": 9.985036945812809, "rewards/rejected": -8.568965517241379, "step": 973 }, { "epoch": 0.6678093932122043, "grad_norm": 0.5273576146949067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -296640630.7246377, "logits/rejected": -236018462.37288135, "logps/chosen": -337.8550724637681, "logps/rejected": -373.4237288135593, "loss": 0.1789, "rewards/chosen": 1.5267210144927537, "rewards/margins": 10.264009150085974, "rewards/rejected": -8.73728813559322, "step": 974 }, { "epoch": 0.668495029139527, "grad_norm": 0.5841867390366701, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -298634444.8, "logits/rejected": -272962641.26984125, "logps/chosen": -261.66153846153844, "logps/rejected": -388.57142857142856, "loss": 0.1635, "rewards/chosen": 1.7673076923076922, "rewards/margins": 10.973656898656898, "rewards/rejected": -9.206349206349206, "step": 975 }, { "epoch": 0.6691806650668495, "grad_norm": 0.47474787173893784, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271016566.15384614, "logits/rejected": -213310317.7142857, "logps/chosen": -206.52307692307693, "logps/rejected": -414.4761904761905, "loss": 0.1532, "rewards/chosen": 1.3908653846153847, "rewards/margins": 7.589278083028083, "rewards/rejected": -6.198412698412699, "step": 976 }, { "epoch": 0.6698663009941721, "grad_norm": 0.5993472706782029, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290455552.0, "logits/rejected": -308019200.0, "logps/chosen": -248.25, "logps/rejected": -403.5, "loss": 0.1872, "rewards/chosen": 0.9599609375, "rewards/margins": 10.2646484375, "rewards/rejected": -9.3046875, "step": 977 }, { "epoch": 0.6705519369214947, "grad_norm": 0.6976845196531337, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261095424.0, "logits/rejected": -258736128.0, "logps/chosen": -330.0, "logps/rejected": -325.0, "loss": 0.1558, "rewards/chosen": 1.6171875, "rewards/margins": 10.0859375, "rewards/rejected": -8.46875, "step": 978 }, { "epoch": 0.6712375728488172, "grad_norm": 0.5342227157893, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287861706.1052632, "logits/rejected": -251421941.18309858, "logps/chosen": -232.98245614035088, "logps/rejected": -352.90140845070425, "loss": 0.1621, "rewards/chosen": 1.4528508771929824, "rewards/margins": 6.410597356066221, "rewards/rejected": -4.957746478873239, "step": 979 }, { "epoch": 0.6719232087761399, "grad_norm": 0.5507280353040573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -316953350.9189189, "logits/rejected": -359778076.4444444, "logps/chosen": -271.56756756756755, "logps/rejected": -340.44444444444446, "loss": 0.1652, "rewards/chosen": 1.5743243243243243, "rewards/margins": 9.102102102102101, "rewards/rejected": -7.527777777777778, "step": 980 }, { "epoch": 0.6726088447034625, "grad_norm": 0.5118427736241239, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297922684.1212121, "logits/rejected": -252740641.03225806, "logps/chosen": -262.06060606060606, "logps/rejected": -379.8709677419355, "loss": 0.1695, "rewards/chosen": 1.215435606060606, "rewards/margins": 9.570274315738025, "rewards/rejected": -8.35483870967742, "step": 981 }, { "epoch": 0.673294480630785, "grad_norm": 0.4476353451361489, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -222630993.26984128, "logits/rejected": -289600559.26153845, "logps/chosen": -266.41269841269843, "logps/rejected": -298.0923076923077, "loss": 0.1478, "rewards/chosen": 2.0634920634920637, "rewards/margins": 9.217338217338218, "rewards/rejected": -7.153846153846154, "step": 982 }, { "epoch": 0.6739801165581076, "grad_norm": 0.5380897347346584, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -295908147.2, "logits/rejected": -237327701.33333334, "logps/chosen": -250.4, "logps/rejected": -382.0, "loss": 0.1777, "rewards/chosen": 1.0703125, "rewards/margins": 8.872395833333332, "rewards/rejected": -7.802083333333333, "step": 983 }, { "epoch": 0.6746657524854303, "grad_norm": 0.5109897153014767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254803968.0, "logits/rejected": -275251200.0, "logps/chosen": -202.25, "logps/rejected": -343.5, "loss": 0.1619, "rewards/chosen": 1.078125, "rewards/margins": 8.6640625, "rewards/rejected": -7.5859375, "step": 984 }, { "epoch": 0.6753513884127528, "grad_norm": 1.0155414848519657, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -337443627.4716981, "logits/rejected": -319438192.64, "logps/chosen": -231.8490566037736, "logps/rejected": -392.1066666666667, "loss": 0.1411, "rewards/chosen": 1.5849056603773586, "rewards/margins": 9.451572327044024, "rewards/rejected": -7.866666666666666, "step": 985 }, { "epoch": 0.6760370243400754, "grad_norm": 0.6211409634644941, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -202744384.90140846, "logits/rejected": -271084490.1052632, "logps/chosen": -301.5211267605634, "logps/rejected": -370.8070175438597, "loss": 0.2025, "rewards/chosen": 1.736355633802817, "rewards/margins": 8.93811001976773, "rewards/rejected": -7.201754385964913, "step": 986 }, { "epoch": 0.676722660267398, "grad_norm": 0.4474692081219545, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274726912.0, "logits/rejected": -306446336.0, "logps/chosen": -316.0, "logps/rejected": -362.5, "loss": 0.1306, "rewards/chosen": 2.23046875, "rewards/margins": 9.49609375, "rewards/rejected": -7.265625, "step": 987 }, { "epoch": 0.6774082961947206, "grad_norm": 0.5269396172010973, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300756269.1764706, "logits/rejected": -317369002.6666667, "logps/chosen": -238.8235294117647, "logps/rejected": -339.73333333333335, "loss": 0.1703, "rewards/chosen": 1.1185661764705883, "rewards/margins": 9.001899509803922, "rewards/rejected": -7.883333333333334, "step": 988 }, { "epoch": 0.6780939321220432, "grad_norm": 0.5435602837663207, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227626011.6756757, "logits/rejected": -270610280.2962963, "logps/chosen": -230.27027027027026, "logps/rejected": -419.55555555555554, "loss": 0.2102, "rewards/chosen": 1.287162162162162, "rewards/margins": 8.861236236236236, "rewards/rejected": -7.574074074074074, "step": 989 }, { "epoch": 0.6787795680493658, "grad_norm": 0.41640313602895884, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -308090693.8181818, "logits/rejected": -313481131.8356164, "logps/chosen": -220.36363636363637, "logps/rejected": -349.8082191780822, "loss": 0.1398, "rewards/chosen": 1.4636363636363636, "rewards/margins": 8.8266500622665, "rewards/rejected": -7.363013698630137, "step": 990 }, { "epoch": 0.6794652039766884, "grad_norm": 0.4864985918354522, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238551040.0, "logits/rejected": -328466432.0, "logps/chosen": -285.5, "logps/rejected": -347.5, "loss": 0.1712, "rewards/chosen": 1.1162109375, "rewards/margins": 8.6474609375, "rewards/rejected": -7.53125, "step": 991 }, { "epoch": 0.6801508399040109, "grad_norm": 0.7050353501148509, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283055601.37142855, "logits/rejected": -274220702.8965517, "logps/chosen": -332.8, "logps/rejected": -308.9655172413793, "loss": 0.1708, "rewards/chosen": 1.5642857142857143, "rewards/margins": 7.159113300492611, "rewards/rejected": -5.594827586206897, "step": 992 }, { "epoch": 0.6808364758313336, "grad_norm": 0.4725429332923472, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -325582848.0, "logits/rejected": -283115520.0, "logps/chosen": -289.25, "logps/rejected": -365.75, "loss": 0.1397, "rewards/chosen": 1.76953125, "rewards/margins": 9.69140625, "rewards/rejected": -7.921875, "step": 993 }, { "epoch": 0.6815221117586562, "grad_norm": 0.564093774070174, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300591786.6666667, "logits/rejected": -314326076.2352941, "logps/chosen": -233.33333333333334, "logps/rejected": -373.1764705882353, "loss": 0.173, "rewards/chosen": 1.0979166666666667, "rewards/margins": 8.715563725490195, "rewards/rejected": -7.617647058823529, "step": 994 }, { "epoch": 0.6822077476859787, "grad_norm": 0.4900513304633844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249713055.53623188, "logits/rejected": -271563411.5254237, "logps/chosen": -290.0869565217391, "logps/rejected": -261.4237288135593, "loss": 0.1674, "rewards/chosen": 1.6331521739130435, "rewards/margins": 9.158575902726604, "rewards/rejected": -7.52542372881356, "step": 995 }, { "epoch": 0.6828933836133013, "grad_norm": 0.553053370661059, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239134402.70422536, "logits/rejected": -204564300.3508772, "logps/chosen": -208.67605633802816, "logps/rejected": -374.4561403508772, "loss": 0.1828, "rewards/chosen": 1.0897887323943662, "rewards/margins": 7.449437855201383, "rewards/rejected": -6.359649122807017, "step": 996 }, { "epoch": 0.6835790195406239, "grad_norm": 0.4623614889689613, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -330750829.71428573, "logits/rejected": -373744750.27692306, "logps/chosen": -291.04761904761904, "logps/rejected": -377.6, "loss": 0.1425, "rewards/chosen": 2.0357142857142856, "rewards/margins": 9.5510989010989, "rewards/rejected": -7.515384615384615, "step": 997 }, { "epoch": 0.6842646554679466, "grad_norm": 0.6917485285699263, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -197616246.15384614, "logits/rejected": -250554475.78947368, "logps/chosen": -335.38461538461536, "logps/rejected": -322.10526315789474, "loss": 0.1522, "rewards/chosen": 1.4639423076923077, "rewards/margins": 8.60867914979757, "rewards/rejected": -7.144736842105263, "step": 998 }, { "epoch": 0.6849502913952691, "grad_norm": 0.5281499512105358, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286537189.05263156, "logits/rejected": -294900923.4929578, "logps/chosen": -205.47368421052633, "logps/rejected": -313.23943661971833, "loss": 0.1623, "rewards/chosen": 0.42105263157894735, "rewards/margins": 7.618235730170497, "rewards/rejected": -7.197183098591549, "step": 999 }, { "epoch": 0.6856359273225917, "grad_norm": 0.5245039963830186, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227540992.0, "logits/rejected": -269134506.6666667, "logps/chosen": -254.57142857142858, "logps/rejected": -409.77777777777777, "loss": 0.1585, "rewards/chosen": 1.3822544642857142, "rewards/margins": 8.951698908730158, "rewards/rejected": -7.569444444444445, "step": 1000 }, { "epoch": 0.6863215632499143, "grad_norm": 0.5308509349495377, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253632030.11764705, "logits/rejected": -312895078.4, "logps/chosen": -271.29411764705884, "logps/rejected": -295.2, "loss": 0.1615, "rewards/chosen": 1.6902573529411764, "rewards/margins": 9.12359068627451, "rewards/rejected": -7.433333333333334, "step": 1001 }, { "epoch": 0.6870071991772368, "grad_norm": 0.5564612843231413, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290538772.3174603, "logits/rejected": -360322977.47692305, "logps/chosen": -244.31746031746033, "logps/rejected": -337.2307692307692, "loss": 0.1545, "rewards/chosen": 1.6617063492063493, "rewards/margins": 9.269398656898657, "rewards/rejected": -7.607692307692307, "step": 1002 }, { "epoch": 0.6876928351045595, "grad_norm": 0.5562775497576827, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -316809762.1333333, "logits/rejected": -300016097.88235295, "logps/chosen": -324.53333333333336, "logps/rejected": -328.47058823529414, "loss": 0.1636, "rewards/chosen": 1.0322916666666666, "rewards/margins": 7.848468137254902, "rewards/rejected": -6.8161764705882355, "step": 1003 }, { "epoch": 0.6883784710318821, "grad_norm": 0.6369374599526976, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254962843.15151516, "logits/rejected": -279259466.32258064, "logps/chosen": -245.8181818181818, "logps/rejected": -342.7096774193548, "loss": 0.1704, "rewards/chosen": 1.509469696969697, "rewards/margins": 8.848179374389051, "rewards/rejected": -7.338709677419355, "step": 1004 }, { "epoch": 0.6890641069592046, "grad_norm": 0.5356764374316046, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -350121245.3770492, "logits/rejected": -286214296.8358209, "logps/chosen": -216.13114754098362, "logps/rejected": -349.13432835820896, "loss": 0.1623, "rewards/chosen": 1.278688524590164, "rewards/margins": 9.256300464888671, "rewards/rejected": -7.977611940298507, "step": 1005 }, { "epoch": 0.6897497428865272, "grad_norm": 0.5079500280333367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286229472.969697, "logits/rejected": -340685724.9032258, "logps/chosen": -272.969696969697, "logps/rejected": -369.5483870967742, "loss": 0.188, "rewards/chosen": 0.9673295454545454, "rewards/margins": 22425761.99958761, "rewards/rejected": -22425761.032258064, "step": 1006 }, { "epoch": 0.6904353788138499, "grad_norm": 0.463340170257327, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221832078.2222222, "logits/rejected": -323671213.2923077, "logps/chosen": -292.3174603174603, "logps/rejected": -380.55384615384617, "loss": 0.1659, "rewards/chosen": 1.7103174603174602, "rewards/margins": 9.633394383394384, "rewards/rejected": -7.923076923076923, "step": 1007 }, { "epoch": 0.6911210147411725, "grad_norm": 0.5143548753415595, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272285964.59016395, "logits/rejected": -247651740.6567164, "logps/chosen": -231.86885245901638, "logps/rejected": -329.07462686567163, "loss": 0.1491, "rewards/chosen": 1.165983606557377, "rewards/margins": 8.942103009542452, "rewards/rejected": -7.776119402985074, "step": 1008 }, { "epoch": 0.691806650668495, "grad_norm": 0.5025386942357064, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244352033.03225806, "logits/rejected": -206156396.6060606, "logps/chosen": -304.258064516129, "logps/rejected": -335.27272727272725, "loss": 0.1319, "rewards/chosen": 1.9435483870967742, "rewards/margins": 9.307184750733137, "rewards/rejected": -7.363636363636363, "step": 1009 }, { "epoch": 0.6924922865958176, "grad_norm": 0.5047105836004098, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -305135616.0, "logits/rejected": -344457216.0, "logps/chosen": -302.25, "logps/rejected": -371.0, "loss": 0.1637, "rewards/chosen": 1.6259765625, "rewards/margins": 8.7041015625, "rewards/rejected": -7.078125, "step": 1010 }, { "epoch": 0.6931779225231403, "grad_norm": 0.5473400331711719, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -325884710.7878788, "logits/rejected": -317955303.2258065, "logps/chosen": -323.8787878787879, "logps/rejected": -400.258064516129, "loss": 0.1859, "rewards/chosen": 1.4829545454545454, "rewards/margins": 9.192631964809385, "rewards/rejected": -7.709677419354839, "step": 1011 }, { "epoch": 0.6938635584504628, "grad_norm": 0.5951320207991804, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -207866862.6440678, "logits/rejected": -214699735.1884058, "logps/chosen": -272.0, "logps/rejected": -303.768115942029, "loss": 0.1449, "rewards/chosen": 1.7754237288135593, "rewards/margins": 9.471075902726604, "rewards/rejected": -7.695652173913044, "step": 1012 }, { "epoch": 0.6945491943777854, "grad_norm": 0.43940234062245753, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264800392.53333333, "logits/rejected": -292367661.1764706, "logps/chosen": -309.06666666666666, "logps/rejected": -340.2352941176471, "loss": 0.1437, "rewards/chosen": 1.4291666666666667, "rewards/margins": 9.289460784313725, "rewards/rejected": -7.860294117647059, "step": 1013 }, { "epoch": 0.695234830305108, "grad_norm": 0.4876292015039403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239578644.48, "logits/rejected": -221427596.0754717, "logps/chosen": -273.70666666666665, "logps/rejected": -299.1698113207547, "loss": 0.1646, "rewards/chosen": 2.1033333333333335, "rewards/margins": 8.056163522012579, "rewards/rejected": -5.952830188679245, "step": 1014 }, { "epoch": 0.6959204662324305, "grad_norm": 0.5480188125741093, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -282283316.82539684, "logits/rejected": -341480873.35384613, "logps/chosen": -239.74603174603175, "logps/rejected": -294.15384615384613, "loss": 0.1849, "rewards/chosen": 1.1061507936507937, "rewards/margins": 8.529227716727718, "rewards/rejected": -7.423076923076923, "step": 1015 }, { "epoch": 0.6966061021597532, "grad_norm": 0.5765067817290292, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270188299.46268654, "logits/rejected": -288513107.93442625, "logps/chosen": -244.0597014925373, "logps/rejected": -371.40983606557376, "loss": 0.1733, "rewards/chosen": 1.2201492537313432, "rewards/margins": 9.3676902373379, "rewards/rejected": -8.147540983606557, "step": 1016 }, { "epoch": 0.6972917380870758, "grad_norm": 0.41829732932601077, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239999496.6779661, "logits/rejected": -205460108.98550725, "logps/chosen": -200.135593220339, "logps/rejected": -386.3188405797101, "loss": 0.1613, "rewards/chosen": 1.0932203389830508, "rewards/margins": 9.216408744780152, "rewards/rejected": -8.123188405797102, "step": 1017 }, { "epoch": 0.6979773740143983, "grad_norm": 0.4184094917805206, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271231658.6666667, "logits/rejected": -212305799.52941176, "logps/chosen": -250.4, "logps/rejected": -338.8235294117647, "loss": 0.1386, "rewards/chosen": 1.6395833333333334, "rewards/margins": 9.507230392156863, "rewards/rejected": -7.867647058823529, "step": 1018 }, { "epoch": 0.6986630099417209, "grad_norm": 0.5339204236324739, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -317629665.62711865, "logits/rejected": -272812121.04347825, "logps/chosen": -228.0677966101695, "logps/rejected": -349.4492753623188, "loss": 0.1463, "rewards/chosen": 1.6091101694915255, "rewards/margins": 8.993168140506018, "rewards/rejected": -7.384057971014493, "step": 1019 }, { "epoch": 0.6993486458690436, "grad_norm": 0.5052077310852429, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -229105261.1147541, "logits/rejected": -246775318.92537314, "logps/chosen": -271.4754098360656, "logps/rejected": -427.94029850746267, "loss": 0.1568, "rewards/chosen": 1.4851434426229508, "rewards/margins": 10.492606129190115, "rewards/rejected": -9.007462686567164, "step": 1020 }, { "epoch": 0.7000342817963662, "grad_norm": 0.43027256938405894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304465214.9508197, "logits/rejected": -266682612.53731343, "logps/chosen": -260.72131147540983, "logps/rejected": -348.17910447761193, "loss": 0.1587, "rewards/chosen": 1.4241803278688525, "rewards/margins": 9.319702715928553, "rewards/rejected": -7.895522388059701, "step": 1021 }, { "epoch": 0.7007199177236887, "grad_norm": 0.491203371763862, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279620266.6666667, "logits/rejected": -294880898.1694915, "logps/chosen": -217.04347826086956, "logps/rejected": -414.3728813559322, "loss": 0.184, "rewards/chosen": 1.1748188405797102, "rewards/margins": 8.708717145664457, "rewards/rejected": -7.533898305084746, "step": 1022 }, { "epoch": 0.7014055536510113, "grad_norm": 0.4917093368306968, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -291204534.85714287, "logits/rejected": -359263832.2758621, "logps/chosen": -291.8857142857143, "logps/rejected": -345.9310344827586, "loss": 0.1617, "rewards/chosen": 2.030357142857143, "rewards/margins": 9.73725369458128, "rewards/rejected": -7.706896551724138, "step": 1023 }, { "epoch": 0.7020911895783339, "grad_norm": 0.7787360116071429, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -199777925.9076923, "logits/rejected": -282815926.85714287, "logps/chosen": -216.6153846153846, "logps/rejected": -398.4761904761905, "loss": 0.18, "rewards/chosen": 1.4346153846153846, "rewards/margins": 5.910805860805861, "rewards/rejected": -4.476190476190476, "step": 1024 }, { "epoch": 0.7027768255056565, "grad_norm": 0.5176121103675216, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273642178.20689654, "logits/rejected": -245666377.14285713, "logps/chosen": -240.55172413793105, "logps/rejected": -417.8285714285714, "loss": 0.1352, "rewards/chosen": 1.4523168103448276, "rewards/margins": 10.430888238916255, "rewards/rejected": -8.978571428571428, "step": 1025 }, { "epoch": 0.7034624614329791, "grad_norm": 0.4439271417774282, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288624987.11864406, "logits/rejected": -250928795.82608697, "logps/chosen": -219.11864406779662, "logps/rejected": -349.4492753623188, "loss": 0.1321, "rewards/chosen": 1.2860169491525424, "rewards/margins": 9.720799557848196, "rewards/rejected": -8.434782608695652, "step": 1026 }, { "epoch": 0.7041480973603017, "grad_norm": 0.473080582582366, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -242624354.46153846, "logits/rejected": -289739857.26984125, "logps/chosen": -332.3076923076923, "logps/rejected": -325.58730158730157, "loss": 0.1429, "rewards/chosen": 2.0846153846153848, "rewards/margins": 9.544932844932845, "rewards/rejected": -7.4603174603174605, "step": 1027 }, { "epoch": 0.7048337332876242, "grad_norm": 0.5230246431798781, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239785133.2923077, "logits/rejected": -361908516.5714286, "logps/chosen": -231.3846153846154, "logps/rejected": -359.6190476190476, "loss": 0.1503, "rewards/chosen": 1.8173076923076923, "rewards/margins": 8.912545787545787, "rewards/rejected": -7.095238095238095, "step": 1028 }, { "epoch": 0.7055193692149468, "grad_norm": 0.5649391115882747, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283477097.9310345, "logits/rejected": -243149794.74285713, "logps/chosen": -239.44827586206895, "logps/rejected": -330.0571428571429, "loss": 0.1555, "rewards/chosen": 1.1605603448275863, "rewards/margins": 8.967703201970444, "rewards/rejected": -7.807142857142857, "step": 1029 }, { "epoch": 0.7062050051422695, "grad_norm": 0.5903126098521504, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262267361.88235295, "logits/rejected": -268155835.73333332, "logps/chosen": -293.6470588235294, "logps/rejected": -338.4, "loss": 0.1741, "rewards/chosen": 1.338235294117647, "rewards/margins": 8.942401960784315, "rewards/rejected": -7.604166666666667, "step": 1030 }, { "epoch": 0.7068906410695921, "grad_norm": 0.48884387126738404, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268710492.3278689, "logits/rejected": -290471202.3880597, "logps/chosen": -248.39344262295083, "logps/rejected": -307.1044776119403, "loss": 0.171, "rewards/chosen": 1.2653688524590163, "rewards/margins": 8.138503180817226, "rewards/rejected": -6.873134328358209, "step": 1031 }, { "epoch": 0.7075762769969146, "grad_norm": 0.6119728744605387, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228944470.64615384, "logits/rejected": -229022313.65079364, "logps/chosen": -344.61538461538464, "logps/rejected": -387.55555555555554, "loss": 0.1757, "rewards/chosen": 1.3201923076923077, "rewards/margins": 8.986858974358974, "rewards/rejected": -7.666666666666667, "step": 1032 }, { "epoch": 0.7082619129242372, "grad_norm": 0.5335275216835739, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237204023.13846153, "logits/rejected": -252723460.06349206, "logps/chosen": -313.84615384615387, "logps/rejected": -389.58730158730157, "loss": 0.1462, "rewards/chosen": 1.9403846153846154, "rewards/margins": 7.4086385836385835, "rewards/rejected": -5.468253968253968, "step": 1033 }, { "epoch": 0.7089475488515599, "grad_norm": 0.4857229532573898, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264679362.86567163, "logits/rejected": -271185819.27868855, "logps/chosen": -266.9850746268657, "logps/rejected": -360.91803278688525, "loss": 0.1554, "rewards/chosen": 1.7332089552238805, "rewards/margins": 10.53648764374847, "rewards/rejected": -8.80327868852459, "step": 1034 }, { "epoch": 0.7096331847788824, "grad_norm": 0.6464031224119647, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268656208.84210527, "logits/rejected": -292310724.9230769, "logps/chosen": -257.4736842105263, "logps/rejected": -380.3076923076923, "loss": 0.1882, "rewards/chosen": 1.837171052631579, "rewards/margins": 6.750632591093117, "rewards/rejected": -4.913461538461538, "step": 1035 }, { "epoch": 0.710318820706205, "grad_norm": 0.6313121774086184, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -359751445.94285715, "logits/rejected": -408149168.55172414, "logps/chosen": -312.0, "logps/rejected": -353.1034482758621, "loss": 0.1712, "rewards/chosen": 1.6642857142857144, "rewards/margins": 26431648.56083744, "rewards/rejected": -26431646.896551725, "step": 1036 }, { "epoch": 0.7110044566335276, "grad_norm": 0.5891834322413882, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288788144.26229507, "logits/rejected": -261424082.14925373, "logps/chosen": -251.14754098360655, "logps/rejected": -368.7164179104478, "loss": 0.1708, "rewards/chosen": 0.9800204918032787, "rewards/margins": 8.569572730609249, "rewards/rejected": -7.58955223880597, "step": 1037 }, { "epoch": 0.7116900925608501, "grad_norm": 0.5251865974474694, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227016704.0, "logits/rejected": -219414528.0, "logps/chosen": -235.75, "logps/rejected": -296.5, "loss": 0.1464, "rewards/chosen": 1.802734375, "rewards/margins": 9.240234375, "rewards/rejected": -7.4375, "step": 1038 }, { "epoch": 0.7123757284881728, "grad_norm": 0.45628881440530744, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -196996987.87096775, "logits/rejected": -280637067.6363636, "logps/chosen": -278.4516129032258, "logps/rejected": -302.06060606060606, "loss": 0.1477, "rewards/chosen": 1.814516129032258, "rewards/margins": 9.852394916911045, "rewards/rejected": -8.037878787878787, "step": 1039 }, { "epoch": 0.7130613644154954, "grad_norm": 0.5055806511196226, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287186462.11764705, "logits/rejected": -262004189.86666667, "logps/chosen": -266.8235294117647, "logps/rejected": -409.6, "loss": 0.1507, "rewards/chosen": 1.9264705882352942, "rewards/margins": 10.64313725490196, "rewards/rejected": -8.716666666666667, "step": 1040 }, { "epoch": 0.713747000342818, "grad_norm": 0.4776220318777652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244431131.56923077, "logits/rejected": -289473552.25396824, "logps/chosen": -259.2, "logps/rejected": -408.63492063492066, "loss": 0.1771, "rewards/chosen": 1.325841346153846, "rewards/margins": 9.11949213980464, "rewards/rejected": -7.7936507936507935, "step": 1041 }, { "epoch": 0.7144326362701405, "grad_norm": 0.523828509831746, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289542276.12903225, "logits/rejected": -262334650.1818182, "logps/chosen": -237.41935483870967, "logps/rejected": -339.3939393939394, "loss": 0.1723, "rewards/chosen": 1.6633064516129032, "rewards/margins": 9.473912512218964, "rewards/rejected": -7.8106060606060606, "step": 1042 }, { "epoch": 0.7151182721974632, "grad_norm": 0.5276072008048174, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -355298138.83870965, "logits/rejected": -296651682.90909094, "logps/chosen": -232.7741935483871, "logps/rejected": -417.6969696969697, "loss": 0.1609, "rewards/chosen": 1.0161290322580645, "rewards/margins": 8.773704789833822, "rewards/rejected": -7.757575757575758, "step": 1043 }, { "epoch": 0.7158039081247858, "grad_norm": 0.5269780532896691, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -307765650.8852459, "logits/rejected": -309001261.8507463, "logps/chosen": -252.85245901639345, "logps/rejected": -290.14925373134326, "loss": 0.1546, "rewards/chosen": 1.9487704918032787, "rewards/margins": 9.866680939564473, "rewards/rejected": -7.917910447761194, "step": 1044 }, { "epoch": 0.7164895440521083, "grad_norm": 0.5804251994047618, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237550126.54545453, "logits/rejected": -264647052.38709676, "logps/chosen": -254.54545454545453, "logps/rejected": -313.80645161290323, "loss": 0.1656, "rewards/chosen": 1.4621212121212122, "rewards/margins": 8.27663734115347, "rewards/rejected": -6.814516129032258, "step": 1045 }, { "epoch": 0.7171751799794309, "grad_norm": 0.43876025918207046, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288340627.5254237, "logits/rejected": -267219715.71014494, "logps/chosen": -233.76271186440678, "logps/rejected": -367.30434782608694, "loss": 0.1456, "rewards/chosen": 1.382415254237288, "rewards/margins": 8.795458732498158, "rewards/rejected": -7.413043478260869, "step": 1046 }, { "epoch": 0.7178608159067535, "grad_norm": 0.5266636217192694, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266757734.4, "logits/rejected": -263807258.4827586, "logps/chosen": -247.0857142857143, "logps/rejected": -347.86206896551727, "loss": 0.1812, "rewards/chosen": 1.4839285714285715, "rewards/margins": 30047853.621859606, "rewards/rejected": -30047852.137931034, "step": 1047 }, { "epoch": 0.7185464518340761, "grad_norm": 0.492815010493917, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -221832078.2222222, "logits/rejected": -270233014.85714287, "logps/chosen": -242.0, "logps/rejected": -369.14285714285717, "loss": 0.1895, "rewards/chosen": 1.484375, "rewards/margins": 9.198660714285715, "rewards/rejected": -7.714285714285714, "step": 1048 }, { "epoch": 0.7192320877613987, "grad_norm": 0.5727528458538513, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -298484201.07462686, "logits/rejected": -313816450.09836066, "logps/chosen": -254.56716417910448, "logps/rejected": -385.9672131147541, "loss": 0.2102, "rewards/chosen": 0.781191697761194, "rewards/margins": 9.051683501039882, "rewards/rejected": -8.270491803278688, "step": 1049 }, { "epoch": 0.7199177236887213, "grad_norm": 0.5655011709452612, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -247929969.7777778, "logits/rejected": -294246557.53846157, "logps/chosen": -254.22222222222223, "logps/rejected": -319.0153846153846, "loss": 0.1644, "rewards/chosen": 1.4930555555555556, "rewards/margins": 8.27767094017094, "rewards/rejected": -6.7846153846153845, "step": 1050 }, { "epoch": 0.7206033596160438, "grad_norm": 0.5876314132568852, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258747204.50704226, "logits/rejected": -220016999.2982456, "logps/chosen": -272.22535211267603, "logps/rejected": -352.280701754386, "loss": 0.1613, "rewards/chosen": 1.6443661971830985, "rewards/margins": 8.71454163577959, "rewards/rejected": -7.0701754385964914, "step": 1051 }, { "epoch": 0.7212889955433665, "grad_norm": 0.5025542063589508, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271682659.0967742, "logits/rejected": -268435456.0, "logps/chosen": -255.48387096774192, "logps/rejected": -345.2121212121212, "loss": 0.1419, "rewards/chosen": 1.8588709677419355, "rewards/margins": 8.752810361681329, "rewards/rejected": -6.893939393939394, "step": 1052 }, { "epoch": 0.7219746314706891, "grad_norm": 0.6245476326373661, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258111015.3846154, "logits/rejected": -227424483.55555555, "logps/chosen": -344.3692307692308, "logps/rejected": -384.76190476190476, "loss": 0.1655, "rewards/chosen": 1.976923076923077, "rewards/margins": 9.468986568986569, "rewards/rejected": -7.492063492063492, "step": 1053 }, { "epoch": 0.7226602673980117, "grad_norm": 0.48187603132883705, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267201837.17647058, "logits/rejected": -231805201.06666666, "logps/chosen": -299.52941176470586, "logps/rejected": -397.8666666666667, "loss": 0.1715, "rewards/chosen": 0.47794117647058826, "rewards/margins": 7.694607843137255, "rewards/rejected": -7.216666666666667, "step": 1054 }, { "epoch": 0.7233459033253342, "grad_norm": 0.5381089773786365, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -341602759.1111111, "logits/rejected": -302289481.14285713, "logps/chosen": -268.22222222222223, "logps/rejected": -317.14285714285717, "loss": 0.1819, "rewards/chosen": 1.3585069444444444, "rewards/margins": 8.849578373015873, "rewards/rejected": -7.491071428571429, "step": 1055 }, { "epoch": 0.7240315392526568, "grad_norm": 0.4480510670821707, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -226891873.52380952, "logits/rejected": -289600559.26153845, "logps/chosen": -290.53968253968253, "logps/rejected": -343.6307692307692, "loss": 0.1517, "rewards/chosen": 1.6041666666666667, "rewards/margins": 9.781089743589742, "rewards/rejected": -8.176923076923076, "step": 1056 }, { "epoch": 0.7247171751799795, "grad_norm": 0.4985698284142554, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218103808.0, "logits/rejected": -227540992.0, "logps/chosen": -252.75, "logps/rejected": -347.5, "loss": 0.1566, "rewards/chosen": 1.626953125, "rewards/margins": 6.978515625, "rewards/rejected": -5.3515625, "step": 1057 }, { "epoch": 0.725402811107302, "grad_norm": 0.6675315163796706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -337198411.7183099, "logits/rejected": -323770835.0877193, "logps/chosen": -269.5211267605634, "logps/rejected": -410.3859649122807, "loss": 0.1755, "rewards/chosen": 1.625, "rewards/margins": 7.756578947368421, "rewards/rejected": -6.131578947368421, "step": 1058 }, { "epoch": 0.7260884470346246, "grad_norm": 0.5585310097708684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233948956.44444445, "logits/rejected": -259747254.85714287, "logps/chosen": -224.0, "logps/rejected": -403.42857142857144, "loss": 0.193, "rewards/chosen": 1.1432291666666667, "rewards/margins": 8.902157738095237, "rewards/rejected": -7.758928571428571, "step": 1059 }, { "epoch": 0.7267740829619472, "grad_norm": 0.528901530497461, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255605820.2352941, "logits/rejected": -312335837.8666667, "logps/chosen": -253.1764705882353, "logps/rejected": -365.3333333333333, "loss": 0.1875, "rewards/chosen": 1.3363970588235294, "rewards/margins": 8.203063725490196, "rewards/rejected": -6.866666666666666, "step": 1060 }, { "epoch": 0.7274597188892697, "grad_norm": 0.4669322863229607, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220645810.42424244, "logits/rejected": -264376452.12903225, "logps/chosen": -227.15151515151516, "logps/rejected": -282.06451612903226, "loss": 0.1588, "rewards/chosen": 1.4791666666666667, "rewards/margins": 8.906586021505376, "rewards/rejected": -7.42741935483871, "step": 1061 }, { "epoch": 0.7281453548165924, "grad_norm": 0.5210816568289072, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278659072.0, "logits/rejected": -213385216.0, "logps/chosen": -279.75, "logps/rejected": -422.0, "loss": 0.17, "rewards/chosen": 1.4024658203125, "rewards/margins": 8.3477783203125, "rewards/rejected": -6.9453125, "step": 1062 }, { "epoch": 0.728830990743915, "grad_norm": 0.4501998460099523, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267536676.57142857, "logits/rejected": -283814570.6666667, "logps/chosen": -252.28571428571428, "logps/rejected": -368.44444444444446, "loss": 0.1384, "rewards/chosen": 1.1852678571428572, "rewards/margins": 8.504712301587302, "rewards/rejected": -7.319444444444445, "step": 1063 }, { "epoch": 0.7295166266712376, "grad_norm": 0.5227623696844466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244108492.8, "logits/rejected": -249684449.88235295, "logps/chosen": -304.4, "logps/rejected": -386.3529411764706, "loss": 0.1382, "rewards/chosen": 2.169791666666667, "rewards/margins": 9.478615196078431, "rewards/rejected": -7.3088235294117645, "step": 1064 }, { "epoch": 0.7302022625985601, "grad_norm": 0.6687219192749495, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -201562890.81690142, "logits/rejected": -198677557.89473686, "logps/chosen": -236.16901408450704, "logps/rejected": -318.4561403508772, "loss": 0.1901, "rewards/chosen": 1.380281690140845, "rewards/margins": 4.191685198912775, "rewards/rejected": -2.81140350877193, "step": 1065 }, { "epoch": 0.7308878985258828, "grad_norm": 0.4451271292634487, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243269632.0, "logits/rejected": -258319781.6470588, "logps/chosen": -318.1333333333333, "logps/rejected": -323.29411764705884, "loss": 0.1597, "rewards/chosen": 1.2739583333333333, "rewards/margins": 8.281311274509804, "rewards/rejected": -7.007352941176471, "step": 1066 }, { "epoch": 0.7315735344532054, "grad_norm": 0.5384817562509432, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241591910.4, "logits/rejected": -242529460.70588234, "logps/chosen": -249.06666666666666, "logps/rejected": -411.7647058823529, "loss": 0.1505, "rewards/chosen": 1.34375, "rewards/margins": 9.233455882352942, "rewards/rejected": -7.889705882352941, "step": 1067 }, { "epoch": 0.7322591703805279, "grad_norm": 0.6092718713756102, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -285512265.14285713, "logits/rejected": -296397482.6666667, "logps/chosen": -304.85714285714283, "logps/rejected": -303.55555555555554, "loss": 0.1522, "rewards/chosen": 1.765625, "rewards/margins": 9.21701388888889, "rewards/rejected": -7.451388888888889, "step": 1068 }, { "epoch": 0.7329448063078505, "grad_norm": 0.608992598430504, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -337489504.4637681, "logits/rejected": -414027567.7288136, "logps/chosen": -226.7826086956522, "logps/rejected": -407.3220338983051, "loss": 0.1884, "rewards/chosen": 0.8369565217391305, "rewards/margins": 6.692888725128961, "rewards/rejected": -5.8559322033898304, "step": 1069 }, { "epoch": 0.7336304422351732, "grad_norm": 0.5162741121223635, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259323692.13793105, "logits/rejected": -260046848.0, "logps/chosen": -291.58620689655174, "logps/rejected": -304.0, "loss": 0.1459, "rewards/chosen": 1.3448275862068966, "rewards/margins": 6.723399014778326, "rewards/rejected": -5.378571428571429, "step": 1070 }, { "epoch": 0.7343160781624957, "grad_norm": 0.6595773650895482, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -222764145.7777778, "logits/rejected": -305285412.5714286, "logps/chosen": -308.6666666666667, "logps/rejected": -316.2857142857143, "loss": 0.1542, "rewards/chosen": 1.78125, "rewards/margins": 19436836.066964287, "rewards/rejected": -19436834.285714287, "step": 1071 }, { "epoch": 0.7350017140898183, "grad_norm": 0.5766356737148768, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248629020.44444445, "logits/rejected": -283714706.28571427, "logps/chosen": -255.33333333333334, "logps/rejected": -386.2857142857143, "loss": 0.159, "rewards/chosen": 1.8107638888888888, "rewards/margins": 9.81969246031746, "rewards/rejected": -8.008928571428571, "step": 1072 }, { "epoch": 0.7356873500171409, "grad_norm": 0.4401100905599869, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -311460897.0322581, "logits/rejected": -282670669.57575756, "logps/chosen": -278.5806451612903, "logps/rejected": -411.1515151515151, "loss": 0.1475, "rewards/chosen": 1.908140120967742, "rewards/margins": 8.150564363391984, "rewards/rejected": -6.242424242424242, "step": 1073 }, { "epoch": 0.7363729859444635, "grad_norm": 0.5580226114431286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286051532.8, "logits/rejected": -288666804.7058824, "logps/chosen": -331.73333333333335, "logps/rejected": -373.1764705882353, "loss": 0.1342, "rewards/chosen": 1.6791666666666667, "rewards/margins": 9.85563725490196, "rewards/rejected": -8.176470588235293, "step": 1074 }, { "epoch": 0.7370586218717861, "grad_norm": 0.4461441489162067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243129821.86666667, "logits/rejected": -206754514.82352942, "logps/chosen": -237.86666666666667, "logps/rejected": -358.8235294117647, "loss": 0.1735, "rewards/chosen": 1.1333333333333333, "rewards/margins": 8.905392156862746, "rewards/rejected": -7.772058823529412, "step": 1075 }, { "epoch": 0.7377442577991087, "grad_norm": 0.40513233618009376, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -208424644.92307693, "logits/rejected": -215234021.0526316, "logps/chosen": -203.69230769230768, "logps/rejected": -328.42105263157896, "loss": 0.1314, "rewards/chosen": 1.3774038461538463, "rewards/margins": 9.377403846153847, "rewards/rejected": -8.0, "step": 1076 }, { "epoch": 0.7384298937264313, "grad_norm": 0.5540760097320375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -209857379.79661018, "logits/rejected": -246552130.7826087, "logps/chosen": -243.25423728813558, "logps/rejected": -370.5507246376812, "loss": 0.1528, "rewards/chosen": 1.0444915254237288, "rewards/margins": 8.472027757307787, "rewards/rejected": -7.427536231884058, "step": 1077 }, { "epoch": 0.7391155296537538, "grad_norm": 0.6710993036867503, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289271675.87096775, "logits/rejected": -292584479.030303, "logps/chosen": -278.5806451612903, "logps/rejected": -378.6666666666667, "loss": 0.1899, "rewards/chosen": 1.1149193548387097, "rewards/margins": 9.205828445747802, "rewards/rejected": -8.090909090909092, "step": 1078 }, { "epoch": 0.7398011655810764, "grad_norm": 0.67064696654592, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -215896279.57894737, "logits/rejected": -240043244.30769232, "logps/chosen": -292.42105263157896, "logps/rejected": -386.46153846153845, "loss": 0.1999, "rewards/chosen": 1.2689144736842106, "rewards/margins": 5.691991396761134, "rewards/rejected": -4.423076923076923, "step": 1079 }, { "epoch": 0.7404868015083991, "grad_norm": 0.48163201936819855, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251400128.9846154, "logits/rejected": -274294166.3492063, "logps/chosen": -227.44615384615383, "logps/rejected": -332.1904761904762, "loss": 0.1455, "rewards/chosen": 1.0846153846153845, "rewards/margins": 9.529059829059829, "rewards/rejected": -8.444444444444445, "step": 1080 }, { "epoch": 0.7411724374357216, "grad_norm": 0.5303585477313191, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246237908.67692307, "logits/rejected": -296131177.6507937, "logps/chosen": -236.30769230769232, "logps/rejected": -368.5079365079365, "loss": 0.1705, "rewards/chosen": 1.4461538461538461, "rewards/margins": 9.247741147741147, "rewards/rejected": -7.801587301587301, "step": 1081 }, { "epoch": 0.7418580733630442, "grad_norm": 0.5183658647372397, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297983388.6567164, "logits/rejected": -305290323.93442625, "logps/chosen": -282.9850746268657, "logps/rejected": -379.8032786885246, "loss": 0.1753, "rewards/chosen": 1.3236940298507462, "rewards/margins": 10.340087472473696, "rewards/rejected": -9.01639344262295, "step": 1082 }, { "epoch": 0.7425437092903668, "grad_norm": 0.5446969002340267, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249116237.57575756, "logits/rejected": -314166899.61290324, "logps/chosen": -331.1515151515151, "logps/rejected": -317.4193548387097, "loss": 0.1551, "rewards/chosen": 1.7916666666666667, "rewards/margins": 10.80779569892473, "rewards/rejected": -9.016129032258064, "step": 1083 }, { "epoch": 0.7432293452176894, "grad_norm": 0.5735574941264091, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -338948392.8115942, "logits/rejected": -296018336.5423729, "logps/chosen": -251.82608695652175, "logps/rejected": -375.3220338983051, "loss": 0.1593, "rewards/chosen": 2.088768115942029, "rewards/margins": 10.080293539670842, "rewards/rejected": -7.991525423728813, "step": 1084 }, { "epoch": 0.743914981145012, "grad_norm": 0.7157804759288966, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -351509735.2258065, "logits/rejected": -344441328.4848485, "logps/chosen": -234.58064516129033, "logps/rejected": -430.06060606060606, "loss": 0.1221, "rewards/chosen": 1.8024193548387097, "rewards/margins": 9.734237536656892, "rewards/rejected": -7.931818181818182, "step": 1085 }, { "epoch": 0.7446006170723346, "grad_norm": 0.7430718546466776, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -205054862.2222222, "logits/rejected": -221399332.57142857, "logps/chosen": -265.77777777777777, "logps/rejected": -418.2857142857143, "loss": 0.1835, "rewards/chosen": 1.5069444444444444, "rewards/margins": 9.337301587301587, "rewards/rejected": -7.830357142857143, "step": 1086 }, { "epoch": 0.7452862529996572, "grad_norm": 0.6691161988451573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262211650.06451613, "logits/rejected": -273265260.6060606, "logps/chosen": -222.19354838709677, "logps/rejected": -387.3939393939394, "loss": 0.165, "rewards/chosen": 1.1602822580645162, "rewards/margins": 9.43300953079179, "rewards/rejected": -8.272727272727273, "step": 1087 }, { "epoch": 0.7459718889269797, "grad_norm": 0.6765142713431497, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239861760.0, "logits/rejected": -264503296.0, "logps/chosen": -268.125, "logps/rejected": -331.75, "loss": 0.1552, "rewards/chosen": 1.9794921875, "rewards/margins": 9.3388671875, "rewards/rejected": -7.359375, "step": 1088 }, { "epoch": 0.7466575248543024, "grad_norm": 0.5175609972301938, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227051656.53333333, "logits/rejected": -308898153.4117647, "logps/chosen": -276.53333333333336, "logps/rejected": -304.2352941176471, "loss": 0.1381, "rewards/chosen": 1.1520833333333333, "rewards/margins": 8.277083333333334, "rewards/rejected": -7.125, "step": 1089 }, { "epoch": 0.747343160781625, "grad_norm": 0.6020636612683593, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268192307.942029, "logits/rejected": -271847771.11864406, "logps/chosen": -246.7246376811594, "logps/rejected": -308.6101694915254, "loss": 0.1751, "rewards/chosen": 1.1105072463768115, "rewards/margins": 7.788473348071727, "rewards/rejected": -6.677966101694915, "step": 1090 }, { "epoch": 0.7480287967089475, "grad_norm": 0.43429368934568796, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241916630.70967743, "logits/rejected": -270214857.6969697, "logps/chosen": -211.3548387096774, "logps/rejected": -354.42424242424244, "loss": 0.1371, "rewards/chosen": 1.719758064516129, "rewards/margins": 7.356121700879766, "rewards/rejected": -5.636363636363637, "step": 1091 }, { "epoch": 0.7487144326362701, "grad_norm": 0.4986812957293579, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250895639.27272728, "logits/rejected": -312272697.8064516, "logps/chosen": -249.6969696969697, "logps/rejected": -335.48387096774195, "loss": 0.1622, "rewards/chosen": 1.7632575757575757, "rewards/margins": 9.45680596285435, "rewards/rejected": -7.693548387096774, "step": 1092 }, { "epoch": 0.7494000685635928, "grad_norm": 0.6070234052581293, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -257839319.57894737, "logits/rejected": -309551450.14084506, "logps/chosen": -260.7719298245614, "logps/rejected": -350.6478873239437, "loss": 0.1387, "rewards/chosen": 1.5460526315789473, "rewards/margins": 18638586.334785026, "rewards/rejected": -18638584.788732395, "step": 1093 }, { "epoch": 0.7500857044909153, "grad_norm": 0.8029406012799936, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -291665447.38461536, "logits/rejected": -273495251.3015873, "logps/chosen": -251.07692307692307, "logps/rejected": -318.984126984127, "loss": 0.168, "rewards/chosen": 1.7230769230769232, "rewards/margins": 8.421489621489622, "rewards/rejected": -6.698412698412699, "step": 1094 }, { "epoch": 0.7507713404182379, "grad_norm": 0.5071007111202095, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251493757.49019608, "logits/rejected": -323560594.28571427, "logps/chosen": -238.4313725490196, "logps/rejected": -351.5844155844156, "loss": 0.1278, "rewards/chosen": 1.9479166666666667, "rewards/margins": 9.129734848484848, "rewards/rejected": -7.181818181818182, "step": 1095 }, { "epoch": 0.7514569763455605, "grad_norm": 0.4997467584957213, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270765624.8888889, "logits/rejected": -249860681.14285713, "logps/chosen": -341.3333333333333, "logps/rejected": -352.57142857142856, "loss": 0.1692, "rewards/chosen": 2.0625, "rewards/margins": 9.839285714285715, "rewards/rejected": -7.776785714285714, "step": 1096 }, { "epoch": 0.7521426122728831, "grad_norm": 0.5756426294191915, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256984340.31746033, "logits/rejected": -276049730.95384616, "logps/chosen": -267.1746031746032, "logps/rejected": -355.44615384615383, "loss": 0.1707, "rewards/chosen": 1.4523809523809523, "rewards/margins": 9.390842490842491, "rewards/rejected": -7.938461538461539, "step": 1097 }, { "epoch": 0.7528282482002057, "grad_norm": 0.5823695145977801, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258722330.9473684, "logits/rejected": -269242052.9230769, "logps/chosen": -276.63157894736844, "logps/rejected": -348.46153846153845, "loss": 0.1883, "rewards/chosen": 1.305921052631579, "rewards/margins": 9.262651821862349, "rewards/rejected": -7.956730769230769, "step": 1098 }, { "epoch": 0.7535138841275283, "grad_norm": 0.44293575986935446, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281515997.2881356, "logits/rejected": -233665283.71014494, "logps/chosen": -269.5593220338983, "logps/rejected": -350.1449275362319, "loss": 0.1257, "rewards/chosen": 1.3940677966101696, "rewards/margins": 8.93029968066814, "rewards/rejected": -7.536231884057971, "step": 1099 }, { "epoch": 0.7541995200548509, "grad_norm": 0.5587301467761772, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223790999.8644068, "logits/rejected": -243634354.08695653, "logps/chosen": -204.61016949152543, "logps/rejected": -351.07246376811594, "loss": 0.1654, "rewards/chosen": 1.3633474576271187, "rewards/margins": 6.682188037337263, "rewards/rejected": -5.318840579710145, "step": 1100 }, { "epoch": 0.7548851559821734, "grad_norm": 0.6041845449741873, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261674488.35820895, "logits/rejected": -287412962.6229508, "logps/chosen": -287.2835820895522, "logps/rejected": -366.1639344262295, "loss": 0.1701, "rewards/chosen": 1.5541044776119404, "rewards/margins": 9.55410447761194, "rewards/rejected": -8.0, "step": 1101 }, { "epoch": 0.755570791909496, "grad_norm": 0.46395043520808066, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -217677268.6101695, "logits/rejected": -261870458.4347826, "logps/chosen": -226.16949152542372, "logps/rejected": -343.42028985507244, "loss": 0.1482, "rewards/chosen": 1.7415254237288136, "rewards/margins": 8.727032670105626, "rewards/rejected": -6.9855072463768115, "step": 1102 }, { "epoch": 0.7562564278368187, "grad_norm": 0.576176292600644, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263759838.4262295, "logits/rejected": -293976889.3134328, "logps/chosen": -266.4918032786885, "logps/rejected": -408.35820895522386, "loss": 0.1683, "rewards/chosen": 1.540983606557377, "rewards/margins": 8.182774651333496, "rewards/rejected": -6.641791044776119, "step": 1103 }, { "epoch": 0.7569420637641412, "grad_norm": 0.706508406926888, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -293486367.56164384, "logits/rejected": -255318723.4909091, "logps/chosen": -250.95890410958904, "logps/rejected": -345.8909090909091, "loss": 0.1639, "rewards/chosen": 1.769691780821918, "rewards/margins": 9.833328144458282, "rewards/rejected": -8.063636363636364, "step": 1104 }, { "epoch": 0.7576276996914638, "grad_norm": 0.467049870817839, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287106873.8064516, "logits/rejected": -239710828.6060606, "logps/chosen": -217.80645161290323, "logps/rejected": -363.1515151515151, "loss": 0.1499, "rewards/chosen": 1.5252016129032258, "rewards/margins": 9.472171309872923, "rewards/rejected": -7.946969696969697, "step": 1105 }, { "epoch": 0.7583133356187864, "grad_norm": 0.5390046275646234, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -203614394.1818182, "logits/rejected": -217562607.48387095, "logps/chosen": -230.3030303030303, "logps/rejected": -384.0, "loss": 0.1477, "rewards/chosen": 1.606060606060606, "rewards/margins": 8.025415444770283, "rewards/rejected": -6.419354838709677, "step": 1106 }, { "epoch": 0.7589989715461091, "grad_norm": 0.5886165369336671, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264174575.74603173, "logits/rejected": -213199698.7076923, "logps/chosen": -323.04761904761904, "logps/rejected": -420.4307692307692, "loss": 0.1697, "rewards/chosen": 1.7936507936507937, "rewards/margins": 8.755189255189256, "rewards/rejected": -6.961538461538462, "step": 1107 }, { "epoch": 0.7596846074734316, "grad_norm": 0.7179413780441871, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251149839.5151515, "logits/rejected": -320390705.5483871, "logps/chosen": -319.27272727272725, "logps/rejected": -361.2903225806452, "loss": 0.1438, "rewards/chosen": 2.2140151515151514, "rewards/margins": 8.980144183773216, "rewards/rejected": -6.766129032258065, "step": 1108 }, { "epoch": 0.7603702434007542, "grad_norm": 0.6858681206086074, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267278406.62068966, "logits/rejected": -307262727.3142857, "logps/chosen": -235.58620689655172, "logps/rejected": -351.3142857142857, "loss": 0.1439, "rewards/chosen": 1.7780172413793103, "rewards/margins": 9.356588669950739, "rewards/rejected": -7.578571428571428, "step": 1109 }, { "epoch": 0.7610558793280768, "grad_norm": 0.6015254343283701, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -334732519.2258065, "logits/rejected": -345712329.6969697, "logps/chosen": -262.3225806451613, "logps/rejected": -325.8181818181818, "loss": 0.138, "rewards/chosen": 1.6955645161290323, "rewards/margins": 9.801625122189638, "rewards/rejected": -8.106060606060606, "step": 1110 }, { "epoch": 0.7617415152553993, "grad_norm": 0.7225257246706484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -225012073.4117647, "logits/rejected": -244108492.8, "logps/chosen": -277.1764705882353, "logps/rejected": -392.0, "loss": 0.1785, "rewards/chosen": 1.6709558823529411, "rewards/margins": 10.004289215686274, "rewards/rejected": -8.333333333333334, "step": 1111 }, { "epoch": 0.762427151182722, "grad_norm": 0.646100877745246, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239456628.36363637, "logits/rejected": -256757197.80392158, "logps/chosen": -207.37662337662337, "logps/rejected": -348.54901960784315, "loss": 0.2, "rewards/chosen": 1.3847402597402598, "rewards/margins": 8.301406926406926, "rewards/rejected": -6.916666666666667, "step": 1112 }, { "epoch": 0.7631127871100446, "grad_norm": 0.5982830675356666, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -282269300.7719298, "logits/rejected": -267490260.73239437, "logps/chosen": -243.64912280701753, "logps/rejected": -380.3943661971831, "loss": 0.1487, "rewards/chosen": 1.6968201754385965, "rewards/margins": 9.267242710649864, "rewards/rejected": -7.570422535211268, "step": 1113 }, { "epoch": 0.7637984230373671, "grad_norm": 0.5816315391106938, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261095424.0, "logits/rejected": -246415360.0, "logps/chosen": -232.25, "logps/rejected": -341.0, "loss": 0.1692, "rewards/chosen": 1.263671875, "rewards/margins": 8.888671875, "rewards/rejected": -7.625, "step": 1114 }, { "epoch": 0.7644840589646897, "grad_norm": 0.5815316511517828, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -317750303.030303, "logits/rejected": -301989888.0, "logps/chosen": -282.6666666666667, "logps/rejected": -411.8709677419355, "loss": 0.149, "rewards/chosen": 1.5075757575757576, "rewards/margins": 10.144672531769306, "rewards/rejected": -8.637096774193548, "step": 1115 }, { "epoch": 0.7651696948920124, "grad_norm": 0.5901947048679785, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -295279001.6, "logits/rejected": -289207247.2380952, "logps/chosen": -258.7076923076923, "logps/rejected": -371.8095238095238, "loss": 0.1758, "rewards/chosen": 1.2341346153846153, "rewards/margins": 8.853182234432234, "rewards/rejected": -7.619047619047619, "step": 1116 }, { "epoch": 0.7658553308193349, "grad_norm": 0.5013298156449832, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292088580.1967213, "logits/rejected": -311755730.1492537, "logps/chosen": -236.327868852459, "logps/rejected": -366.32835820895525, "loss": 0.1494, "rewards/chosen": 1.6721311475409837, "rewards/margins": 9.388549057988746, "rewards/rejected": -7.7164179104477615, "step": 1117 }, { "epoch": 0.7665409667466575, "grad_norm": 0.5021640625924056, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299514561.0491803, "logits/rejected": -284962265.7910448, "logps/chosen": -254.68852459016392, "logps/rejected": -355.82089552238807, "loss": 0.1386, "rewards/chosen": 1.3811475409836065, "rewards/margins": 8.61995351113286, "rewards/rejected": -7.2388059701492535, "step": 1118 }, { "epoch": 0.7672266026739801, "grad_norm": 0.5595185125461812, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -340374124.6060606, "logits/rejected": -282777269.67741936, "logps/chosen": -341.09090909090907, "logps/rejected": -366.4516129032258, "loss": 0.1613, "rewards/chosen": 2.308712121212121, "rewards/margins": 16963545.01838954, "rewards/rejected": -16963542.70967742, "step": 1119 }, { "epoch": 0.7679122386013028, "grad_norm": 0.5342333861589149, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239556641.5737705, "logits/rejected": -256916770.3880597, "logps/chosen": -295.08196721311475, "logps/rejected": -349.13432835820896, "loss": 0.1663, "rewards/chosen": 1.6290983606557377, "rewards/margins": 9.144023733790066, "rewards/rejected": -7.514925373134329, "step": 1120 }, { "epoch": 0.7685978745286253, "grad_norm": 0.6197508668669871, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -210466418.62686568, "logits/rejected": -212328045.1147541, "logps/chosen": -216.83582089552237, "logps/rejected": -353.3114754098361, "loss": 0.1822, "rewards/chosen": 1.310634328358209, "rewards/margins": 9.335224492292635, "rewards/rejected": -8.024590163934427, "step": 1121 }, { "epoch": 0.7692835104559479, "grad_norm": 0.47087115718492306, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -179272670.96774194, "logits/rejected": -244286432.96969697, "logps/chosen": -320.51612903225805, "logps/rejected": -333.09090909090907, "loss": 0.1539, "rewards/chosen": 1.7913306451612903, "rewards/margins": 9.609512463343108, "rewards/rejected": -7.818181818181818, "step": 1122 }, { "epoch": 0.7699691463832705, "grad_norm": 0.6189443874569921, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271979938.25352114, "logits/rejected": -290510740.2105263, "logps/chosen": -289.5774647887324, "logps/rejected": -405.05263157894734, "loss": 0.1761, "rewards/chosen": 1.630281690140845, "rewards/margins": 17642176.507474672, "rewards/rejected": -17642174.87719298, "step": 1123 }, { "epoch": 0.770654782310593, "grad_norm": 0.5376485527463222, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304197416.42105263, "logits/rejected": -289084337.2307692, "logps/chosen": -230.31578947368422, "logps/rejected": -378.46153846153845, "loss": 0.179, "rewards/chosen": 1.4695723684210527, "rewards/margins": 10.306110829959515, "rewards/rejected": -8.836538461538462, "step": 1124 }, { "epoch": 0.7713404182379157, "grad_norm": 0.48867549017671125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241272344.3809524, "logits/rejected": -250625795.93846154, "logps/chosen": -311.87301587301585, "logps/rejected": -371.6923076923077, "loss": 0.1609, "rewards/chosen": 1.8422619047619047, "rewards/margins": 10.334569597069597, "rewards/rejected": -8.492307692307692, "step": 1125 }, { "epoch": 0.7720260541652383, "grad_norm": 0.551008745696915, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253585352.64864865, "logits/rejected": -257095300.74074075, "logps/chosen": -258.5945945945946, "logps/rejected": -358.51851851851853, "loss": 0.1907, "rewards/chosen": 1.2432432432432432, "rewards/margins": 8.743243243243244, "rewards/rejected": -7.5, "step": 1126 }, { "epoch": 0.7727116900925608, "grad_norm": 0.5472210705793066, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283433270.3030303, "logits/rejected": -258152646.19354838, "logps/chosen": -238.78787878787878, "logps/rejected": -354.06451612903226, "loss": 0.1659, "rewards/chosen": 1.581439393939394, "rewards/margins": 8.50079423264907, "rewards/rejected": -6.919354838709677, "step": 1127 }, { "epoch": 0.7733973260198834, "grad_norm": 0.6880383892384287, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272416490.30508476, "logits/rejected": -291777669.5652174, "logps/chosen": -332.20338983050846, "logps/rejected": -369.6231884057971, "loss": 0.1371, "rewards/chosen": 1.6970338983050848, "rewards/margins": 10.581091869319577, "rewards/rejected": -8.884057971014492, "step": 1128 }, { "epoch": 0.774082961947206, "grad_norm": 0.5772340761087018, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287019449.1076923, "logits/rejected": -350191095.8730159, "logps/chosen": -268.3076923076923, "logps/rejected": -335.74603174603175, "loss": 0.1553, "rewards/chosen": 1.6096153846153847, "rewards/margins": 9.442948717948717, "rewards/rejected": -7.833333333333333, "step": 1129 }, { "epoch": 0.7747685978745287, "grad_norm": 0.5272220168293621, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235580074.66666666, "logits/rejected": -257650102.85714287, "logps/chosen": -190.88888888888889, "logps/rejected": -356.85714285714283, "loss": 0.1649, "rewards/chosen": 1.5086805555555556, "rewards/margins": 9.803323412698413, "rewards/rejected": -8.294642857142858, "step": 1130 }, { "epoch": 0.7754542338018512, "grad_norm": 0.5586330463165874, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243151482.5915493, "logits/rejected": -311703013.05263156, "logps/chosen": -236.8450704225352, "logps/rejected": -355.0877192982456, "loss": 0.1781, "rewards/chosen": 1.358274647887324, "rewards/margins": 7.551257104027675, "rewards/rejected": -6.192982456140351, "step": 1131 }, { "epoch": 0.7761398697291738, "grad_norm": 0.43483408105445676, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -317241902.54545456, "logits/rejected": -313021482.08219177, "logps/chosen": -304.43636363636364, "logps/rejected": -407.2328767123288, "loss": 0.1181, "rewards/chosen": 1.6318181818181818, "rewards/margins": 9.152366127023662, "rewards/rejected": -7.52054794520548, "step": 1132 }, { "epoch": 0.7768255056564964, "grad_norm": 0.529458265965381, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273838288.2711864, "logits/rejected": -290805077.3333333, "logps/chosen": -320.8135593220339, "logps/rejected": -353.8550724637681, "loss": 0.1537, "rewards/chosen": 1.5911016949152543, "rewards/margins": 8.243275607958733, "rewards/rejected": -6.6521739130434785, "step": 1133 }, { "epoch": 0.777511141583819, "grad_norm": 0.5366420946784893, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300312166.4, "logits/rejected": -267201837.17647058, "logps/chosen": -250.13333333333333, "logps/rejected": -360.47058823529414, "loss": 0.1472, "rewards/chosen": 1.7833333333333334, "rewards/margins": 9.349509803921569, "rewards/rejected": -7.5661764705882355, "step": 1134 }, { "epoch": 0.7781967775111416, "grad_norm": 0.7155791405309183, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228802322.55072463, "logits/rejected": -264454421.69491526, "logps/chosen": -209.2753623188406, "logps/rejected": -360.6779661016949, "loss": 0.1755, "rewards/chosen": 1.3079710144927537, "rewards/margins": 8.858818472119872, "rewards/rejected": -7.5508474576271185, "step": 1135 }, { "epoch": 0.7788824134384642, "grad_norm": 0.4491394598633574, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279202922.98507464, "logits/rejected": -252483348.98360655, "logps/chosen": -293.4925373134328, "logps/rejected": -388.0655737704918, "loss": 0.1692, "rewards/chosen": 1.9962686567164178, "rewards/margins": 8.561842427208221, "rewards/rejected": -6.565573770491803, "step": 1136 }, { "epoch": 0.7795680493657867, "grad_norm": 0.5009451489830038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256820460.30769232, "logits/rejected": -239141904.25396827, "logps/chosen": -240.24615384615385, "logps/rejected": -430.22222222222223, "loss": 0.1648, "rewards/chosen": 1.2615384615384615, "rewards/margins": 10.102808302808302, "rewards/rejected": -8.841269841269842, "step": 1137 }, { "epoch": 0.7802536852931093, "grad_norm": 0.7632241806103421, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238207540.96551725, "logits/rejected": -223855996.34285715, "logps/chosen": -296.55172413793105, "logps/rejected": -341.9428571428571, "loss": 0.137, "rewards/chosen": 1.3254310344827587, "rewards/margins": 9.325431034482758, "rewards/rejected": -8.0, "step": 1138 }, { "epoch": 0.780939321220432, "grad_norm": 0.6218060389273706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -198572123.70149255, "logits/rejected": -260734438.81967214, "logps/chosen": -233.7910447761194, "logps/rejected": -349.37704918032784, "loss": 0.1647, "rewards/chosen": 1.132462686567164, "rewards/margins": 7.009511866895033, "rewards/rejected": -5.877049180327869, "step": 1139 }, { "epoch": 0.7816249571477546, "grad_norm": 0.5547774216575301, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -224967214.54545453, "logits/rejected": -263294051.0967742, "logps/chosen": -358.7878787878788, "logps/rejected": -408.7741935483871, "loss": 0.1512, "rewards/chosen": 2.140151515151515, "rewards/margins": 10.019183773216032, "rewards/rejected": -7.879032258064516, "step": 1140 }, { "epoch": 0.7823105930750771, "grad_norm": 0.5732461877415909, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264973490.7936508, "logits/rejected": -357741867.3230769, "logps/chosen": -287.4920634920635, "logps/rejected": -368.73846153846154, "loss": 0.1554, "rewards/chosen": 1.7936507936507937, "rewards/margins": 9.032112332112332, "rewards/rejected": -7.2384615384615385, "step": 1141 }, { "epoch": 0.7829962290023997, "grad_norm": 0.6204085729997026, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -235913468.06153846, "logits/rejected": -275625691.4285714, "logps/chosen": -314.0923076923077, "logps/rejected": -339.8095238095238, "loss": 0.1626, "rewards/chosen": 1.4509615384615384, "rewards/margins": 9.363659951159951, "rewards/rejected": -7.912698412698413, "step": 1142 }, { "epoch": 0.7836818649297224, "grad_norm": 0.5651218423869293, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253231104.0, "logits/rejected": -320602112.0, "logps/chosen": -269.5, "logps/rejected": -361.0, "loss": 0.1529, "rewards/chosen": 1.8515625, "rewards/margins": 9.0078125, "rewards/rejected": -7.15625, "step": 1143 }, { "epoch": 0.7843675008570449, "grad_norm": 0.5661063266231575, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -261280466.82352942, "logits/rejected": -234601403.73333332, "logps/chosen": -225.64705882352942, "logps/rejected": -301.8666666666667, "loss": 0.1667, "rewards/chosen": 1.5772058823529411, "rewards/margins": 25970363.177205883, "rewards/rejected": -25970361.6, "step": 1144 }, { "epoch": 0.7850531367843675, "grad_norm": 0.7521816955080275, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -315462500.8484849, "logits/rejected": -292789479.2258065, "logps/chosen": -252.12121212121212, "logps/rejected": -376.7741935483871, "loss": 0.2, "rewards/chosen": 1.4015151515151516, "rewards/margins": 7.046676441837732, "rewards/rejected": -5.645161290322581, "step": 1145 }, { "epoch": 0.7857387727116901, "grad_norm": 0.5696497456125905, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241279114.84745762, "logits/rejected": -345756538.4347826, "logps/chosen": -279.3220338983051, "logps/rejected": -386.7826086956522, "loss": 0.1262, "rewards/chosen": 2.1970338983050848, "rewards/margins": 10.276744043232622, "rewards/rejected": -8.079710144927537, "step": 1146 }, { "epoch": 0.7864244086390126, "grad_norm": 0.54049263281126, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256114688.0, "logits/rejected": -315883520.0, "logps/chosen": -307.25, "logps/rejected": -317.0, "loss": 0.1222, "rewards/chosen": 1.216796875, "rewards/margins": 8.435546875, "rewards/rejected": -7.21875, "step": 1147 }, { "epoch": 0.7871100445663353, "grad_norm": 0.5435918064069066, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -377862969.3134328, "logits/rejected": -205727173.24590164, "logps/chosen": -181.73134328358208, "logps/rejected": -400.78688524590166, "loss": 0.1962, "rewards/chosen": 1.2742537313432836, "rewards/margins": 6.7988438952777095, "rewards/rejected": -5.524590163934426, "step": 1148 }, { "epoch": 0.7877956804936579, "grad_norm": 0.48070317646818184, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279018007.63076925, "logits/rejected": -261511525.58730158, "logps/chosen": -239.26153846153846, "logps/rejected": -307.3015873015873, "loss": 0.139, "rewards/chosen": 1.978846153846154, "rewards/margins": 5.843925518925519, "rewards/rejected": -3.865079365079365, "step": 1149 }, { "epoch": 0.7884813164209804, "grad_norm": 0.45851206577925957, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -306783378.28571427, "logits/rejected": -270066574.2222222, "logps/chosen": -327.42857142857144, "logps/rejected": -388.8888888888889, "loss": 0.1473, "rewards/chosen": 1.4196428571428572, "rewards/margins": 8.891865079365079, "rewards/rejected": -7.472222222222222, "step": 1150 }, { "epoch": 0.789166952348303, "grad_norm": 0.6205240766588683, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290805077.3333333, "logits/rejected": -250575838.96774194, "logps/chosen": -261.09090909090907, "logps/rejected": -346.83870967741933, "loss": 0.177, "rewards/chosen": 1.875, "rewards/margins": 7.318548387096774, "rewards/rejected": -5.443548387096774, "step": 1151 }, { "epoch": 0.7898525882756257, "grad_norm": 0.5056436238080169, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246625075.2, "logits/rejected": -270902693.64705884, "logps/chosen": -286.6666666666667, "logps/rejected": -335.7647058823529, "loss": 0.1537, "rewards/chosen": 1.3989583333333333, "rewards/margins": 9.22248774509804, "rewards/rejected": -7.823529411764706, "step": 1152 }, { "epoch": 0.7905382242029483, "grad_norm": 0.5524959590785115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243914909.53846154, "logits/rejected": -248462579.80952382, "logps/chosen": -302.7692307692308, "logps/rejected": -341.8412698412698, "loss": 0.1572, "rewards/chosen": 1.9048076923076922, "rewards/margins": 9.325442612942613, "rewards/rejected": -7.420634920634921, "step": 1153 }, { "epoch": 0.7912238601302708, "grad_norm": 0.5716145801332713, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -319335823.1864407, "logits/rejected": -313660994.7826087, "logps/chosen": -273.35593220338984, "logps/rejected": -403.0144927536232, "loss": 0.1417, "rewards/chosen": 2.3940677966101696, "rewards/margins": 10.423053303856546, "rewards/rejected": -8.028985507246377, "step": 1154 }, { "epoch": 0.7919094960575934, "grad_norm": 0.5454942621119332, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289293616.4324324, "logits/rejected": -324048820.1481481, "logps/chosen": -285.4054054054054, "logps/rejected": -342.22222222222223, "loss": 0.1523, "rewards/chosen": 1.6942567567567568, "rewards/margins": 7.342404904904905, "rewards/rejected": -5.648148148148148, "step": 1155 }, { "epoch": 0.792595131984916, "grad_norm": 0.7791241079211916, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -312332007.4520548, "logits/rejected": -244337273.01818183, "logps/chosen": -309.47945205479454, "logps/rejected": -449.74545454545455, "loss": 0.1739, "rewards/chosen": 2.0924657534246576, "rewards/margins": 8.956102117061022, "rewards/rejected": -6.863636363636363, "step": 1156 }, { "epoch": 0.7932807679122386, "grad_norm": 0.6070823917360897, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260526197.02857143, "logits/rejected": -306907347.86206895, "logps/chosen": -318.62857142857143, "logps/rejected": -335.44827586206895, "loss": 0.1616, "rewards/chosen": 1.9910714285714286, "rewards/margins": 8.827278325123153, "rewards/rejected": -6.836206896551724, "step": 1157 }, { "epoch": 0.7939664038395612, "grad_norm": 0.6036345286657272, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276966243.7966102, "logits/rejected": -290561929.2753623, "logps/chosen": -247.864406779661, "logps/rejected": -430.84057971014494, "loss": 0.1624, "rewards/chosen": 1.902542372881356, "rewards/margins": 8.902542372881356, "rewards/rejected": -7.0, "step": 1158 }, { "epoch": 0.7946520397668838, "grad_norm": 0.5281367221176447, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278088100.82191783, "logits/rejected": -269503097.0181818, "logps/chosen": -259.06849315068496, "logps/rejected": -333.96363636363634, "loss": 0.1624, "rewards/chosen": 2.0256849315068495, "rewards/margins": 10.180230386052305, "rewards/rejected": -8.154545454545454, "step": 1159 }, { "epoch": 0.7953376756942063, "grad_norm": 0.49661924875864094, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269260564.9836066, "logits/rejected": -302240294.2089552, "logps/chosen": -340.1967213114754, "logps/rejected": -361.07462686567163, "loss": 0.1342, "rewards/chosen": 2.4651639344262297, "rewards/margins": 10.114417665769514, "rewards/rejected": -7.649253731343284, "step": 1160 }, { "epoch": 0.796023311621529, "grad_norm": 0.6401044397417874, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288513107.93442625, "logits/rejected": -247150928.23880598, "logps/chosen": -277.5081967213115, "logps/rejected": -335.76119402985074, "loss": 0.1505, "rewards/chosen": 1.6690573770491803, "rewards/margins": 9.93771409346709, "rewards/rejected": -8.26865671641791, "step": 1161 }, { "epoch": 0.7967089475488516, "grad_norm": 0.6642170715637004, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -379922762.32258064, "logits/rejected": -282924869.8181818, "logps/chosen": -260.9032258064516, "logps/rejected": -387.3939393939394, "loss": 0.1924, "rewards/chosen": 0.7852822580645161, "rewards/margins": 9.641342864125122, "rewards/rejected": -8.856060606060606, "step": 1162 }, { "epoch": 0.7973945834761742, "grad_norm": 0.5947772224093129, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -208838778.26865673, "logits/rejected": -219891544.13114753, "logps/chosen": -211.82089552238807, "logps/rejected": -294.8196721311475, "loss": 0.1703, "rewards/chosen": 1.5690298507462686, "rewards/margins": 9.757554440910203, "rewards/rejected": -8.188524590163935, "step": 1163 }, { "epoch": 0.7980802194034967, "grad_norm": 0.5459177281159265, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233948956.44444445, "logits/rejected": -371195904.0, "logps/chosen": -279.55555555555554, "logps/rejected": -431.42857142857144, "loss": 0.1926, "rewards/chosen": 1.390625, "rewards/margins": 10.167410714285714, "rewards/rejected": -8.776785714285714, "step": 1164 }, { "epoch": 0.7987658553308193, "grad_norm": 0.6027184636175347, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -293147841.7297297, "logits/rejected": -267503388.44444445, "logps/chosen": -236.32432432432432, "logps/rejected": -290.962962962963, "loss": 0.1858, "rewards/chosen": 0.7795608108108109, "rewards/margins": 6.094375625625625, "rewards/rejected": -5.314814814814815, "step": 1165 }, { "epoch": 0.799451491258142, "grad_norm": 0.5616400591922442, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238813184.0, "logits/rejected": -328990720.0, "logps/chosen": -268.5, "logps/rejected": -295.0, "loss": 0.1501, "rewards/chosen": 1.857421875, "rewards/margins": 8.271484375, "rewards/rejected": -6.4140625, "step": 1166 }, { "epoch": 0.8001371271854645, "grad_norm": 0.5269068320284926, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246831917.58904108, "logits/rejected": -240676789.52727273, "logps/chosen": -254.24657534246575, "logps/rejected": -399.1272727272727, "loss": 0.1796, "rewards/chosen": 1.7037671232876712, "rewards/margins": 6.476494396014944, "rewards/rejected": -4.7727272727272725, "step": 1167 }, { "epoch": 0.8008227631127871, "grad_norm": 0.553405730870075, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283639808.0, "logits/rejected": -258736128.0, "logps/chosen": -228.0, "logps/rejected": -371.25, "loss": 0.1667, "rewards/chosen": 1.509765625, "rewards/margins": 9.400390625, "rewards/rejected": -7.890625, "step": 1168 }, { "epoch": 0.8015083990401097, "grad_norm": 0.6021119189571603, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274726912.0, "logits/rejected": -295436288.0, "logps/chosen": -289.5, "logps/rejected": -388.5, "loss": 0.1709, "rewards/chosen": 1.7275390625, "rewards/margins": 9.1728515625, "rewards/rejected": -7.4453125, "step": 1169 }, { "epoch": 0.8021940349674322, "grad_norm": 0.5437039451258726, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266708389.6470588, "logits/rejected": -303667609.6, "logps/chosen": -266.5882352941176, "logps/rejected": -318.1333333333333, "loss": 0.166, "rewards/chosen": 1.463235294117647, "rewards/margins": 9.238235294117647, "rewards/rejected": -7.775, "step": 1170 }, { "epoch": 0.8028796708947549, "grad_norm": 0.6180969750842724, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262676113.19402984, "logits/rejected": -309965941.5081967, "logps/chosen": -272.23880597014926, "logps/rejected": -337.3114754098361, "loss": 0.1539, "rewards/chosen": 1.7630597014925373, "rewards/margins": 9.91879740641057, "rewards/rejected": -8.155737704918034, "step": 1171 }, { "epoch": 0.8035653068220775, "grad_norm": 0.5203984002409039, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -293855480.24242425, "logits/rejected": -331485316.12903225, "logps/chosen": -314.6666666666667, "logps/rejected": -386.5806451612903, "loss": 0.1576, "rewards/chosen": 1.7348484848484849, "rewards/margins": 8.452590420332356, "rewards/rejected": -6.717741935483871, "step": 1172 }, { "epoch": 0.8042509427494001, "grad_norm": 0.5473635021941888, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243935394.53968254, "logits/rejected": -209328033.47692308, "logps/chosen": -278.85714285714283, "logps/rejected": -319.26153846153846, "loss": 0.1375, "rewards/chosen": 1.9538690476190477, "rewards/margins": 9.830792124542125, "rewards/rejected": -7.876923076923077, "step": 1173 }, { "epoch": 0.8049365786767226, "grad_norm": 0.5122183767827211, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245786214.4, "logits/rejected": -245872993.10344827, "logps/chosen": -301.7142857142857, "logps/rejected": -371.58620689655174, "loss": 0.1465, "rewards/chosen": 2.0035714285714286, "rewards/margins": 10.503571428571428, "rewards/rejected": -8.5, "step": 1174 }, { "epoch": 0.8056222146040453, "grad_norm": 0.5090365747542217, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -202875258.0923077, "logits/rejected": -249261494.85714287, "logps/chosen": -249.72307692307692, "logps/rejected": -291.8095238095238, "loss": 0.1706, "rewards/chosen": 1.5701923076923077, "rewards/margins": 9.0622557997558, "rewards/rejected": -7.492063492063492, "step": 1175 }, { "epoch": 0.8063078505313679, "grad_norm": 0.6853152715984877, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -307171087.0588235, "logits/rejected": -326876091.73333335, "logps/chosen": -265.1764705882353, "logps/rejected": -341.06666666666666, "loss": 0.1787, "rewards/chosen": 1.3988970588235294, "rewards/margins": 9.148897058823529, "rewards/rejected": -7.75, "step": 1176 }, { "epoch": 0.8069934864586904, "grad_norm": 0.5204638937639411, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -225968128.0, "logits/rejected": -318242816.0, "logps/chosen": -269.5, "logps/rejected": -361.5, "loss": 0.1501, "rewards/chosen": 1.833984375, "rewards/margins": 9.255859375, "rewards/rejected": -7.421875, "step": 1177 }, { "epoch": 0.807679122386013, "grad_norm": 0.5044132078731762, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -324276040.5970149, "logits/rejected": -264034874.75409836, "logps/chosen": -245.01492537313433, "logps/rejected": -342.55737704918033, "loss": 0.1766, "rewards/chosen": 1.5242537313432836, "rewards/margins": 25132029.327532418, "rewards/rejected": -25132027.80327869, "step": 1178 }, { "epoch": 0.8083647583133357, "grad_norm": 0.6434642340357448, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -291165877.67741936, "logits/rejected": -297160083.3939394, "logps/chosen": -362.3225806451613, "logps/rejected": -422.3030303030303, "loss": 0.1299, "rewards/chosen": 2.0443548387096775, "rewards/margins": 9.923142717497557, "rewards/rejected": -7.878787878787879, "step": 1179 }, { "epoch": 0.8090503942406582, "grad_norm": 0.5700155687419466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -305040290.90909094, "logits/rejected": -315790501.16129035, "logps/chosen": -241.45454545454547, "logps/rejected": -395.61290322580646, "loss": 0.174, "rewards/chosen": 1.2556818181818181, "rewards/margins": 8.779875366568914, "rewards/rejected": -7.524193548387097, "step": 1180 }, { "epoch": 0.8097360301679808, "grad_norm": 0.5253602688596833, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259235047.22580644, "logits/rejected": -265639253.33333334, "logps/chosen": -205.67741935483872, "logps/rejected": -342.3030303030303, "loss": 0.1675, "rewards/chosen": 0.6975806451612904, "rewards/margins": 8.16727761485826, "rewards/rejected": -7.46969696969697, "step": 1181 }, { "epoch": 0.8104216660953034, "grad_norm": 0.49750201344183104, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -330285789.6119403, "logits/rejected": -265960129.04918033, "logps/chosen": -224.47761194029852, "logps/rejected": -430.1639344262295, "loss": 0.1727, "rewards/chosen": 1.6492537313432836, "rewards/margins": 9.526302911671152, "rewards/rejected": -7.877049180327869, "step": 1182 }, { "epoch": 0.8111073020226259, "grad_norm": 0.4871989387422679, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267403011.93846154, "logits/rejected": -336609540.06349206, "logps/chosen": -216.86153846153846, "logps/rejected": -349.46031746031747, "loss": 0.1569, "rewards/chosen": 1.9115384615384616, "rewards/margins": 9.863919413919414, "rewards/rejected": -7.9523809523809526, "step": 1183 }, { "epoch": 0.8117929379499486, "grad_norm": 0.5025282643248458, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -237555652.63768116, "logits/rejected": -269004175.1864407, "logps/chosen": -229.56521739130434, "logps/rejected": -316.7457627118644, "loss": 0.158, "rewards/chosen": 1.7028985507246377, "rewards/margins": 9.012220584622943, "rewards/rejected": -7.309322033898305, "step": 1184 }, { "epoch": 0.8124785738772712, "grad_norm": 0.5542947279853435, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253594072.6153846, "logits/rejected": -271746748.6315789, "logps/chosen": -273.2307692307692, "logps/rejected": -386.94736842105266, "loss": 0.146, "rewards/chosen": 0.6268028846153846, "rewards/margins": 8.146539726720647, "rewards/rejected": -7.519736842105263, "step": 1185 }, { "epoch": 0.8131642098045938, "grad_norm": 0.5111343348986431, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269553937.06666666, "logits/rejected": -237348261.6470588, "logps/chosen": -291.46666666666664, "logps/rejected": -368.2352941176471, "loss": 0.1311, "rewards/chosen": 1.4114583333333333, "rewards/margins": 10.462928921568627, "rewards/rejected": -9.051470588235293, "step": 1186 }, { "epoch": 0.8138498457319163, "grad_norm": 0.5552453695471047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272485128.82758623, "logits/rejected": -312535566.62857145, "logps/chosen": -228.68965517241378, "logps/rejected": -357.9428571428571, "loss": 0.1448, "rewards/chosen": 1.1088362068965518, "rewards/margins": 8.701693349753695, "rewards/rejected": -7.5928571428571425, "step": 1187 }, { "epoch": 0.814535481659239, "grad_norm": 0.6083110645318978, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263176925.6119403, "logits/rejected": -311066086.8196721, "logps/chosen": -339.1044776119403, "logps/rejected": -377.1803278688525, "loss": 0.1702, "rewards/chosen": 1.669776119402985, "rewards/margins": 7.096005627599706, "rewards/rejected": -5.426229508196721, "step": 1188 }, { "epoch": 0.8152211175865616, "grad_norm": 0.5362185123030038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267949159.88405797, "logits/rejected": -310805035.3898305, "logps/chosen": -247.18840579710144, "logps/rejected": -381.5593220338983, "loss": 0.1633, "rewards/chosen": 2.0652173913043477, "rewards/margins": 7.560980103168754, "rewards/rejected": -5.495762711864407, "step": 1189 }, { "epoch": 0.8159067535138841, "grad_norm": 0.7207610305381422, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -370131196.06153846, "logits/rejected": -312908393.6507937, "logps/chosen": -309.4153846153846, "logps/rejected": -404.8253968253968, "loss": 0.1699, "rewards/chosen": 1.6798076923076923, "rewards/margins": 10.084569597069597, "rewards/rejected": -8.404761904761905, "step": 1190 }, { "epoch": 0.8165923894412067, "grad_norm": 0.5848627925678922, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272136312.4705882, "logits/rejected": -308141533.8666667, "logps/chosen": -220.94117647058823, "logps/rejected": -341.3333333333333, "loss": 0.1943, "rewards/chosen": 0.9724264705882353, "rewards/margins": 4.822426470588235, "rewards/rejected": -3.85, "step": 1191 }, { "epoch": 0.8172780253685293, "grad_norm": 0.556299991230448, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -348326960.7619048, "logits/rejected": -317992770.95384616, "logps/chosen": -224.5079365079365, "logps/rejected": -337.7230769230769, "loss": 0.1554, "rewards/chosen": 1.5892857142857142, "rewards/margins": 9.443131868131868, "rewards/rejected": -7.8538461538461535, "step": 1192 }, { "epoch": 0.8179636612958519, "grad_norm": 0.7438412529760215, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -313174698.6666667, "logits/rejected": -217845696.9846154, "logps/chosen": -210.79365079365078, "logps/rejected": -347.5692307692308, "loss": 0.1877, "rewards/chosen": 1.1369047619047619, "rewards/margins": 8.267673992673993, "rewards/rejected": -7.130769230769231, "step": 1193 }, { "epoch": 0.8186492972231745, "grad_norm": 0.5822581067125742, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269034642.28571427, "logits/rejected": -222997162.66666666, "logps/chosen": -261.85714285714283, "logps/rejected": -350.22222222222223, "loss": 0.1353, "rewards/chosen": 1.1584821428571428, "rewards/margins": 9.741815476190476, "rewards/rejected": -8.583333333333334, "step": 1194 }, { "epoch": 0.8193349331504971, "grad_norm": 0.6149019407159058, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269367523.5555556, "logits/rejected": -339139437.71428573, "logps/chosen": -223.33333333333334, "logps/rejected": -343.7142857142857, "loss": 0.1748, "rewards/chosen": 1.65625, "rewards/margins": 9.897321428571429, "rewards/rejected": -8.241071428571429, "step": 1195 }, { "epoch": 0.8200205690778197, "grad_norm": 0.5765184751286475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304966490.83870965, "logits/rejected": -276061463.27272725, "logps/chosen": -229.16129032258064, "logps/rejected": -373.3333333333333, "loss": 0.1683, "rewards/chosen": 1.7056451612903225, "rewards/margins": 9.069281524926685, "rewards/rejected": -7.363636363636363, "step": 1196 }, { "epoch": 0.8207062050051422, "grad_norm": 0.49960934369268123, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248774656.0, "logits/rejected": -236453888.0, "logps/chosen": -285.5, "logps/rejected": -380.0, "loss": 0.1495, "rewards/chosen": 1.904296875, "rewards/margins": 9.083984375, "rewards/rejected": -7.1796875, "step": 1197 }, { "epoch": 0.8213918409324649, "grad_norm": 0.5493697240268376, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238012770.98666668, "logits/rejected": -263370636.0754717, "logps/chosen": -261.5466666666667, "logps/rejected": -362.5660377358491, "loss": 0.162, "rewards/chosen": 2.0966666666666667, "rewards/margins": 8.870251572327044, "rewards/rejected": -6.773584905660377, "step": 1198 }, { "epoch": 0.8220774768597875, "grad_norm": 0.5624229134846115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -306665505.57377046, "logits/rejected": -314760604.6567164, "logps/chosen": -224.13114754098362, "logps/rejected": -342.44776119402985, "loss": 0.1523, "rewards/chosen": 1.1290983606557377, "rewards/margins": 9.017158062148274, "rewards/rejected": -7.888059701492537, "step": 1199 }, { "epoch": 0.82276311278711, "grad_norm": 0.5619107714837136, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -200036036.92307693, "logits/rejected": -230686720.0, "logps/chosen": -242.92307692307693, "logps/rejected": -299.36842105263156, "loss": 0.1351, "rewards/chosen": 1.6658653846153846, "rewards/margins": 7.6395495951417, "rewards/rejected": -5.973684210526316, "step": 1200 }, { "epoch": 0.8234487487144326, "grad_norm": 0.5149696948638377, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -186912364.1690141, "logits/rejected": -216043448.14035088, "logps/chosen": -244.73239436619718, "logps/rejected": -330.10526315789474, "loss": 0.155, "rewards/chosen": 1.2852112676056338, "rewards/margins": 9.188720039535458, "rewards/rejected": -7.9035087719298245, "step": 1201 }, { "epoch": 0.8241343846417553, "grad_norm": 0.5351406681028955, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256488044.6060606, "logits/rejected": -307401893.16129035, "logps/chosen": -260.6060606060606, "logps/rejected": -344.258064516129, "loss": 0.1489, "rewards/chosen": 1.8181818181818181, "rewards/margins": 10.18108504398827, "rewards/rejected": -8.362903225806452, "step": 1202 }, { "epoch": 0.8248200205690778, "grad_norm": 0.5555029735283036, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -358922407.86885244, "logits/rejected": -274946017.4328358, "logps/chosen": -232.52459016393442, "logps/rejected": -381.13432835820896, "loss": 0.1396, "rewards/chosen": 1.5860655737704918, "rewards/margins": 10.123379006606314, "rewards/rejected": -8.537313432835822, "step": 1203 }, { "epoch": 0.8255056564964004, "grad_norm": 0.6239721414935953, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -314379216.73846155, "logits/rejected": -343000860.4444444, "logps/chosen": -286.2769230769231, "logps/rejected": -430.22222222222223, "loss": 0.1822, "rewards/chosen": 1.0125, "rewards/margins": 8.853769841269841, "rewards/rejected": -7.841269841269841, "step": 1204 }, { "epoch": 0.826191292423723, "grad_norm": 0.6514652138835932, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -208876339.2, "logits/rejected": -246723764.70588234, "logps/chosen": -259.2, "logps/rejected": -325.1764705882353, "loss": 0.1602, "rewards/chosen": 1.5375, "rewards/margins": 8.434558823529413, "rewards/rejected": -6.897058823529412, "step": 1205 }, { "epoch": 0.8268769283510456, "grad_norm": 0.576274686272536, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -314572800.0, "logits/rejected": -282591232.0, "logps/chosen": -230.75, "logps/rejected": -393.25, "loss": 0.1717, "rewards/chosen": 1.69921875, "rewards/margins": 10.08984375, "rewards/rejected": -8.390625, "step": 1206 }, { "epoch": 0.8275625642783682, "grad_norm": 0.6924166851741436, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263601342.91525424, "logits/rejected": -311472662.26086956, "logps/chosen": -304.0, "logps/rejected": -373.1014492753623, "loss": 0.1361, "rewards/chosen": 1.8919491525423728, "rewards/margins": 10.667311471382952, "rewards/rejected": -8.77536231884058, "step": 1207 }, { "epoch": 0.8282482002056908, "grad_norm": 0.5316785935016565, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276061463.27272725, "logits/rejected": -299231989.4794521, "logps/chosen": -314.1818181818182, "logps/rejected": -355.5068493150685, "loss": 0.1435, "rewards/chosen": 1.940909090909091, "rewards/margins": 7.982004981320049, "rewards/rejected": -6.041095890410959, "step": 1208 }, { "epoch": 0.8289338361330134, "grad_norm": 0.5977014506266481, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -193954784.96969697, "logits/rejected": -274388661.67741936, "logps/chosen": -307.1515151515151, "logps/rejected": -375.2258064516129, "loss": 0.1359, "rewards/chosen": 1.928030303030303, "rewards/margins": 9.952223851417399, "rewards/rejected": -8.024193548387096, "step": 1209 }, { "epoch": 0.8296194720603359, "grad_norm": 0.527854764035255, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271189924.29850745, "logits/rejected": -266235165.37704918, "logps/chosen": -162.7462686567164, "logps/rejected": -327.344262295082, "loss": 0.1647, "rewards/chosen": 1.0671641791044777, "rewards/margins": 10.640934670907756, "rewards/rejected": -9.573770491803279, "step": 1210 }, { "epoch": 0.8303051079876586, "grad_norm": 0.5942628744083656, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255293303.46666667, "logits/rejected": -287186462.11764705, "logps/chosen": -212.0, "logps/rejected": -352.0, "loss": 0.1414, "rewards/chosen": 1.703125, "rewards/margins": 10.225183823529411, "rewards/rejected": -8.522058823529411, "step": 1211 }, { "epoch": 0.8309907439149812, "grad_norm": 0.749934411843495, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280486695.6619718, "logits/rejected": -303755910.7368421, "logps/chosen": -237.5211267605634, "logps/rejected": -366.03508771929825, "loss": 0.156, "rewards/chosen": 1.8327464788732395, "rewards/margins": 10.157307882382012, "rewards/rejected": -8.324561403508772, "step": 1212 }, { "epoch": 0.8316763798423037, "grad_norm": 0.5159314783043212, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283082231.8730159, "logits/rejected": -291665447.38461536, "logps/chosen": -240.76190476190476, "logps/rejected": -344.12307692307695, "loss": 0.1514, "rewards/chosen": 1.5942460317460319, "rewards/margins": 10.917322954822955, "rewards/rejected": -9.323076923076924, "step": 1213 }, { "epoch": 0.8323620157696263, "grad_norm": 0.6668152506912236, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -217819448.40677965, "logits/rejected": -250199351.6521739, "logps/chosen": -241.35593220338984, "logps/rejected": -376.1159420289855, "loss": 0.1242, "rewards/chosen": 1.7330508474576272, "rewards/margins": 11.356239253254728, "rewards/rejected": -9.623188405797102, "step": 1214 }, { "epoch": 0.8330476516969489, "grad_norm": 0.6032308238480255, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -187625198.93333334, "logits/rejected": -208481581.17647058, "logps/chosen": -201.6, "logps/rejected": -386.8235294117647, "loss": 0.1633, "rewards/chosen": 1.5072916666666667, "rewards/margins": 6.823468137254903, "rewards/rejected": -5.3161764705882355, "step": 1215 }, { "epoch": 0.8337332876242715, "grad_norm": 0.5627654280667803, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299016710.48101264, "logits/rejected": -392381419.1020408, "logps/chosen": -206.37974683544303, "logps/rejected": -368.0, "loss": 0.1841, "rewards/chosen": 1.2262658227848102, "rewards/margins": 9.99157194523379, "rewards/rejected": -8.76530612244898, "step": 1216 }, { "epoch": 0.8344189235515941, "grad_norm": 0.5575176683832731, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -209848352.5079365, "logits/rejected": -229718803.69230768, "logps/chosen": -275.55555555555554, "logps/rejected": -308.18461538461537, "loss": 0.1654, "rewards/chosen": 1.2946428571428572, "rewards/margins": 9.725412087912089, "rewards/rejected": -8.430769230769231, "step": 1217 }, { "epoch": 0.8351045594789167, "grad_norm": 0.5602393335401425, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255386510.2222222, "logits/rejected": -303280443.0769231, "logps/chosen": -266.92063492063494, "logps/rejected": -401.2307692307692, "loss": 0.1681, "rewards/chosen": 1.2564484126984128, "rewards/margins": 7.9718330280830285, "rewards/rejected": -6.7153846153846155, "step": 1218 }, { "epoch": 0.8357901954062393, "grad_norm": 0.50687107659948, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -219938816.0, "logits/rejected": -273416192.0, "logps/chosen": -323.25, "logps/rejected": -335.5, "loss": 0.1516, "rewards/chosen": 1.767578125, "rewards/margins": 10.400390625, "rewards/rejected": -8.6328125, "step": 1219 }, { "epoch": 0.8364758313335618, "grad_norm": 0.6268125106654177, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -200055590.78787878, "logits/rejected": -260723348.6451613, "logps/chosen": -218.1818181818182, "logps/rejected": -329.2903225806452, "loss": 0.1747, "rewards/chosen": 1.5691287878787878, "rewards/margins": 9.311064271749755, "rewards/rejected": -7.741935483870968, "step": 1220 }, { "epoch": 0.8371614672608845, "grad_norm": 0.613247517817253, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -229202581.66153845, "logits/rejected": -271364811.17460316, "logps/chosen": -231.63076923076923, "logps/rejected": -392.3809523809524, "loss": 0.1657, "rewards/chosen": 1.0682692307692307, "rewards/margins": 10.155570818070817, "rewards/rejected": -9.087301587301587, "step": 1221 }, { "epoch": 0.8378471031882071, "grad_norm": 0.5706230518660559, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248396003.55555555, "logits/rejected": -293002093.71428573, "logps/chosen": -263.1111111111111, "logps/rejected": -364.0, "loss": 0.1669, "rewards/chosen": 1.78125, "rewards/margins": 9.558035714285715, "rewards/rejected": -7.776785714285714, "step": 1222 }, { "epoch": 0.8385327391155296, "grad_norm": 0.5045941872111985, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249182913.04918033, "logits/rejected": -263677738.02985075, "logps/chosen": -322.3606557377049, "logps/rejected": -403.5820895522388, "loss": 0.1471, "rewards/chosen": 2.1331967213114753, "rewards/margins": 9.648122094445803, "rewards/rejected": -7.514925373134329, "step": 1223 }, { "epoch": 0.8392183750428522, "grad_norm": 0.6193690000661587, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246353679.05882353, "logits/rejected": -272629760.0, "logps/chosen": -309.1764705882353, "logps/rejected": -392.53333333333336, "loss": 0.1545, "rewards/chosen": 2.051470588235294, "rewards/margins": 6.734803921568627, "rewards/rejected": -4.683333333333334, "step": 1224 }, { "epoch": 0.8399040109701749, "grad_norm": 0.5216633069197268, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286261248.0, "logits/rejected": -350486528.0, "logps/chosen": -233.75, "logps/rejected": -341.75, "loss": 0.1812, "rewards/chosen": 1.1201171875, "rewards/margins": 7.2060546875, "rewards/rejected": -6.0859375, "step": 1225 }, { "epoch": 0.8405896468974974, "grad_norm": 0.47992405970946284, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -322617612.59016395, "logits/rejected": -243895647.52238807, "logps/chosen": -268.8524590163934, "logps/rejected": -410.74626865671644, "loss": 0.1589, "rewards/chosen": 0.9549180327868853, "rewards/margins": 10.19372400293614, "rewards/rejected": -9.238805970149254, "step": 1226 }, { "epoch": 0.84127528282482, "grad_norm": 0.5714078371928173, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249099003.66101694, "logits/rejected": -283996931.71014494, "logps/chosen": -181.96610169491527, "logps/rejected": -397.9130434782609, "loss": 0.1496, "rewards/chosen": 1.4152542372881356, "rewards/margins": 9.241341193809875, "rewards/rejected": -7.826086956521739, "step": 1227 }, { "epoch": 0.8419609187521426, "grad_norm": 0.7584142152193312, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262425706.98507464, "logits/rejected": -255508748.59016395, "logps/chosen": -225.19402985074626, "logps/rejected": -409.1803278688525, "loss": 0.1939, "rewards/chosen": 1.2070895522388059, "rewards/margins": 7.780860044042084, "rewards/rejected": -6.573770491803279, "step": 1228 }, { "epoch": 0.8426465546794653, "grad_norm": 0.5121335846083414, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267258107.50877193, "logits/rejected": -217867509.18309858, "logps/chosen": -247.859649122807, "logps/rejected": -374.0845070422535, "loss": 0.1588, "rewards/chosen": 1.269736842105263, "rewards/margins": 9.621849518161602, "rewards/rejected": -8.352112676056338, "step": 1229 }, { "epoch": 0.8433321906067878, "grad_norm": 0.598911583251286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -365208383.07246375, "logits/rejected": -256776712.6779661, "logps/chosen": -237.44927536231884, "logps/rejected": -346.5762711864407, "loss": 0.1725, "rewards/chosen": 1.5815217391304348, "rewards/margins": 23993265.310335297, "rewards/rejected": -23993263.72881356, "step": 1230 }, { "epoch": 0.8440178265341104, "grad_norm": 0.5573779618101906, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -236018462.37288135, "logits/rejected": -280836006.95652175, "logps/chosen": -296.6779661016949, "logps/rejected": -402.5507246376812, "loss": 0.1438, "rewards/chosen": 1.2351694915254237, "rewards/margins": 9.22792311471383, "rewards/rejected": -7.992753623188406, "step": 1231 }, { "epoch": 0.844703462461433, "grad_norm": 0.7032694543666121, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -270249209.0810811, "logits/rejected": -300591786.6666667, "logps/chosen": -229.83783783783784, "logps/rejected": -360.2962962962963, "loss": 0.1757, "rewards/chosen": 1.4864864864864864, "rewards/margins": 9.32907907907908, "rewards/rejected": -7.842592592592593, "step": 1232 }, { "epoch": 0.8453890983887555, "grad_norm": 0.5187204427117909, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289729614.7692308, "logits/rejected": -252541251.36842105, "logps/chosen": -223.3846153846154, "logps/rejected": -361.6842105263158, "loss": 0.1493, "rewards/chosen": 1.0859375, "rewards/margins": 9.046463815789473, "rewards/rejected": -7.9605263157894735, "step": 1233 }, { "epoch": 0.8460747343160782, "grad_norm": 0.5121233563247504, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -217158612.73239437, "logits/rejected": -226345247.4385965, "logps/chosen": -244.73239436619718, "logps/rejected": -342.7368421052632, "loss": 0.1766, "rewards/chosen": 1.6848591549295775, "rewards/margins": 9.101525821596244, "rewards/rejected": -7.416666666666667, "step": 1234 }, { "epoch": 0.8467603702434008, "grad_norm": 0.49150221017552753, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -374324442.2295082, "logits/rejected": -275697236.0597015, "logps/chosen": -219.27868852459017, "logps/rejected": -340.05970149253733, "loss": 0.1355, "rewards/chosen": 1.7315573770491803, "rewards/margins": 7.082303645705897, "rewards/rejected": -5.350746268656716, "step": 1235 }, { "epoch": 0.8474460061707233, "grad_norm": 0.5605120593149555, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -204821845.33333334, "logits/rejected": -214209097.14285713, "logps/chosen": -259.77777777777777, "logps/rejected": -360.0, "loss": 0.1898, "rewards/chosen": 1.5625, "rewards/margins": 9.526785714285715, "rewards/rejected": -7.964285714285714, "step": 1236 }, { "epoch": 0.8481316420980459, "grad_norm": 0.5387491641763444, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220458504.98245615, "logits/rejected": -224483876.05633804, "logps/chosen": -299.50877192982455, "logps/rejected": -305.1267605633803, "loss": 0.1344, "rewards/chosen": 2.3092105263157894, "rewards/margins": 9.914844329132691, "rewards/rejected": -7.605633802816901, "step": 1237 }, { "epoch": 0.8488172780253685, "grad_norm": 0.4942571524633027, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -326393111.27272725, "logits/rejected": -371534154.32258064, "logps/chosen": -273.2121212121212, "logps/rejected": -393.80645161290323, "loss": 0.1611, "rewards/chosen": 1.759469696969697, "rewards/margins": 8.01753421309873, "rewards/rejected": -6.258064516129032, "step": 1238 }, { "epoch": 0.8495029139526912, "grad_norm": 0.6043648217268749, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -266401854.06060606, "logits/rejected": -278718265.8064516, "logps/chosen": -304.24242424242425, "logps/rejected": -349.16129032258067, "loss": 0.165, "rewards/chosen": 1.2992424242424243, "rewards/margins": 8.976661779081134, "rewards/rejected": -7.67741935483871, "step": 1239 }, { "epoch": 0.8501885498800137, "grad_norm": 0.5120671430286736, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287535671.13846153, "logits/rejected": -267902845.96825397, "logps/chosen": -194.2153846153846, "logps/rejected": -386.53968253968253, "loss": 0.1647, "rewards/chosen": 1.4807692307692308, "rewards/margins": 39031195.89346764, "rewards/rejected": -39031194.41269841, "step": 1240 }, { "epoch": 0.8508741858073363, "grad_norm": 0.4862270996764965, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -326432556.13793105, "logits/rejected": -359991120.45714283, "logps/chosen": -294.62068965517244, "logps/rejected": -351.0857142857143, "loss": 0.1609, "rewards/chosen": 1.6303879310344827, "rewards/margins": 10.244673645320196, "rewards/rejected": -8.614285714285714, "step": 1241 }, { "epoch": 0.8515598217346589, "grad_norm": 0.5496193523951016, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255919120.25396827, "logits/rejected": -242108132.43076923, "logps/chosen": -242.79365079365078, "logps/rejected": -360.3692307692308, "loss": 0.1716, "rewards/chosen": 1.6468253968253967, "rewards/margins": 9.085286935286936, "rewards/rejected": -7.438461538461539, "step": 1242 }, { "epoch": 0.8522454576619815, "grad_norm": 0.4660167169874139, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289873009.7777778, "logits/rejected": -274330153.5135135, "logps/chosen": -300.14814814814815, "logps/rejected": -310.9189189189189, "loss": 0.1306, "rewards/chosen": 1.2534722222222223, "rewards/margins": 9.73995870870871, "rewards/rejected": -8.486486486486486, "step": 1243 }, { "epoch": 0.8529310935893041, "grad_norm": 0.5814276159313811, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -215729673.66037735, "logits/rejected": -350308270.08, "logps/chosen": -272.75471698113205, "logps/rejected": -418.9866666666667, "loss": 0.1565, "rewards/chosen": 1.2452830188679245, "rewards/margins": 9.818616352201259, "rewards/rejected": -8.573333333333334, "step": 1244 }, { "epoch": 0.8536167295166267, "grad_norm": 0.5618744132503991, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -298535755.2941176, "logits/rejected": -267876215.46666667, "logps/chosen": -347.6470588235294, "logps/rejected": -339.2, "loss": 0.1533, "rewards/chosen": 1.8878676470588236, "rewards/margins": 8.546200980392157, "rewards/rejected": -6.658333333333333, "step": 1245 }, { "epoch": 0.8543023654439492, "grad_norm": 0.5573203609821709, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248546337.03225806, "logits/rejected": -237677226.66666666, "logps/chosen": -217.67741935483872, "logps/rejected": -394.6666666666667, "loss": 0.1621, "rewards/chosen": 1.4526209677419355, "rewards/margins": 8.87686339198436, "rewards/rejected": -7.424242424242424, "step": 1246 }, { "epoch": 0.8549880013712718, "grad_norm": 0.5377633597153606, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265779063.46666667, "logits/rejected": -280524920.4705882, "logps/chosen": -326.8, "logps/rejected": -326.3529411764706, "loss": 0.1606, "rewards/chosen": 2.025, "rewards/margins": 9.672058823529412, "rewards/rejected": -7.647058823529412, "step": 1247 }, { "epoch": 0.8556736372985945, "grad_norm": 0.5063817795993281, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -210460854.04444444, "logits/rejected": -217093132.33734939, "logps/chosen": -195.9111111111111, "logps/rejected": -367.421686746988, "loss": 0.1311, "rewards/chosen": 0.8541666666666666, "rewards/margins": 9.552961847389557, "rewards/rejected": -8.698795180722891, "step": 1248 }, { "epoch": 0.856359273225917, "grad_norm": 0.500389718005917, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -295626116.4137931, "logits/rejected": -254534334.17142856, "logps/chosen": -274.2068965517241, "logps/rejected": -352.9142857142857, "loss": 0.1434, "rewards/chosen": 2.0581896551724137, "rewards/margins": 10.501046798029556, "rewards/rejected": -8.442857142857143, "step": 1249 }, { "epoch": 0.8570449091532396, "grad_norm": 0.4770902127728846, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239430777.4915254, "logits/rejected": -289103040.92753625, "logps/chosen": -344.40677966101697, "logps/rejected": -359.6521739130435, "loss": 0.1248, "rewards/chosen": 2.3146186440677967, "rewards/margins": 10.263894006386638, "rewards/rejected": -7.949275362318841, "step": 1250 }, { "epoch": 0.8577305450805622, "grad_norm": 0.5535045876253213, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252444672.0, "logits/rejected": -272105472.0, "logps/chosen": -276.25, "logps/rejected": -375.0, "loss": 0.1407, "rewards/chosen": 1.908203125, "rewards/margins": 10.025390625, "rewards/rejected": -8.1171875, "step": 1251 }, { "epoch": 0.8584161810078849, "grad_norm": 0.5078463695229122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220947403.93220338, "logits/rejected": -218346956.057971, "logps/chosen": -216.94915254237287, "logps/rejected": -361.27536231884056, "loss": 0.1562, "rewards/chosen": 1.097457627118644, "rewards/margins": 9.4018054532056, "rewards/rejected": -8.304347826086957, "step": 1252 }, { "epoch": 0.8591018169352074, "grad_norm": 0.6619328228026884, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -341296508.3428571, "logits/rejected": -322816776.82758623, "logps/chosen": -258.74285714285713, "logps/rejected": -405.51724137931035, "loss": 0.1666, "rewards/chosen": 1.5839285714285714, "rewards/margins": 9.264963054187191, "rewards/rejected": -7.681034482758621, "step": 1253 }, { "epoch": 0.85978745286253, "grad_norm": 0.5380967066834434, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -295415033.0810811, "logits/rejected": -292979901.6296296, "logps/chosen": -301.4054054054054, "logps/rejected": -372.44444444444446, "loss": 0.1581, "rewards/chosen": 2.0135135135135136, "rewards/margins": 11.022772772772774, "rewards/rejected": -9.00925925925926, "step": 1254 }, { "epoch": 0.8604730887898526, "grad_norm": 0.5718147586034874, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264581230.7027027, "logits/rejected": -276824064.0, "logps/chosen": -226.8108108108108, "logps/rejected": -376.0, "loss": 0.1715, "rewards/chosen": 1.6908783783783783, "rewards/margins": 9.977915415415415, "rewards/rejected": -8.287037037037036, "step": 1255 }, { "epoch": 0.8611587247171751, "grad_norm": 0.6501780482795511, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259925273.9710145, "logits/rejected": -253648757.15254238, "logps/chosen": -234.43478260869566, "logps/rejected": -326.50847457627117, "loss": 0.1796, "rewards/chosen": 1.4085144927536233, "rewards/margins": 8.77292127241464, "rewards/rejected": -7.364406779661017, "step": 1256 }, { "epoch": 0.8618443606444978, "grad_norm": 0.6012159220292492, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -313973613.71428573, "logits/rejected": -375293416.36923075, "logps/chosen": -233.9047619047619, "logps/rejected": -316.55384615384617, "loss": 0.1729, "rewards/chosen": 0.8958333333333334, "rewards/margins": 9.57275641025641, "rewards/rejected": -8.676923076923076, "step": 1257 }, { "epoch": 0.8625299965718204, "grad_norm": 0.5571940676999039, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -331596739.7647059, "logits/rejected": -291923558.4, "logps/chosen": -257.1764705882353, "logps/rejected": -296.8, "loss": 0.1468, "rewards/chosen": 1.2738970588235294, "rewards/margins": 8.34889705882353, "rewards/rejected": -7.075, "step": 1258 }, { "epoch": 0.8632156324991429, "grad_norm": 0.6570991290397671, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255548608.92753622, "logits/rejected": -235734102.77966103, "logps/chosen": -281.5072463768116, "logps/rejected": -405.1525423728813, "loss": 0.1539, "rewards/chosen": 2.1394927536231885, "rewards/margins": 10.114069024809629, "rewards/rejected": -7.97457627118644, "step": 1259 }, { "epoch": 0.8639012684264655, "grad_norm": 0.5584087396267047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248082767.73770493, "logits/rejected": -296981763.8208955, "logps/chosen": -214.29508196721312, "logps/rejected": -405.4925373134328, "loss": 0.1642, "rewards/chosen": 0.7715163934426229, "rewards/margins": 9.592411915830683, "rewards/rejected": -8.82089552238806, "step": 1260 }, { "epoch": 0.8645869043537882, "grad_norm": 0.5805357264376693, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -229105261.1147541, "logits/rejected": -260172051.1044776, "logps/chosen": -289.57377049180326, "logps/rejected": -377.3134328358209, "loss": 0.153, "rewards/chosen": 1.389344262295082, "rewards/margins": 9.344568142892097, "rewards/rejected": -7.955223880597015, "step": 1261 }, { "epoch": 0.8652725402811108, "grad_norm": 0.6223802088797704, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -245299133.93548387, "logits/rejected": -326901511.75757575, "logps/chosen": -308.38709677419354, "logps/rejected": -337.2121212121212, "loss": 0.1337, "rewards/chosen": 2.242943548387097, "rewards/margins": 11.288398093841643, "rewards/rejected": -9.045454545454545, "step": 1262 }, { "epoch": 0.8659581762084333, "grad_norm": 0.49992030938737947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240043244.30769232, "logits/rejected": -296663787.6825397, "logps/chosen": -295.87692307692305, "logps/rejected": -375.87301587301585, "loss": 0.1318, "rewards/chosen": 1.8480769230769232, "rewards/margins": 9.538553113553114, "rewards/rejected": -7.690476190476191, "step": 1263 }, { "epoch": 0.8666438121357559, "grad_norm": 0.6075824162540712, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281680626.5263158, "logits/rejected": -280373090.46153843, "logps/chosen": -279.7894736842105, "logps/rejected": -371.6923076923077, "loss": 0.1797, "rewards/chosen": 1.5082236842105263, "rewards/margins": 10.008223684210526, "rewards/rejected": -8.5, "step": 1264 }, { "epoch": 0.8673294480630785, "grad_norm": 0.6404878547072753, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263881640.22857141, "logits/rejected": -301989888.0, "logps/chosen": -288.22857142857146, "logps/rejected": -358.62068965517244, "loss": 0.15, "rewards/chosen": 2.0142857142857142, "rewards/margins": 7.505665024630542, "rewards/rejected": -5.491379310344827, "step": 1265 }, { "epoch": 0.8680150839904011, "grad_norm": 0.5840714449869315, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -233422135.6521739, "logits/rejected": -234027945.22033897, "logps/chosen": -240.231884057971, "logps/rejected": -374.77966101694915, "loss": 0.1671, "rewards/chosen": 1.7744565217391304, "rewards/margins": 7.528693809874724, "rewards/rejected": -5.754237288135593, "step": 1266 }, { "epoch": 0.8687007199177237, "grad_norm": 0.6853296741539991, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -209593625.9710145, "logits/rejected": -297440134.5084746, "logps/chosen": -274.7826086956522, "logps/rejected": -390.50847457627117, "loss": 0.1905, "rewards/chosen": 1.3623188405797102, "rewards/margins": 8.074183247359372, "rewards/rejected": -6.711864406779661, "step": 1267 }, { "epoch": 0.8693863558450463, "grad_norm": 0.571089345864459, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -213385216.0, "logits/rejected": -247988224.0, "logps/chosen": -221.0, "logps/rejected": -304.0, "loss": 0.1513, "rewards/chosen": 1.904296875, "rewards/margins": 8.818359375, "rewards/rejected": -6.9140625, "step": 1268 }, { "epoch": 0.8700719917723688, "grad_norm": 0.6618780765164793, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -284950528.0, "logits/rejected": -272629760.0, "logps/chosen": -257.5, "logps/rejected": -362.5, "loss": 0.1789, "rewards/chosen": 1.1474609375, "rewards/margins": 9.6552734375, "rewards/rejected": -8.5078125, "step": 1269 }, { "epoch": 0.8707576276996914, "grad_norm": 0.5087567728935694, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220861706.52054796, "logits/rejected": -244032232.72727272, "logps/chosen": -241.75342465753425, "logps/rejected": -371.2, "loss": 0.1444, "rewards/chosen": 2.136986301369863, "rewards/margins": 10.309713574097135, "rewards/rejected": -8.172727272727272, "step": 1270 }, { "epoch": 0.8714432636270141, "grad_norm": 0.6320393665089847, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -328833433.6, "logits/rejected": -304519785.6507937, "logps/chosen": -322.95384615384614, "logps/rejected": -405.8412698412698, "loss": 0.15, "rewards/chosen": 1.978846153846154, "rewards/margins": 10.177258852258852, "rewards/rejected": -8.198412698412698, "step": 1271 }, { "epoch": 0.8721288995543367, "grad_norm": 0.6158355088228586, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246332139.68253967, "logits/rejected": -283147783.8769231, "logps/chosen": -211.04761904761904, "logps/rejected": -385.4769230769231, "loss": 0.1811, "rewards/chosen": 1.2797619047619047, "rewards/margins": 8.772069597069597, "rewards/rejected": -7.492307692307692, "step": 1272 }, { "epoch": 0.8728145354816592, "grad_norm": 0.6336102210484414, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223889054.89655173, "logits/rejected": -259327824.45714286, "logps/chosen": -276.41379310344826, "logps/rejected": -330.9714285714286, "loss": 0.146, "rewards/chosen": 1.8297413793103448, "rewards/margins": 9.986884236453202, "rewards/rejected": -8.157142857142857, "step": 1273 }, { "epoch": 0.8735001714089818, "grad_norm": 0.615605668581359, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269186674.6268657, "logits/rejected": -314641559.08196723, "logps/chosen": -289.910447761194, "logps/rejected": -372.4590163934426, "loss": 0.1528, "rewards/chosen": 1.8451492537313432, "rewards/margins": 10.77957548323954, "rewards/rejected": -8.934426229508198, "step": 1274 }, { "epoch": 0.8741858073363045, "grad_norm": 0.4774030515638399, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -240931823.21311477, "logits/rejected": -247150928.23880598, "logps/chosen": -332.8524590163934, "logps/rejected": -392.1194029850746, "loss": 0.1338, "rewards/chosen": 2.3422131147540983, "rewards/margins": 9.857138487888427, "rewards/rejected": -7.514925373134329, "step": 1275 }, { "epoch": 0.874871443263627, "grad_norm": 0.4816491300310377, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -303280443.0769231, "logits/rejected": -344332385.52380955, "logps/chosen": -225.72307692307692, "logps/rejected": -388.06349206349205, "loss": 0.1451, "rewards/chosen": 2.0096153846153846, "rewards/margins": 10.176282051282051, "rewards/rejected": -8.166666666666666, "step": 1276 }, { "epoch": 0.8755570791909496, "grad_norm": 0.4935787658292199, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263994428.2352941, "logits/rejected": -269274316.8, "logps/chosen": -314.8235294117647, "logps/rejected": -387.2, "loss": 0.1445, "rewards/chosen": 2.2003676470588234, "rewards/margins": 10.983700980392157, "rewards/rejected": -8.783333333333333, "step": 1277 }, { "epoch": 0.8762427151182722, "grad_norm": 0.5305114331383015, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -210913572.57142857, "logits/rejected": -235748811.03448275, "logps/chosen": -185.37142857142857, "logps/rejected": -354.2068965517241, "loss": 0.1678, "rewards/chosen": 1.3714285714285714, "rewards/margins": 9.750738916256157, "rewards/rejected": -8.379310344827585, "step": 1278 }, { "epoch": 0.8769283510455947, "grad_norm": 0.5997460018064784, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -284965948.2352941, "logits/rejected": -277103684.26666665, "logps/chosen": -260.0, "logps/rejected": -358.1333333333333, "loss": 0.1747, "rewards/chosen": 0.9319852941176471, "rewards/margins": 6.71531862745098, "rewards/rejected": -5.783333333333333, "step": 1279 }, { "epoch": 0.8776139869729174, "grad_norm": 0.6853737930496859, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281079155.01449275, "logits/rejected": -257629791.45762712, "logps/chosen": -288.463768115942, "logps/rejected": -417.08474576271186, "loss": 0.1827, "rewards/chosen": 2.148550724637681, "rewards/margins": 7.521432080569884, "rewards/rejected": -5.372881355932203, "step": 1280 }, { "epoch": 0.87829962290024, "grad_norm": 0.5640698588128846, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -194873816.6153846, "logits/rejected": -207984217.3968254, "logps/chosen": -195.44615384615383, "logps/rejected": -315.42857142857144, "loss": 0.1566, "rewards/chosen": 1.3680288461538461, "rewards/margins": 9.510885989010989, "rewards/rejected": -8.142857142857142, "step": 1281 }, { "epoch": 0.8789852588275625, "grad_norm": 0.622179900189473, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255190285.47368422, "logits/rejected": -241261092.05633804, "logps/chosen": -204.91228070175438, "logps/rejected": -388.9577464788732, "loss": 0.1433, "rewards/chosen": 1.381578947368421, "rewards/margins": 9.832283172720533, "rewards/rejected": -8.450704225352112, "step": 1282 }, { "epoch": 0.8796708947548851, "grad_norm": 0.576379666733684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -254054985.14285713, "logits/rejected": -261466458.58461538, "logps/chosen": -228.6984126984127, "logps/rejected": -372.67692307692306, "loss": 0.146, "rewards/chosen": 1.6567460317460319, "rewards/margins": 10.702899877899878, "rewards/rejected": -9.046153846153846, "step": 1283 }, { "epoch": 0.8803565306822078, "grad_norm": 0.7266923636930003, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -286890393.6, "logits/rejected": -231120613.5172414, "logps/chosen": -260.1142857142857, "logps/rejected": -381.2413793103448, "loss": 0.1718, "rewards/chosen": 1.5035714285714286, "rewards/margins": 8.400123152709359, "rewards/rejected": -6.896551724137931, "step": 1284 }, { "epoch": 0.8810421666095304, "grad_norm": 0.5275207881203609, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273443580.1791045, "logits/rejected": -252483348.98360655, "logps/chosen": -255.28358208955223, "logps/rejected": -437.5081967213115, "loss": 0.1337, "rewards/chosen": 1.912313432835821, "rewards/margins": 7.772969170540739, "rewards/rejected": -5.860655737704918, "step": 1285 }, { "epoch": 0.8817278025368529, "grad_norm": 0.5265050625604953, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218383428.26666668, "logits/rejected": -327895883.2941176, "logps/chosen": -280.53333333333336, "logps/rejected": -370.3529411764706, "loss": 0.1488, "rewards/chosen": 1.878125, "rewards/margins": 9.944301470588236, "rewards/rejected": -8.066176470588236, "step": 1286 }, { "epoch": 0.8824134384641755, "grad_norm": 0.6080473514125789, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -230031360.0, "logits/rejected": -264241152.0, "logps/chosen": -249.375, "logps/rejected": -385.5, "loss": 0.1567, "rewards/chosen": 1.70703125, "rewards/margins": 7.32421875, "rewards/rejected": -5.6171875, "step": 1287 }, { "epoch": 0.8830990743914982, "grad_norm": 0.6386728480763818, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300339670.0327869, "logits/rejected": -320770353.67164177, "logps/chosen": -321.3114754098361, "logps/rejected": -293.4925373134328, "loss": 0.1706, "rewards/chosen": 1.2950819672131149, "rewards/margins": 8.966723758257892, "rewards/rejected": -7.6716417910447765, "step": 1288 }, { "epoch": 0.8837847103188207, "grad_norm": 0.5334370750354098, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -236978176.0, "logits/rejected": -285999104.0, "logps/chosen": -249.25, "logps/rejected": -376.5, "loss": 0.1523, "rewards/chosen": 1.673828125, "rewards/margins": 9.962890625, "rewards/rejected": -8.2890625, "step": 1289 }, { "epoch": 0.8844703462461433, "grad_norm": 0.4992166724889878, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -294912000.0, "logits/rejected": -325058560.0, "logps/chosen": -254.5, "logps/rejected": -374.0, "loss": 0.1282, "rewards/chosen": 1.4140625, "rewards/margins": 11.0, "rewards/rejected": -9.5859375, "step": 1290 }, { "epoch": 0.8851559821734659, "grad_norm": 0.8121130535256996, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -280224851.027027, "logits/rejected": -271542347.8518519, "logps/chosen": -170.59459459459458, "logps/rejected": -448.5925925925926, "loss": 0.1848, "rewards/chosen": 1.6537162162162162, "rewards/margins": 8.73704954954955, "rewards/rejected": -7.083333333333333, "step": 1291 }, { "epoch": 0.8858416181007884, "grad_norm": 0.5324926646937946, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256376832.0, "logits/rejected": -299630592.0, "logps/chosen": -223.75, "logps/rejected": -377.0, "loss": 0.1755, "rewards/chosen": 1.41796875, "rewards/margins": 7.01171875, "rewards/rejected": -5.59375, "step": 1292 }, { "epoch": 0.8865272540281111, "grad_norm": 0.491323443375673, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279145910.85714287, "logits/rejected": -253988408.8888889, "logps/chosen": -228.21428571428572, "logps/rejected": -346.22222222222223, "loss": 0.1291, "rewards/chosen": 1.5479910714285714, "rewards/margins": 9.367435515873016, "rewards/rejected": -7.819444444444445, "step": 1293 }, { "epoch": 0.8872128899554337, "grad_norm": 0.5519850110828991, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -258928366.93333334, "logits/rejected": -263500980.70588234, "logps/chosen": -247.73333333333332, "logps/rejected": -348.2352941176471, "loss": 0.1522, "rewards/chosen": 1.9104166666666667, "rewards/margins": 9.932475490196078, "rewards/rejected": -8.022058823529411, "step": 1294 }, { "epoch": 0.8878985258827563, "grad_norm": 0.5691754567726878, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -287076807.1111111, "logits/rejected": -228986326.4864865, "logps/chosen": -216.59259259259258, "logps/rejected": -352.43243243243245, "loss": 0.1347, "rewards/chosen": 1.3356481481481481, "rewards/margins": 9.619431931931933, "rewards/rejected": -8.283783783783784, "step": 1295 }, { "epoch": 0.8885841618100788, "grad_norm": 0.5848827270408817, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241641991.64179105, "logits/rejected": -262659693.1147541, "logps/chosen": -245.01492537313433, "logps/rejected": -380.327868852459, "loss": 0.168, "rewards/chosen": 1.632462686567164, "rewards/margins": 10.419347932468805, "rewards/rejected": -8.78688524590164, "step": 1296 }, { "epoch": 0.8892697977374014, "grad_norm": 0.5957178194310885, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300756269.1764706, "logits/rejected": -264241152.0, "logps/chosen": -277.88235294117646, "logps/rejected": -382.93333333333334, "loss": 0.1506, "rewards/chosen": 2.2720588235294117, "rewards/margins": 10.013725490196078, "rewards/rejected": -7.741666666666666, "step": 1297 }, { "epoch": 0.8899554336647241, "grad_norm": 0.7624851503306929, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -170967820.19047618, "logits/rejected": -233074246.8923077, "logps/chosen": -272.5079365079365, "logps/rejected": -350.5230769230769, "loss": 0.1591, "rewards/chosen": 1.3452380952380953, "rewards/margins": 6.929853479853479, "rewards/rejected": -5.584615384615384, "step": 1298 }, { "epoch": 0.8906410695920466, "grad_norm": 0.5699167216647428, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -323413102.27692306, "logits/rejected": -286011587.04761904, "logps/chosen": -299.0769230769231, "logps/rejected": -438.6031746031746, "loss": 0.1437, "rewards/chosen": 1.4826923076923078, "rewards/margins": 10.173168498168497, "rewards/rejected": -8.69047619047619, "step": 1299 }, { "epoch": 0.8913267055193692, "grad_norm": 0.5237650456934403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -202772903.72413793, "logits/rejected": -295039327.0857143, "logps/chosen": -275.3103448275862, "logps/rejected": -374.85714285714283, "loss": 0.1634, "rewards/chosen": 1.2122844827586208, "rewards/margins": 10.24085591133005, "rewards/rejected": -9.028571428571428, "step": 1300 }, { "epoch": 0.8920123414466918, "grad_norm": 0.6464107679048489, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -293483130.5915493, "logits/rejected": -264314736.28070176, "logps/chosen": -278.53521126760563, "logps/rejected": -348.9122807017544, "loss": 0.1592, "rewards/chosen": 1.7517605633802817, "rewards/margins": 9.92719915987151, "rewards/rejected": -8.175438596491228, "step": 1301 }, { "epoch": 0.8926979773740144, "grad_norm": 0.774695793848552, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -328006730.20289856, "logits/rejected": -288056267.9322034, "logps/chosen": -266.2028985507246, "logps/rejected": -369.35593220338984, "loss": 0.1733, "rewards/chosen": 1.2952898550724639, "rewards/margins": 25557689.702069517, "rewards/rejected": -25557688.40677966, "step": 1302 }, { "epoch": 0.893383613301337, "grad_norm": 0.5865062890317646, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223007917.2923077, "logits/rejected": -206386387.3015873, "logps/chosen": -188.30769230769232, "logps/rejected": -372.3174603174603, "loss": 0.1772, "rewards/chosen": 1.2620192307692308, "rewards/margins": 9.095352564102564, "rewards/rejected": -7.833333333333333, "step": 1303 }, { "epoch": 0.8940692492286596, "grad_norm": 0.5374574388284661, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -349160611.2463768, "logits/rejected": -245402328.94915253, "logps/chosen": -244.40579710144928, "logps/rejected": -374.50847457627117, "loss": 0.1684, "rewards/chosen": 1.5833333333333333, "rewards/margins": 9.896892655367232, "rewards/rejected": -8.313559322033898, "step": 1304 }, { "epoch": 0.8947548851559822, "grad_norm": 0.6644311105584971, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271353232.6956522, "logits/rejected": -262179544.94915253, "logps/chosen": -257.8550724637681, "logps/rejected": -403.79661016949154, "loss": 0.173, "rewards/chosen": 1.3677536231884058, "rewards/margins": 6.605041758781626, "rewards/rejected": -5.237288135593221, "step": 1305 }, { "epoch": 0.8954405210833047, "grad_norm": 0.6532650264834649, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292507113.73913044, "logits/rejected": -318482744.40677965, "logps/chosen": -248.1159420289855, "logps/rejected": -402.9830508474576, "loss": 0.172, "rewards/chosen": 1.3115942028985508, "rewards/margins": 7.557356914762957, "rewards/rejected": -6.245762711864407, "step": 1306 }, { "epoch": 0.8961261570106274, "grad_norm": 0.5168438157686703, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249298944.0, "logits/rejected": -251396096.0, "logps/chosen": -267.5, "logps/rejected": -336.75, "loss": 0.1253, "rewards/chosen": 2.1328125, "rewards/margins": 7.8984375, "rewards/rejected": -5.765625, "step": 1307 }, { "epoch": 0.89681179293795, "grad_norm": 0.6143776962726475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -228271817.6969697, "logits/rejected": -233257422.4516129, "logps/chosen": -231.5151515151515, "logps/rejected": -392.258064516129, "loss": 0.1732, "rewards/chosen": 1.1704545454545454, "rewards/margins": 9.726906158357771, "rewards/rejected": -8.556451612903226, "step": 1308 }, { "epoch": 0.8974974288652725, "grad_norm": 0.4894850646246181, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -305345331.2, "logits/rejected": -322229069.2063492, "logps/chosen": -265.84615384615387, "logps/rejected": -307.8095238095238, "loss": 0.1317, "rewards/chosen": 1.8721153846153846, "rewards/margins": 9.229258241758242, "rewards/rejected": -7.357142857142857, "step": 1309 }, { "epoch": 0.8981830647925951, "grad_norm": 0.5238773631075209, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -200945291.63636363, "logits/rejected": -210527000.77419356, "logps/chosen": -286.3030303030303, "logps/rejected": -323.0967741935484, "loss": 0.165, "rewards/chosen": 1.5208333333333333, "rewards/margins": 9.165994623655914, "rewards/rejected": -7.645161290322581, "step": 1310 }, { "epoch": 0.8988687007199178, "grad_norm": 0.6781047052622321, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220278632.2962963, "logits/rejected": -270249209.0810811, "logps/chosen": -274.22222222222223, "logps/rejected": -360.2162162162162, "loss": 0.1436, "rewards/chosen": 1.3587962962962963, "rewards/margins": 8.595282782782784, "rewards/rejected": -7.236486486486487, "step": 1311 }, { "epoch": 0.8995543366472403, "grad_norm": 0.8551419929928411, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -350568692.53731346, "logits/rejected": -316291777.0491803, "logps/chosen": -327.4029850746269, "logps/rejected": -400.0, "loss": 0.1466, "rewards/chosen": 2.126865671641791, "rewards/margins": 8.315390261805726, "rewards/rejected": -6.188524590163935, "step": 1312 }, { "epoch": 0.9002399725745629, "grad_norm": 0.7614680615957682, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272430031.2380952, "logits/rejected": -279018007.63076925, "logps/chosen": -300.44444444444446, "logps/rejected": -390.4, "loss": 0.1594, "rewards/chosen": 1.2956349206349207, "rewards/margins": 9.749481074481075, "rewards/rejected": -8.453846153846154, "step": 1313 }, { "epoch": 0.9009256085018855, "grad_norm": 0.6881201684280511, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276957216.50793654, "logits/rejected": -264821901.7846154, "logps/chosen": -237.71428571428572, "logps/rejected": -416.4923076923077, "loss": 0.166, "rewards/chosen": 1.3779761904761905, "rewards/margins": 9.139514652014652, "rewards/rejected": -7.7615384615384615, "step": 1314 }, { "epoch": 0.901611244429208, "grad_norm": 0.581912450961899, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220707169.10344827, "logits/rejected": -272989271.7714286, "logps/chosen": -225.6551724137931, "logps/rejected": -348.57142857142856, "loss": 0.1511, "rewards/chosen": 1.394396551724138, "rewards/margins": 9.680110837438423, "rewards/rejected": -8.285714285714286, "step": 1315 }, { "epoch": 0.9022968803565307, "grad_norm": 0.5606710796264901, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -301989888.0, "logits/rejected": -273888051.2, "logps/chosen": -367.0, "logps/rejected": -377.2, "loss": 0.1289, "rewards/chosen": 1.8177083333333333, "rewards/margins": 9.817708333333334, "rewards/rejected": -8.0, "step": 1316 }, { "epoch": 0.9029825162838533, "grad_norm": 0.6304405491180322, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252807364.38356164, "logits/rejected": -235643624.72727272, "logps/chosen": -211.2876712328767, "logps/rejected": -406.6909090909091, "loss": 0.1718, "rewards/chosen": 1.4982876712328768, "rewards/margins": 8.489196762141969, "rewards/rejected": -6.990909090909091, "step": 1317 }, { "epoch": 0.9036681522111759, "grad_norm": 0.516812394614714, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234069223.22580644, "logits/rejected": -255725443.87878788, "logps/chosen": -224.6451612903226, "logps/rejected": -397.09090909090907, "loss": 0.1548, "rewards/chosen": 1.9596774193548387, "rewards/margins": 9.883919843597262, "rewards/rejected": -7.924242424242424, "step": 1318 }, { "epoch": 0.9043537881384984, "grad_norm": 0.5286065255697026, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -307054707.9245283, "logits/rejected": -281857228.8, "logps/chosen": -212.0754716981132, "logps/rejected": -328.96, "loss": 0.1475, "rewards/chosen": 1.3360849056603774, "rewards/margins": 8.68941823899371, "rewards/rejected": -7.3533333333333335, "step": 1319 }, { "epoch": 0.905039424065821, "grad_norm": 0.4700531696981173, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -229288618.66666666, "logits/rejected": -279530066.58064514, "logps/chosen": -232.96969696969697, "logps/rejected": -373.16129032258067, "loss": 0.1448, "rewards/chosen": 1.5208333333333333, "rewards/margins": 9.770833333333334, "rewards/rejected": -8.25, "step": 1320 }, { "epoch": 0.9057250599931437, "grad_norm": 0.5187019366830777, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274783591.7837838, "logits/rejected": -388982859.8518519, "logps/chosen": -257.94594594594594, "logps/rejected": -289.48148148148147, "loss": 0.1816, "rewards/chosen": 2.1469594594594597, "rewards/margins": 9.109922422422422, "rewards/rejected": -6.962962962962963, "step": 1321 }, { "epoch": 0.9064106959204662, "grad_norm": 0.6053793993673011, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -275544445.8305085, "logits/rejected": -252630832.23188406, "logps/chosen": -245.5593220338983, "logps/rejected": -337.6231884057971, "loss": 0.1541, "rewards/chosen": 1.3347457627118644, "rewards/margins": 7.961557356914763, "rewards/rejected": -6.6268115942028984, "step": 1322 }, { "epoch": 0.9070963318477888, "grad_norm": 0.6060205554062841, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -222577732.26666668, "logits/rejected": -248204107.29411766, "logps/chosen": -220.4, "logps/rejected": -278.8235294117647, "loss": 0.1507, "rewards/chosen": 2.027083333333333, "rewards/margins": 8.188848039215687, "rewards/rejected": -6.161764705882353, "step": 1323 }, { "epoch": 0.9077819677751114, "grad_norm": 0.5219384084649864, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255965903.56756756, "logits/rejected": -298261617.7777778, "logps/chosen": -302.05405405405406, "logps/rejected": -406.81481481481484, "loss": 0.1605, "rewards/chosen": 2.2449324324324325, "rewards/margins": 8.356043543543542, "rewards/rejected": -6.111111111111111, "step": 1324 }, { "epoch": 0.908467603702434, "grad_norm": 0.6479664644658258, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283085982.64788735, "logits/rejected": -260488353.68421054, "logps/chosen": -320.4507042253521, "logps/rejected": -410.3859649122807, "loss": 0.1703, "rewards/chosen": 1.255281690140845, "rewards/margins": 9.281597479614529, "rewards/rejected": -8.026315789473685, "step": 1325 }, { "epoch": 0.9091532396297566, "grad_norm": 0.5405765644342856, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -319885585.06666666, "logits/rejected": -272136312.4705882, "logps/chosen": -260.4, "logps/rejected": -378.8235294117647, "loss": 0.1401, "rewards/chosen": 1.7895833333333333, "rewards/margins": 9.561642156862746, "rewards/rejected": -7.772058823529412, "step": 1326 }, { "epoch": 0.9098388755570792, "grad_norm": 0.6032907882328262, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -296783165.79310346, "logits/rejected": -245666377.14285713, "logps/chosen": -344.0, "logps/rejected": -380.34285714285716, "loss": 0.1529, "rewards/chosen": 2.2855603448275863, "rewards/margins": 10.364131773399016, "rewards/rejected": -8.07857142857143, "step": 1327 }, { "epoch": 0.9105245114844018, "grad_norm": 0.5997676583371427, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246451517.79310346, "logits/rejected": -210913572.57142857, "logps/chosen": -224.55172413793105, "logps/rejected": -339.42857142857144, "loss": 0.1522, "rewards/chosen": 1.6163793103448276, "rewards/margins": 8.723522167487685, "rewards/rejected": -7.107142857142857, "step": 1328 }, { "epoch": 0.9112101474117243, "grad_norm": 0.7027191547729399, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264981323.29411766, "logits/rejected": -380283562.6666667, "logps/chosen": -276.0, "logps/rejected": -290.4, "loss": 0.1793, "rewards/chosen": 1.0790441176470589, "rewards/margins": 9.037377450980392, "rewards/rejected": -7.958333333333333, "step": 1329 }, { "epoch": 0.911895783339047, "grad_norm": 0.6780994893718859, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260411570.08695653, "logits/rejected": -313364271.7288136, "logps/chosen": -298.4347826086956, "logps/rejected": -400.8135593220339, "loss": 0.1691, "rewards/chosen": 2.036231884057971, "rewards/margins": 9.137926799312208, "rewards/rejected": -7.101694915254237, "step": 1330 }, { "epoch": 0.9125814192663696, "grad_norm": 0.7099452178287461, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -309391600.9411765, "logits/rejected": -332748117.3333333, "logps/chosen": -280.0, "logps/rejected": -384.0, "loss": 0.1566, "rewards/chosen": 1.322610294117647, "rewards/margins": 8.622610294117647, "rewards/rejected": -7.3, "step": 1331 }, { "epoch": 0.9132670551936921, "grad_norm": 0.6808213125675139, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -243879712.5818182, "logits/rejected": -295554791.4520548, "logps/chosen": -308.3636363636364, "logps/rejected": -384.43835616438355, "loss": 0.1284, "rewards/chosen": 2.0056818181818183, "rewards/margins": 9.211161270236612, "rewards/rejected": -7.205479452054795, "step": 1332 }, { "epoch": 0.9139526911210147, "grad_norm": 0.5358312102389821, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -313927522.46153843, "logits/rejected": -327597217.68421054, "logps/chosen": -228.6153846153846, "logps/rejected": -336.0, "loss": 0.1286, "rewards/chosen": 1.7920673076923077, "rewards/margins": 9.502593623481781, "rewards/rejected": -7.7105263157894735, "step": 1333 }, { "epoch": 0.9146383270483374, "grad_norm": 0.5389301211850509, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -386605412.17391306, "logits/rejected": -403506262.779661, "logps/chosen": -282.8985507246377, "logps/rejected": -374.77966101694915, "loss": 0.1521, "rewards/chosen": 1.8423913043478262, "rewards/margins": 10.59662859248342, "rewards/rejected": -8.754237288135593, "step": 1334 }, { "epoch": 0.9153239629756599, "grad_norm": 0.5856166405739042, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239509221.5172414, "logits/rejected": -239434839.77142859, "logps/chosen": -273.6551724137931, "logps/rejected": -370.74285714285713, "loss": 0.1359, "rewards/chosen": 2.0668103448275863, "rewards/margins": 10.745381773399014, "rewards/rejected": -8.678571428571429, "step": 1335 }, { "epoch": 0.9160095989029825, "grad_norm": 0.5520215025294343, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252630832.23188406, "logits/rejected": -392700598.2372881, "logps/chosen": -361.04347826086956, "logps/rejected": -350.64406779661016, "loss": 0.1566, "rewards/chosen": 2.141304347826087, "rewards/margins": 9.963338246131173, "rewards/rejected": -7.822033898305085, "step": 1336 }, { "epoch": 0.9166952348303051, "grad_norm": 0.5793463567181064, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283941670.7878788, "logits/rejected": -251387639.7419355, "logps/chosen": -285.09090909090907, "logps/rejected": -308.38709677419354, "loss": 0.1544, "rewards/chosen": 1.5984848484848484, "rewards/margins": 9.509775171065494, "rewards/rejected": -7.911290322580645, "step": 1337 }, { "epoch": 0.9173808707576278, "grad_norm": 0.5161811956966869, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234082108.95238096, "logits/rejected": -242882465.47692308, "logps/chosen": -224.0, "logps/rejected": -315.0769230769231, "loss": 0.1434, "rewards/chosen": 2.0476190476190474, "rewards/margins": 17261382.970695972, "rewards/rejected": -17261380.923076924, "step": 1338 }, { "epoch": 0.9180665066849503, "grad_norm": 0.5856342204283115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -311101651.86206895, "logits/rejected": -285452346.51428574, "logps/chosen": -204.55172413793105, "logps/rejected": -368.0, "loss": 0.1532, "rewards/chosen": 1.2823275862068966, "rewards/margins": 10.453756157635468, "rewards/rejected": -9.17142857142857, "step": 1339 }, { "epoch": 0.9187521426122729, "grad_norm": 0.5973137374623976, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255590400.0, "logits/rejected": -300679168.0, "logps/chosen": -303.75, "logps/rejected": -381.0, "loss": 0.1346, "rewards/chosen": 1.935546875, "rewards/margins": 9.896484375, "rewards/rejected": -7.9609375, "step": 1340 }, { "epoch": 0.9194377785395955, "grad_norm": 0.6845485769688306, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -290633003.3230769, "logits/rejected": -356848721.26984125, "logps/chosen": -246.64615384615385, "logps/rejected": -400.25396825396825, "loss": 0.1539, "rewards/chosen": 1.6961538461538461, "rewards/margins": 6.251709401709402, "rewards/rejected": -4.555555555555555, "step": 1341 }, { "epoch": 0.920123414466918, "grad_norm": 0.6002001700379875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281515997.2881356, "logits/rejected": -262356754.55072463, "logps/chosen": -282.3050847457627, "logps/rejected": -359.18840579710144, "loss": 0.1542, "rewards/chosen": 1.2372881355932204, "rewards/margins": 9.585114222549741, "rewards/rejected": -8.347826086956522, "step": 1342 }, { "epoch": 0.9208090503942407, "grad_norm": 0.7166485956365891, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253981239.13846153, "logits/rejected": -257516950.34920636, "logps/chosen": -229.16923076923078, "logps/rejected": -320.0, "loss": 0.1654, "rewards/chosen": 1.5807692307692307, "rewards/margins": 7.437912087912087, "rewards/rejected": -5.857142857142857, "step": 1343 }, { "epoch": 0.9214946863215633, "grad_norm": 0.6522306683501058, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250004148.28169015, "logits/rejected": -271673164.35087717, "logps/chosen": -353.1267605633803, "logps/rejected": -324.49122807017545, "loss": 0.171, "rewards/chosen": 2.102992957746479, "rewards/margins": 9.567905238448233, "rewards/rejected": -7.464912280701754, "step": 1344 }, { "epoch": 0.9221803222488858, "grad_norm": 0.5709594697052831, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -278702110.5671642, "logits/rejected": -239281605.24590164, "logps/chosen": -239.76119402985074, "logps/rejected": -370.3606557377049, "loss": 0.1567, "rewards/chosen": 1.5914179104477613, "rewards/margins": 9.976663812087105, "rewards/rejected": -8.385245901639344, "step": 1345 }, { "epoch": 0.9228659581762084, "grad_norm": 0.6668302142845998, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -209430840.40677965, "logits/rejected": -266490271.53623188, "logps/chosen": -286.64406779661016, "logps/rejected": -346.4347826086956, "loss": 0.1618, "rewards/chosen": 1.6027542372881356, "rewards/margins": 9.50855133873741, "rewards/rejected": -7.905797101449275, "step": 1346 }, { "epoch": 0.923551594103531, "grad_norm": 0.6105062418632907, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288847735.46666664, "logits/rejected": -294588175.0588235, "logps/chosen": -299.46666666666664, "logps/rejected": -418.8235294117647, "loss": 0.1349, "rewards/chosen": 1.6416666666666666, "rewards/margins": 10.891666666666666, "rewards/rejected": -9.25, "step": 1347 }, { "epoch": 0.9242372300308536, "grad_norm": 0.5919907012781214, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -279366066.42424244, "logits/rejected": -254905443.0967742, "logps/chosen": -231.5151515151515, "logps/rejected": -377.03225806451616, "loss": 0.1524, "rewards/chosen": 1.2708333333333333, "rewards/margins": 9.367607526881722, "rewards/rejected": -8.096774193548388, "step": 1348 }, { "epoch": 0.9249228659581762, "grad_norm": 0.7546035804965538, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273940480.0, "logits/rejected": -266338304.0, "logps/chosen": -238.5, "logps/rejected": -372.0, "loss": 0.1387, "rewards/chosen": 1.791015625, "rewards/margins": 10.697265625, "rewards/rejected": -8.90625, "step": 1349 }, { "epoch": 0.9256085018854988, "grad_norm": 0.656596500723767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246565156.57142857, "logits/rejected": -220200960.0, "logps/chosen": -225.71428571428572, "logps/rejected": -333.3333333333333, "loss": 0.1475, "rewards/chosen": 1.2064732142857142, "rewards/margins": 9.338417658730158, "rewards/rejected": -8.131944444444445, "step": 1350 }, { "epoch": 0.9262941378128214, "grad_norm": 0.6384041136291316, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -210004462.3448276, "logits/rejected": -206839105.82857144, "logps/chosen": -319.17241379310343, "logps/rejected": -405.48571428571427, "loss": 0.16, "rewards/chosen": 1.552801724137931, "rewards/margins": 10.517087438423644, "rewards/rejected": -8.964285714285714, "step": 1351 }, { "epoch": 0.926979773740144, "grad_norm": 0.5179368868186497, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -249860681.14285713, "logits/rejected": -245133767.1111111, "logps/chosen": -200.85714285714286, "logps/rejected": -349.77777777777777, "loss": 0.1308, "rewards/chosen": 1.5739397321428572, "rewards/margins": 9.907273065476192, "rewards/rejected": -8.333333333333334, "step": 1352 }, { "epoch": 0.9276654096674666, "grad_norm": 0.8869465385592789, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262044135.6190476, "logits/rejected": -222233584.24615383, "logps/chosen": -292.8253968253968, "logps/rejected": -345.6, "loss": 0.1466, "rewards/chosen": 1.6408730158730158, "rewards/margins": 10.656257631257631, "rewards/rejected": -9.015384615384615, "step": 1353 }, { "epoch": 0.9283510455947892, "grad_norm": 0.5171688381909696, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -224598214.19354838, "logits/rejected": -281908068.8484849, "logps/chosen": -291.48387096774195, "logps/rejected": -415.030303030303, "loss": 0.1449, "rewards/chosen": 1.657258064516129, "rewards/margins": 10.339076246334312, "rewards/rejected": -8.681818181818182, "step": 1354 }, { "epoch": 0.9290366815221117, "grad_norm": 0.5021856027028533, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -259907037.86666667, "logits/rejected": -275837168.9411765, "logps/chosen": -234.93333333333334, "logps/rejected": -352.2352941176471, "loss": 0.1453, "rewards/chosen": 1.9625, "rewards/margins": 8.844852941176471, "rewards/rejected": -6.882352941176471, "step": 1355 }, { "epoch": 0.9297223174494343, "grad_norm": 0.6555026954468716, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -318167917.71428573, "logits/rejected": -325757610.6666667, "logps/chosen": -267.7142857142857, "logps/rejected": -441.3333333333333, "loss": 0.1653, "rewards/chosen": 1.5948660714285714, "rewards/margins": 8.60875496031746, "rewards/rejected": -7.013888888888889, "step": 1356 }, { "epoch": 0.930407953376757, "grad_norm": 0.5047369523379377, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250833131.01639345, "logits/rejected": -285963890.6268657, "logps/chosen": -264.91803278688525, "logps/rejected": -374.92537313432837, "loss": 0.1255, "rewards/chosen": 2.2049180327868854, "rewards/margins": 10.89148519696599, "rewards/rejected": -8.686567164179104, "step": 1357 }, { "epoch": 0.9310935893040795, "grad_norm": 0.7172966356305874, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268177344.9846154, "logits/rejected": -271098506.15873015, "logps/chosen": -215.13846153846154, "logps/rejected": -384.76190476190476, "loss": 0.1624, "rewards/chosen": 1.7653846153846153, "rewards/margins": 10.40030525030525, "rewards/rejected": -8.634920634920634, "step": 1358 }, { "epoch": 0.9317792252314021, "grad_norm": 0.5740255078277967, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -325027259.2238806, "logits/rejected": -262109620.45901638, "logps/chosen": -164.29850746268656, "logps/rejected": -408.655737704918, "loss": 0.1735, "rewards/chosen": 0.9869402985074627, "rewards/margins": 9.355792757523856, "rewards/rejected": -8.368852459016393, "step": 1359 }, { "epoch": 0.9324648611587247, "grad_norm": 0.6437518760618764, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -358545341.9354839, "logits/rejected": -352321536.0, "logps/chosen": -266.5806451612903, "logps/rejected": -372.3636363636364, "loss": 0.1532, "rewards/chosen": 1.3366935483870968, "rewards/margins": 9.215481427174975, "rewards/rejected": -7.878787878787879, "step": 1360 }, { "epoch": 0.9331504970860474, "grad_norm": 0.6151485500434772, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -323560594.28571427, "logits/rejected": -269303243.0344828, "logps/chosen": -317.0285714285714, "logps/rejected": -403.86206896551727, "loss": 0.1765, "rewards/chosen": 1.7767857142857142, "rewards/margins": 11.173337438423644, "rewards/rejected": -9.39655172413793, "step": 1361 }, { "epoch": 0.9338361330133699, "grad_norm": 0.4264248397872292, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252199440.51612905, "logits/rejected": -318258703.5151515, "logps/chosen": -217.03225806451613, "logps/rejected": -379.6363636363636, "loss": 0.1494, "rewards/chosen": 1.125, "rewards/margins": 9.125, "rewards/rejected": -8.0, "step": 1362 }, { "epoch": 0.9345217689406925, "grad_norm": 0.6205633319999877, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253755392.0, "logits/rejected": -267911168.0, "logps/chosen": -252.75, "logps/rejected": -462.0, "loss": 0.166, "rewards/chosen": 1.638671875, "rewards/margins": 8.958984375, "rewards/rejected": -7.3203125, "step": 1363 }, { "epoch": 0.9352074048680151, "grad_norm": 0.5087871476738202, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -244893233.5483871, "logits/rejected": -213528203.63636363, "logps/chosen": -283.8709677419355, "logps/rejected": -410.6666666666667, "loss": 0.1399, "rewards/chosen": 1.954133064516129, "rewards/margins": 12.090496700879765, "rewards/rejected": -10.136363636363637, "step": 1364 }, { "epoch": 0.9358930407953376, "grad_norm": 0.5212158052131939, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256207993.4915254, "logits/rejected": -225884545.85507247, "logps/chosen": -272.8135593220339, "logps/rejected": -331.59420289855075, "loss": 0.1401, "rewards/chosen": 1.88135593220339, "rewards/margins": 9.859616801768608, "rewards/rejected": -7.978260869565218, "step": 1365 }, { "epoch": 0.9365786767226603, "grad_norm": 0.5867737495114602, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263254256.94117647, "logits/rejected": -302549128.53333336, "logps/chosen": -319.29411764705884, "logps/rejected": -374.93333333333334, "loss": 0.148, "rewards/chosen": 2.014705882352941, "rewards/margins": 11.523039215686275, "rewards/rejected": -9.508333333333333, "step": 1366 }, { "epoch": 0.9372643126499829, "grad_norm": 0.499589880416045, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -214163704.24242425, "logits/rejected": -268976656.516129, "logps/chosen": -208.36363636363637, "logps/rejected": -399.48387096774195, "loss": 0.1828, "rewards/chosen": 1.3863636363636365, "rewards/margins": 10.854105571847507, "rewards/rejected": -9.46774193548387, "step": 1367 }, { "epoch": 0.9379499485773054, "grad_norm": 0.44874786445838755, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -209715200.0, "logits/rejected": -239308344.8888889, "logps/chosen": -168.28571428571428, "logps/rejected": -389.3333333333333, "loss": 0.1191, "rewards/chosen": 1.8013392857142858, "rewards/margins": 10.461061507936508, "rewards/rejected": -8.659722222222221, "step": 1368 }, { "epoch": 0.938635584504628, "grad_norm": 0.518995858645547, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238211794.82352942, "logits/rejected": -240753049.6, "logps/chosen": -238.11764705882354, "logps/rejected": -336.26666666666665, "loss": 0.1582, "rewards/chosen": 1.8897058823529411, "rewards/margins": 10.798039215686273, "rewards/rejected": -8.908333333333333, "step": 1369 }, { "epoch": 0.9393212204319507, "grad_norm": 0.6177654766153367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256258444.38709676, "logits/rejected": -280128667.1515151, "logps/chosen": -313.80645161290323, "logps/rejected": -325.8181818181818, "loss": 0.1446, "rewards/chosen": 1.6139112903225807, "rewards/margins": 8.045729472140762, "rewards/rejected": -6.431818181818182, "step": 1370 }, { "epoch": 0.9400068563592733, "grad_norm": 0.5132469278564068, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -295733976.9491525, "logits/rejected": -303935072.4637681, "logps/chosen": -248.135593220339, "logps/rejected": -390.95652173913044, "loss": 0.1446, "rewards/chosen": 1.7648305084745763, "rewards/margins": 10.177873986735445, "rewards/rejected": -8.41304347826087, "step": 1371 }, { "epoch": 0.9406924922865958, "grad_norm": 0.5513793793586068, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -331549744.7619048, "logits/rejected": -252948795.07692307, "logps/chosen": -279.87301587301585, "logps/rejected": -334.2769230769231, "loss": 0.137, "rewards/chosen": 2.0833333333333335, "rewards/margins": 10.49871794871795, "rewards/rejected": -8.415384615384616, "step": 1372 }, { "epoch": 0.9413781282139184, "grad_norm": 0.7515499731101839, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -256764349.2173913, "logits/rejected": -282369076.0677966, "logps/chosen": -261.7971014492754, "logps/rejected": -373.1525423728813, "loss": 0.1705, "rewards/chosen": 1.6539855072463767, "rewards/margins": 10.221782117415868, "rewards/rejected": -8.567796610169491, "step": 1373 }, { "epoch": 0.942063764141241, "grad_norm": 0.498847269909831, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253858530.62295082, "logits/rejected": -245898897.19402984, "logps/chosen": -263.344262295082, "logps/rejected": -327.1641791044776, "loss": 0.1303, "rewards/chosen": 2.1024590163934427, "rewards/margins": 9.594996329826278, "rewards/rejected": -7.492537313432836, "step": 1374 }, { "epoch": 0.9427494000685636, "grad_norm": 0.5122442441095998, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218358008.24242425, "logits/rejected": -372887155.61290324, "logps/chosen": -322.42424242424244, "logps/rejected": -321.5483870967742, "loss": 0.1579, "rewards/chosen": 1.378787878787879, "rewards/margins": 9.354594330400783, "rewards/rejected": -7.975806451612903, "step": 1375 }, { "epoch": 0.9434350359958862, "grad_norm": 0.5862831907355319, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246065834.66666666, "logits/rejected": -283238881.88235295, "logps/chosen": -269.06666666666666, "logps/rejected": -389.1764705882353, "loss": 0.1503, "rewards/chosen": 1.6604166666666667, "rewards/margins": 9.564828431372549, "rewards/rejected": -7.904411764705882, "step": 1376 }, { "epoch": 0.9441206719232088, "grad_norm": 0.6203131653217947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -284969523.942029, "logits/rejected": -253364397.55932203, "logps/chosen": -275.94202898550725, "logps/rejected": -445.2881355932203, "loss": 0.1756, "rewards/chosen": 1.4528985507246377, "rewards/margins": 9.741034143944976, "rewards/rejected": -8.288135593220339, "step": 1377 }, { "epoch": 0.9448063078505313, "grad_norm": 0.6948988938840249, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -231830621.0909091, "logits/rejected": -238940027.87096775, "logps/chosen": -272.72727272727275, "logps/rejected": -311.2258064516129, "loss": 0.1602, "rewards/chosen": 1.581439393939394, "rewards/margins": 9.702407135874878, "rewards/rejected": -8.120967741935484, "step": 1378 }, { "epoch": 0.945491943777854, "grad_norm": 0.5042932901480578, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -253718599.85964912, "logits/rejected": -285448970.8169014, "logps/chosen": -277.6140350877193, "logps/rejected": -369.1267605633803, "loss": 0.1233, "rewards/chosen": 1.9100877192982457, "rewards/margins": 11.438256733382755, "rewards/rejected": -9.528169014084508, "step": 1379 }, { "epoch": 0.9461775797051766, "grad_norm": 0.586455143456539, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246295031.60655737, "logits/rejected": -225115181.85074627, "logps/chosen": -253.9016393442623, "logps/rejected": -383.5223880597015, "loss": 0.1511, "rewards/chosen": 1.6951844262295082, "rewards/margins": 10.620557560557867, "rewards/rejected": -8.925373134328359, "step": 1380 }, { "epoch": 0.9468632156324991, "grad_norm": 0.60693414323499, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263641965.7142857, "logits/rejected": -261203897.37931034, "logps/chosen": -247.77142857142857, "logps/rejected": -390.0689655172414, "loss": 0.1502, "rewards/chosen": 1.8732142857142857, "rewards/margins": 11.14045566502463, "rewards/rejected": -9.267241379310345, "step": 1381 }, { "epoch": 0.9475488515598217, "grad_norm": 0.5660126300561685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238731532.59016395, "logits/rejected": -297983388.6567164, "logps/chosen": -263.344262295082, "logps/rejected": -333.3731343283582, "loss": 0.1561, "rewards/chosen": 1.396516393442623, "rewards/margins": 9.769650721800831, "rewards/rejected": -8.373134328358208, "step": 1382 }, { "epoch": 0.9482344874871443, "grad_norm": 0.5548887271888958, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246399709.6119403, "logits/rejected": -244094740.98360655, "logps/chosen": -229.73134328358208, "logps/rejected": -304.5245901639344, "loss": 0.1654, "rewards/chosen": 1.2145522388059702, "rewards/margins": 9.058814533887938, "rewards/rejected": -7.844262295081967, "step": 1383 }, { "epoch": 0.948920123414467, "grad_norm": 0.7223103932500075, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -232339021.57575756, "logits/rejected": -259776247.7419355, "logps/chosen": -253.57575757575756, "logps/rejected": -350.4516129032258, "loss": 0.1528, "rewards/chosen": 1.3371212121212122, "rewards/margins": 9.893572825024439, "rewards/rejected": -8.556451612903226, "step": 1384 }, { "epoch": 0.9496057593417895, "grad_norm": 0.49398327005343223, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -306115432.91803277, "logits/rejected": -290471202.3880597, "logps/chosen": -314.4918032786885, "logps/rejected": -339.5820895522388, "loss": 0.1331, "rewards/chosen": 1.209016393442623, "rewards/margins": 9.395583557621727, "rewards/rejected": -8.186567164179104, "step": 1385 }, { "epoch": 0.9502913952691121, "grad_norm": 0.5914502223962315, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -263454720.0, "logits/rejected": -268435456.0, "logps/chosen": -200.0, "logps/rejected": -371.5, "loss": 0.1829, "rewards/chosen": 1.0283203125, "rewards/margins": 9.7236328125, "rewards/rejected": -8.6953125, "step": 1386 }, { "epoch": 0.9509770311964347, "grad_norm": 0.5215121882965954, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -302741106.6268657, "logits/rejected": -290163325.90163934, "logps/chosen": -263.64179104477614, "logps/rejected": -316.59016393442624, "loss": 0.1334, "rewards/chosen": 2.0615671641791047, "rewards/margins": 10.323862246146318, "rewards/rejected": -8.262295081967213, "step": 1387 }, { "epoch": 0.9516626671237572, "grad_norm": 0.49449173740060043, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -247855404.37333333, "logits/rejected": -301673336.754717, "logps/chosen": -230.61333333333334, "logps/rejected": -349.8867924528302, "loss": 0.1748, "rewards/chosen": 1.7233333333333334, "rewards/margins": 10.355408805031447, "rewards/rejected": -8.632075471698114, "step": 1388 }, { "epoch": 0.9523483030510799, "grad_norm": 0.45021391945250055, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252409458.62686568, "logits/rejected": -317941995.0163934, "logps/chosen": -215.88059701492537, "logps/rejected": -339.40983606557376, "loss": 0.1359, "rewards/chosen": 1.3619402985074627, "rewards/margins": 9.329153413261562, "rewards/rejected": -7.967213114754099, "step": 1389 }, { "epoch": 0.9530339389784025, "grad_norm": 0.6141477537465782, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288142027.17460316, "logits/rejected": -294762779.5692308, "logps/chosen": -231.36507936507937, "logps/rejected": -346.83076923076925, "loss": 0.1332, "rewards/chosen": 1.503968253968254, "rewards/margins": 9.788583638583638, "rewards/rejected": -8.284615384615385, "step": 1390 }, { "epoch": 0.953719574905725, "grad_norm": 0.5745454824302259, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -276953119.50769234, "logits/rejected": -374424852.3174603, "logps/chosen": -197.16923076923078, "logps/rejected": -374.85714285714283, "loss": 0.1417, "rewards/chosen": 1.3173076923076923, "rewards/margins": 10.222069597069597, "rewards/rejected": -8.904761904761905, "step": 1391 }, { "epoch": 0.9544052108330476, "grad_norm": 0.5863293118321844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -250846439.22580644, "logits/rejected": -360201743.5151515, "logps/chosen": -289.03225806451616, "logps/rejected": -356.3636363636364, "loss": 0.1693, "rewards/chosen": 1.4340977822580645, "rewards/margins": 8.562885661045943, "rewards/rejected": -7.128787878787879, "step": 1392 }, { "epoch": 0.9550908467603703, "grad_norm": 0.520248474721291, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248632840.39344263, "logits/rejected": -243394835.1044776, "logps/chosen": -243.14754098360655, "logps/rejected": -300.4179104477612, "loss": 0.1469, "rewards/chosen": 1.8790983606557377, "rewards/margins": 9.117904330804992, "rewards/rejected": -7.2388059701492535, "step": 1393 }, { "epoch": 0.9557764826876929, "grad_norm": 0.5604459797563909, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260046848.0, "logits/rejected": -289931264.0, "logps/chosen": -250.5, "logps/rejected": -364.25, "loss": 0.1647, "rewards/chosen": 2.0390625, "rewards/margins": 9.4765625, "rewards/rejected": -7.4375, "step": 1394 }, { "epoch": 0.9564621186150154, "grad_norm": 0.6088616933855835, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283348536.8888889, "logits/rejected": -262757013.66153845, "logps/chosen": -212.31746031746033, "logps/rejected": -370.7076923076923, "loss": 0.1682, "rewards/chosen": 1.1145833333333333, "rewards/margins": 6.953044871794871, "rewards/rejected": -5.838461538461538, "step": 1395 }, { "epoch": 0.957147754542338, "grad_norm": 0.6633556224732327, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -262522174.95081967, "logits/rejected": -234129805.37313432, "logps/chosen": -223.60655737704917, "logps/rejected": -353.43283582089555, "loss": 0.1667, "rewards/chosen": 1.0, "rewards/margins": 8.835820895522389, "rewards/rejected": -7.835820895522388, "step": 1396 }, { "epoch": 0.9578333904696607, "grad_norm": 0.5182832630201452, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251391934.98412699, "logits/rejected": -285470783.0153846, "logps/chosen": -254.6031746031746, "logps/rejected": -362.33846153846156, "loss": 0.1736, "rewards/chosen": 1.3174603174603174, "rewards/margins": 7.463614163614164, "rewards/rejected": -6.1461538461538465, "step": 1397 }, { "epoch": 0.9585190263969832, "grad_norm": 0.5272866254499892, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -329672294.4, "logits/rejected": -257579610.3529412, "logps/chosen": -278.93333333333334, "logps/rejected": -470.5882352941176, "loss": 0.1522, "rewards/chosen": 1.7229166666666667, "rewards/margins": 9.046446078431373, "rewards/rejected": -7.323529411764706, "step": 1398 }, { "epoch": 0.9592046623243058, "grad_norm": 0.5363298655913118, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -359548644.4307692, "logits/rejected": -326489949.46031743, "logps/chosen": -221.53846153846155, "logps/rejected": -330.1587301587302, "loss": 0.1671, "rewards/chosen": 1.0596153846153846, "rewards/margins": 8.758028083028083, "rewards/rejected": -7.698412698412699, "step": 1399 }, { "epoch": 0.9598902982516284, "grad_norm": 0.5503763945690417, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -199401337.70491803, "logits/rejected": -226868025.31343284, "logps/chosen": -200.78688524590163, "logps/rejected": -362.5074626865672, "loss": 0.1374, "rewards/chosen": 1.6813524590163935, "rewards/margins": 9.755979324688035, "rewards/rejected": -8.074626865671641, "step": 1400 }, { "epoch": 0.9605759341789509, "grad_norm": 0.7135289513422257, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -293312017.6551724, "logits/rejected": -352800885.0285714, "logps/chosen": -295.44827586206895, "logps/rejected": -390.4, "loss": 0.1237, "rewards/chosen": 2.0269396551724137, "rewards/margins": 10.276939655172413, "rewards/rejected": -8.25, "step": 1401 }, { "epoch": 0.9612615701062736, "grad_norm": 0.5276495874490439, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238875599.23809522, "logits/rejected": -225072805.41538462, "logps/chosen": -266.2857142857143, "logps/rejected": -363.81538461538463, "loss": 0.1509, "rewards/chosen": 2.1339285714285716, "rewards/margins": 9.618543956043956, "rewards/rejected": -7.484615384615385, "step": 1402 }, { "epoch": 0.9619472060335962, "grad_norm": 0.6527846987075122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -272891904.0, "logits/rejected": -365428736.0, "logps/chosen": -241.5, "logps/rejected": -448.0, "loss": 0.1456, "rewards/chosen": 2.0625, "rewards/margins": 10.6640625, "rewards/rejected": -8.6015625, "step": 1403 }, { "epoch": 0.9626328419609188, "grad_norm": 0.5066999647098647, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -273269569.08474576, "logits/rejected": -325818397.68115944, "logps/chosen": -271.1864406779661, "logps/rejected": -440.3478260869565, "loss": 0.1493, "rewards/chosen": 1.4809322033898304, "rewards/margins": 9.3939756816507, "rewards/rejected": -7.913043478260869, "step": 1404 }, { "epoch": 0.9633184778882413, "grad_norm": 0.6809370673674533, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -349966839.01754385, "logits/rejected": -337907308.1690141, "logps/chosen": -264.140350877193, "logps/rejected": -420.50704225352115, "loss": 0.1339, "rewards/chosen": 1.4298245614035088, "rewards/margins": 10.993204843093649, "rewards/rejected": -9.56338028169014, "step": 1405 }, { "epoch": 0.964004113815564, "grad_norm": 0.6328255422444694, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -269881767.7241379, "logits/rejected": -277063738.51428574, "logps/chosen": -219.58620689655172, "logps/rejected": -336.0, "loss": 0.1609, "rewards/chosen": 1.2133620689655173, "rewards/margins": 10.070504926108375, "rewards/rejected": -8.857142857142858, "step": 1406 }, { "epoch": 0.9646897497428866, "grad_norm": 0.5694819646091389, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -299892736.0, "logits/rejected": -332336911.0588235, "logps/chosen": -250.93333333333334, "logps/rejected": -397.1764705882353, "loss": 0.1348, "rewards/chosen": 1.2354166666666666, "rewards/margins": 9.573651960784314, "rewards/rejected": -8.338235294117647, "step": 1407 }, { "epoch": 0.9653753856702091, "grad_norm": 0.5759656554636509, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -252190850.03174603, "logits/rejected": -234106690.95384616, "logps/chosen": -200.88888888888889, "logps/rejected": -396.8, "loss": 0.1401, "rewards/chosen": 1.5099206349206349, "rewards/margins": 10.517612942612942, "rewards/rejected": -9.007692307692308, "step": 1408 }, { "epoch": 0.9660610215975317, "grad_norm": 0.7702636267332196, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -238862058.30508474, "logits/rejected": -278161378.31884056, "logps/chosen": -226.71186440677965, "logps/rejected": -386.3188405797101, "loss": 0.1588, "rewards/chosen": 1.5858050847457628, "rewards/margins": 8.716239867354458, "rewards/rejected": -7.130434782608695, "step": 1409 }, { "epoch": 0.9667466575248543, "grad_norm": 0.7131661126629687, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -283740986.38596493, "logits/rejected": -253548630.53521127, "logps/chosen": -245.05263157894737, "logps/rejected": -344.3380281690141, "loss": 0.147, "rewards/chosen": 1.0592105263157894, "rewards/margins": 10.650759822090437, "rewards/rejected": -9.591549295774648, "step": 1410 }, { "epoch": 0.9674322934521769, "grad_norm": 0.5628555296385014, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -337003208.34782606, "logits/rejected": -341800231.0508475, "logps/chosen": -303.30434782608694, "logps/rejected": -402.9830508474576, "loss": 0.1526, "rewards/chosen": 2.5579710144927534, "rewards/margins": 9.269835421272415, "rewards/rejected": -6.711864406779661, "step": 1411 }, { "epoch": 0.9681179293794995, "grad_norm": 0.6379984559264623, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -341504646.7368421, "logits/rejected": -312959606.15384614, "logps/chosen": -277.2631578947368, "logps/rejected": -412.9230769230769, "loss": 0.1785, "rewards/chosen": 1.6957236842105263, "rewards/margins": 10.95533906882591, "rewards/rejected": -9.259615384615385, "step": 1412 }, { "epoch": 0.9688035653068221, "grad_norm": 0.5091879106788324, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -260692125.53846154, "logits/rejected": -363506346.6666667, "logps/chosen": -214.15384615384616, "logps/rejected": -327.1111111111111, "loss": 0.1519, "rewards/chosen": 1.7538461538461538, "rewards/margins": 8.26971916971917, "rewards/rejected": -6.515873015873016, "step": 1413 }, { "epoch": 0.9694892012341446, "grad_norm": 0.5471149677087599, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -212618948.92307693, "logits/rejected": -216779290.9473684, "logps/chosen": -202.76923076923077, "logps/rejected": -379.7894736842105, "loss": 0.1505, "rewards/chosen": 1.328125, "rewards/margins": 8.716282894736842, "rewards/rejected": -7.3881578947368425, "step": 1414 }, { "epoch": 0.9701748371614672, "grad_norm": 0.5246532388761391, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267082454.70967743, "logits/rejected": -266910254.54545453, "logps/chosen": -317.6774193548387, "logps/rejected": -445.57575757575756, "loss": 0.1671, "rewards/chosen": 1.4359879032258065, "rewards/margins": 9.776896994134898, "rewards/rejected": -8.340909090909092, "step": 1415 }, { "epoch": 0.9708604730887899, "grad_norm": 0.5584379409237732, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -268928903.5294118, "logits/rejected": -240193809.06666666, "logps/chosen": -232.7058823529412, "logps/rejected": -342.93333333333334, "loss": 0.1589, "rewards/chosen": 1.5018382352941178, "rewards/margins": 10.23517156862745, "rewards/rejected": -8.733333333333333, "step": 1416 }, { "epoch": 0.9715461090161125, "grad_norm": 0.5725102585847509, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -327843302.8196721, "logits/rejected": -345560568.35820895, "logps/chosen": -214.29508196721312, "logps/rejected": -318.56716417910445, "loss": 0.1232, "rewards/chosen": 1.778688524590164, "rewards/margins": 8.114509420112551, "rewards/rejected": -6.335820895522388, "step": 1417 }, { "epoch": 0.972231744943435, "grad_norm": 0.7469799628030926, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -284969523.942029, "logits/rejected": -308530158.6440678, "logps/chosen": -226.7826086956522, "logps/rejected": -413.2881355932203, "loss": 0.1522, "rewards/chosen": 1.7536231884057971, "rewards/margins": 9.499385900270203, "rewards/rejected": -7.745762711864407, "step": 1418 }, { "epoch": 0.9729173808707576, "grad_norm": 0.48411556415091117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297675746.74285716, "logits/rejected": -316742267.5862069, "logps/chosen": -286.4, "logps/rejected": -348.6896551724138, "loss": 0.1558, "rewards/chosen": 1.4321428571428572, "rewards/margins": 10.009729064039409, "rewards/rejected": -8.577586206896552, "step": 1419 }, { "epoch": 0.9736030167980803, "grad_norm": 0.545988913157032, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -241646030.4516129, "logits/rejected": -234372623.5151515, "logps/chosen": -305.5483870967742, "logps/rejected": -354.90909090909093, "loss": 0.1422, "rewards/chosen": 1.8366935483870968, "rewards/margins": 10.041239002932551, "rewards/rejected": -8.204545454545455, "step": 1420 }, { "epoch": 0.9742886527254028, "grad_norm": 0.637215655904867, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264988082.84931508, "logits/rejected": -305650371.4909091, "logps/chosen": -250.95890410958904, "logps/rejected": -334.25454545454545, "loss": 0.1821, "rewards/chosen": 1.5496575342465753, "rewards/margins": 9.676930261519303, "rewards/rejected": -8.127272727272727, "step": 1421 }, { "epoch": 0.9749742886527254, "grad_norm": 0.5518175884358099, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -320269541.25373137, "logits/rejected": -261284511.47540984, "logps/chosen": -207.76119402985074, "logps/rejected": -346.2295081967213, "loss": 0.1501, "rewards/chosen": 1.7929104477611941, "rewards/margins": 10.309303890384145, "rewards/rejected": -8.51639344262295, "step": 1422 }, { "epoch": 0.975659924580048, "grad_norm": 0.5180096009964169, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267699613.19298247, "logits/rejected": -274342926.42253524, "logps/chosen": -298.6666666666667, "logps/rejected": -324.9577464788732, "loss": 0.1263, "rewards/chosen": 2.0241228070175437, "rewards/margins": 9.115672102792193, "rewards/rejected": -7.091549295774648, "step": 1423 }, { "epoch": 0.9763455605073705, "grad_norm": 0.5851575436769634, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220268610.06451613, "logits/rejected": -228017617.45454547, "logps/chosen": -247.74193548387098, "logps/rejected": -366.54545454545456, "loss": 0.1454, "rewards/chosen": 1.5826612903225807, "rewards/margins": 10.749327956989246, "rewards/rejected": -9.166666666666666, "step": 1424 }, { "epoch": 0.9770311964346932, "grad_norm": 0.5000818979302928, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -300072491.8857143, "logits/rejected": -279861318.62068963, "logps/chosen": -203.65714285714284, "logps/rejected": -458.7586206896552, "loss": 0.1471, "rewards/chosen": 1.8678571428571429, "rewards/margins": 10.531650246305418, "rewards/rejected": -8.663793103448276, "step": 1425 }, { "epoch": 0.9777168323620158, "grad_norm": 0.5141855189212862, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -246153216.0, "logits/rejected": -281542656.0, "logps/chosen": -225.25, "logps/rejected": -319.5, "loss": 0.1415, "rewards/chosen": 1.720703125, "rewards/margins": 10.595703125, "rewards/rejected": -8.875, "step": 1426 }, { "epoch": 0.9784024682893384, "grad_norm": 0.6329537516814829, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -274726912.0, "logits/rejected": -324534272.0, "logps/chosen": -363.0, "logps/rejected": -347.5, "loss": 0.1553, "rewards/chosen": 1.8369140625, "rewards/margins": 9.1806640625, "rewards/rejected": -7.34375, "step": 1427 }, { "epoch": 0.9790881042166609, "grad_norm": 0.581006772457246, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -234618880.0, "logits/rejected": -288096256.0, "logps/chosen": -300.25, "logps/rejected": -394.5, "loss": 0.193, "rewards/chosen": 0.701171875, "rewards/margins": 8.326171875, "rewards/rejected": -7.625, "step": 1428 }, { "epoch": 0.9797737401439836, "grad_norm": 0.5809776096074717, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297119083.3548387, "logits/rejected": -288263074.90909094, "logps/chosen": -249.03225806451613, "logps/rejected": -367.5151515151515, "loss": 0.1325, "rewards/chosen": 2.3528225806451615, "rewards/margins": 10.352822580645162, "rewards/rejected": -8.0, "step": 1429 }, { "epoch": 0.9804593760713062, "grad_norm": 0.5440540291402846, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265974797.65333334, "logits/rejected": -283946467.0188679, "logps/chosen": -271.7866666666667, "logps/rejected": -431.6981132075472, "loss": 0.1477, "rewards/chosen": 1.92, "rewards/margins": 11.22188679245283, "rewards/rejected": -9.30188679245283, "step": 1430 }, { "epoch": 0.9811450119986287, "grad_norm": 0.47914119224273355, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -218398145.12280703, "logits/rejected": -266781364.28169015, "logps/chosen": -238.59649122807016, "logps/rejected": -300.16901408450707, "loss": 0.1323, "rewards/chosen": 1.694078947368421, "rewards/margins": 9.870135285396591, "rewards/rejected": -8.17605633802817, "step": 1431 }, { "epoch": 0.9818306479259513, "grad_norm": 0.6039357447642143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -288008874.6666667, "logits/rejected": -290895277.41935486, "logps/chosen": -337.2121212121212, "logps/rejected": -368.0, "loss": 0.1433, "rewards/chosen": 2.3882575757575757, "rewards/margins": 9.47696725317693, "rewards/rejected": -7.088709677419355, "step": 1432 }, { "epoch": 0.9825162838532739, "grad_norm": 0.5626946349290689, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -227066978.19178084, "logits/rejected": -250285558.6909091, "logps/chosen": -246.7945205479452, "logps/rejected": -331.92727272727274, "loss": 0.1724, "rewards/chosen": 1.6335616438356164, "rewards/margins": 10.233561643835616, "rewards/rejected": -8.6, "step": 1433 }, { "epoch": 0.9832019197805965, "grad_norm": 0.5402609214117889, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255552950.85714287, "logits/rejected": -245133767.1111111, "logps/chosen": -253.14285714285714, "logps/rejected": -407.55555555555554, "loss": 0.1245, "rewards/chosen": 2.049107142857143, "rewards/margins": 10.812996031746032, "rewards/rejected": -8.76388888888889, "step": 1434 }, { "epoch": 0.9838875557079191, "grad_norm": 0.5259858498770166, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -255997175.1724138, "logits/rejected": -245187028.1142857, "logps/chosen": -246.6206896551724, "logps/rejected": -426.51428571428573, "loss": 0.1555, "rewards/chosen": 1.6336206896551724, "rewards/margins": 9.162192118226601, "rewards/rejected": -7.5285714285714285, "step": 1435 }, { "epoch": 0.9845731916352417, "grad_norm": 0.5930127314872905, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -264241152.0, "logits/rejected": -246970488.47058824, "logps/chosen": -262.6666666666667, "logps/rejected": -356.0, "loss": 0.1628, "rewards/chosen": 1.7666666666666666, "rewards/margins": 9.354901960784314, "rewards/rejected": -7.588235294117647, "step": 1436 }, { "epoch": 0.9852588275625643, "grad_norm": 0.6184013366225918, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -223782250.33846155, "logits/rejected": -217038587.93650794, "logps/chosen": -231.63076923076923, "logps/rejected": -397.2063492063492, "loss": 0.155, "rewards/chosen": 1.7134615384615384, "rewards/margins": 8.665842490842492, "rewards/rejected": -6.9523809523809526, "step": 1437 }, { "epoch": 0.9859444634898868, "grad_norm": 0.7701232459763332, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297589306.75409836, "logits/rejected": -246650115.82089552, "logps/chosen": -256.78688524590166, "logps/rejected": -444.17910447761193, "loss": 0.147, "rewards/chosen": 1.0942622950819672, "rewards/margins": 9.676351847320772, "rewards/rejected": -8.582089552238806, "step": 1438 }, { "epoch": 0.9866300994172095, "grad_norm": 0.8232658694802718, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -382331485.74647886, "logits/rejected": -329068903.2982456, "logps/chosen": -247.43661971830986, "logps/rejected": -332.63157894736844, "loss": 0.1724, "rewards/chosen": 1.1408450704225352, "rewards/margins": 9.053125772176921, "rewards/rejected": -7.912280701754386, "step": 1439 }, { "epoch": 0.9873157353445321, "grad_norm": 0.47082065388970407, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -271364811.17460316, "logits/rejected": -224556583.3846154, "logps/chosen": -302.4761904761905, "logps/rejected": -359.1384615384615, "loss": 0.1294, "rewards/chosen": 2.259920634920635, "rewards/margins": 9.767612942612942, "rewards/rejected": -7.507692307692308, "step": 1440 }, { "epoch": 0.9880013712718546, "grad_norm": 0.5867132807300226, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -289873009.7777778, "logits/rejected": -360710144.0, "logps/chosen": -245.55555555555554, "logps/rejected": -345.42857142857144, "loss": 0.1742, "rewards/chosen": 1.5694444444444444, "rewards/margins": 8.212301587301587, "rewards/rejected": -6.642857142857143, "step": 1441 }, { "epoch": 0.9886870071991772, "grad_norm": 0.6115928424938486, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -313254590.17142856, "logits/rejected": -290419394.20689654, "logps/chosen": -248.9142857142857, "logps/rejected": -471.17241379310343, "loss": 0.179, "rewards/chosen": 1.2651785714285715, "rewards/margins": 10.204833743842364, "rewards/rejected": -8.939655172413794, "step": 1442 }, { "epoch": 0.9893726431264999, "grad_norm": 0.4668014039252287, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220458504.98245615, "logits/rejected": -187975708.84507042, "logps/chosen": -209.40350877192984, "logps/rejected": -367.77464788732397, "loss": 0.1183, "rewards/chosen": 1.9780701754385965, "rewards/margins": 11.47102792191747, "rewards/rejected": -9.492957746478874, "step": 1443 }, { "epoch": 0.9900582790538224, "grad_norm": 0.4957446142480713, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -324703657.35384613, "logits/rejected": -436740226.03174603, "logps/chosen": -257.4769230769231, "logps/rejected": -431.23809523809524, "loss": 0.1374, "rewards/chosen": 1.8653846153846154, "rewards/margins": 9.857448107448107, "rewards/rejected": -7.992063492063492, "step": 1444 }, { "epoch": 0.990743914981145, "grad_norm": 0.5263069636874985, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248607837.0909091, "logits/rejected": -245163833.80645162, "logps/chosen": -316.3636363636364, "logps/rejected": -380.38709677419354, "loss": 0.1544, "rewards/chosen": 2.1382575757575757, "rewards/margins": 10.541483382209188, "rewards/rejected": -8.403225806451612, "step": 1445 }, { "epoch": 0.9914295509084676, "grad_norm": 0.7455995752521766, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -297002067.027027, "logits/rejected": -229133274.07407406, "logps/chosen": -278.27027027027026, "logps/rejected": -433.18518518518516, "loss": 0.1775, "rewards/chosen": 1.5557432432432432, "rewards/margins": 9.926113613613614, "rewards/rejected": -8.37037037037037, "step": 1446 }, { "epoch": 0.9921151868357901, "grad_norm": 0.5527248864192829, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -267942008.47058824, "logits/rejected": -253895202.13333333, "logps/chosen": -377.88235294117646, "logps/rejected": -348.26666666666665, "loss": 0.1509, "rewards/chosen": 2.036764705882353, "rewards/margins": 9.995098039215685, "rewards/rejected": -7.958333333333333, "step": 1447 }, { "epoch": 0.9928008227631128, "grad_norm": 0.47561470192117605, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -314366522.75409836, "logits/rejected": -285713484.41791046, "logps/chosen": -246.55737704918033, "logps/rejected": -391.64179104477614, "loss": 0.1445, "rewards/chosen": 1.653688524590164, "rewards/margins": 9.69846464399315, "rewards/rejected": -8.044776119402986, "step": 1448 }, { "epoch": 0.9934864586904354, "grad_norm": 0.7600007208088573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -306070832.4324324, "logits/rejected": -331194671.4074074, "logps/chosen": -296.43243243243245, "logps/rejected": -352.0, "loss": 0.1692, "rewards/chosen": 2.222972972972973, "rewards/margins": 10.917417417417418, "rewards/rejected": -8.694444444444445, "step": 1449 }, { "epoch": 0.994172094617758, "grad_norm": 0.5429771927487982, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -220061149.86666667, "logits/rejected": -247710659.7647059, "logps/chosen": -282.1333333333333, "logps/rejected": -368.47058823529414, "loss": 0.1488, "rewards/chosen": 1.3229166666666667, "rewards/margins": 9.705269607843137, "rewards/rejected": -8.382352941176471, "step": 1450 }, { "epoch": 0.9948577305450805, "grad_norm": 0.5341113353187565, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -304054776.1230769, "logits/rejected": -326223644.4444444, "logps/chosen": -271.26153846153846, "logps/rejected": -353.015873015873, "loss": 0.1465, "rewards/chosen": 1.8076923076923077, "rewards/margins": 10.133089133089134, "rewards/rejected": -8.325396825396826, "step": 1451 }, { "epoch": 0.9955433664724032, "grad_norm": 0.53131137067072, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -239638741.97014925, "logits/rejected": -247807731.40983605, "logps/chosen": -229.2537313432836, "logps/rejected": -337.8360655737705, "loss": 0.1318, "rewards/chosen": 2.2555970149253732, "rewards/margins": 8.362154391974553, "rewards/rejected": -6.10655737704918, "step": 1452 }, { "epoch": 0.9962290023997258, "grad_norm": 0.5573665714674558, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -265080012.8, "logits/rejected": -286810502.0952381, "logps/chosen": -252.30769230769232, "logps/rejected": -465.77777777777777, "loss": 0.1622, "rewards/chosen": 1.3634615384615385, "rewards/margins": 10.315842490842492, "rewards/rejected": -8.952380952380953, "step": 1453 }, { "epoch": 0.9969146383270483, "grad_norm": 0.6396825912537858, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -248210866.84931508, "logits/rejected": -259589287.56363636, "logps/chosen": -261.9178082191781, "logps/rejected": -382.54545454545456, "loss": 0.1708, "rewards/chosen": 1.6369863013698631, "rewards/margins": 9.26425902864259, "rewards/rejected": -7.627272727272727, "step": 1454 }, { "epoch": 0.9976002742543709, "grad_norm": 0.5675770738213122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -277707075.3684211, "logits/rejected": -241656438.15384614, "logps/chosen": -281.89473684210526, "logps/rejected": -465.2307692307692, "loss": 0.159, "rewards/chosen": 2.2516447368421053, "rewards/margins": 9.415106275303643, "rewards/rejected": -7.163461538461538, "step": 1455 }, { "epoch": 0.9982859101816935, "grad_norm": 0.5922068103542574, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -295279001.6, "logits/rejected": -256592715.29411766, "logps/chosen": -237.6, "logps/rejected": -392.0, "loss": 0.1259, "rewards/chosen": 1.9854166666666666, "rewards/margins": 11.14718137254902, "rewards/rejected": -9.161764705882353, "step": 1456 }, { "epoch": 0.9989715461090161, "grad_norm": 0.6128327914498901, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -251924545.01587301, "logits/rejected": -250367684.92307693, "logps/chosen": -203.17460317460316, "logps/rejected": -365.7846153846154, "loss": 0.1612, "rewards/chosen": 1.625, "rewards/margins": 10.040384615384616, "rewards/rejected": -8.415384615384616, "step": 1457 }, { "epoch": 0.9996571820363387, "grad_norm": 0.6267903835954981, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -292584479.030303, "logits/rejected": -326073310.9677419, "logps/chosen": -308.3636363636364, "logps/rejected": -367.48387096774195, "loss": 0.1619, "rewards/chosen": 1.7121212121212122, "rewards/margins": 9.728250244379277, "rewards/rejected": -8.016129032258064, "step": 1458 }, { "epoch": 1.0, "grad_norm": 0.6267903835954981, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -281162999.17241377, "logits/rejected": -230087533.7142857, "logps/chosen": -300.13793103448273, "logps/rejected": -403.2, "loss": 0.0648, "rewards/chosen": 1.7801724137931034, "rewards/margins": 6.123029556650246, "rewards/rejected": -4.3428571428571425, "step": 1459 } ], "logging_steps": 1, "max_steps": 1459, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }