{ "best_metric": 0.38143062591552734, "best_model_checkpoint": "./models/checkpoint-405", "epoch": 1.8, "eval_steps": 45, "global_step": 405, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0044444444444444444, "grad_norm": 3.2011494636535645, "learning_rate": 2.173913043478261e-06, "logits/chosen": 1.6946959495544434, "logits/rejected": 1.7046217918395996, "logps/chosen": -123.91139221191406, "logps/rejected": -152.06222534179688, "loss": 0.5724, "rewards/accuracies": 1.0, "rewards/chosen": 0.14388123154640198, "rewards/margins": 0.2681159973144531, "rewards/rejected": -0.12423478066921234, "step": 1 }, { "epoch": 0.008888888888888889, "grad_norm": 2.7101495265960693, "learning_rate": 4.347826086956522e-06, "logits/chosen": 2.161226749420166, "logits/rejected": 2.1654703617095947, "logps/chosen": -257.9621276855469, "logps/rejected": -336.0558776855469, "loss": 0.6413, "rewards/accuracies": 1.0, "rewards/chosen": 0.051631927490234375, "rewards/margins": 0.10870284587144852, "rewards/rejected": -0.05707092583179474, "step": 2 }, { "epoch": 0.013333333333333334, "grad_norm": 2.1376404762268066, "learning_rate": 6.521739130434783e-06, "logits/chosen": 1.9573543071746826, "logits/rejected": 1.8775691986083984, "logps/chosen": -253.7610626220703, "logps/rejected": -210.71412658691406, "loss": 0.527, "rewards/accuracies": 1.0, "rewards/chosen": 0.1437244415283203, "rewards/margins": 0.37279435992240906, "rewards/rejected": -0.22906990349292755, "step": 3 }, { "epoch": 0.017777777777777778, "grad_norm": 2.6771838665008545, "learning_rate": 8.695652173913044e-06, "logits/chosen": 2.2943520545959473, "logits/rejected": 2.242229461669922, "logps/chosen": -384.8254089355469, "logps/rejected": -270.86602783203125, "loss": 0.5007, "rewards/accuracies": 1.0, "rewards/chosen": 0.3624267578125, "rewards/margins": 0.435385137796402, "rewards/rejected": -0.07295837253332138, "step": 4 }, { "epoch": 0.022222222222222223, "grad_norm": 3.228928565979004, "learning_rate": 1.0869565217391305e-05, "logits/chosen": 2.2134103775024414, "logits/rejected": 2.145387887954712, "logps/chosen": -316.5057373046875, "logps/rejected": -360.7799377441406, "loss": 0.6288, "rewards/accuracies": 1.0, "rewards/chosen": 0.0028488151729106903, "rewards/margins": 0.1330413818359375, "rewards/rejected": -0.1301925629377365, "step": 5 }, { "epoch": 0.02666666666666667, "grad_norm": 3.082205057144165, "learning_rate": 1.3043478260869566e-05, "logits/chosen": 2.1260976791381836, "logits/rejected": 2.1222031116485596, "logps/chosen": -358.46337890625, "logps/rejected": -424.693359375, "loss": 0.5927, "rewards/accuracies": 1.0, "rewards/chosen": 0.11278533935546875, "rewards/margins": 0.21288147568702698, "rewards/rejected": -0.10009613633155823, "step": 6 }, { "epoch": 0.03111111111111111, "grad_norm": 3.3314132690429688, "learning_rate": 1.5217391304347828e-05, "logits/chosen": 2.2984189987182617, "logits/rejected": 2.247058391571045, "logps/chosen": -534.6464233398438, "logps/rejected": -502.7433776855469, "loss": 0.5904, "rewards/accuracies": 1.0, "rewards/chosen": 0.02135010063648224, "rewards/margins": 0.21751099824905396, "rewards/rejected": -0.19616088271141052, "step": 7 }, { "epoch": 0.035555555555555556, "grad_norm": 3.316230058670044, "learning_rate": 1.739130434782609e-05, "logits/chosen": 2.123837947845459, "logits/rejected": 2.181354284286499, "logps/chosen": -245.4013671875, "logps/rejected": -403.6361083984375, "loss": 0.6805, "rewards/accuracies": 0.5, "rewards/chosen": 0.11298942565917969, "rewards/margins": 0.027771372348070145, "rewards/rejected": 0.08521804958581924, "step": 8 }, { "epoch": 0.04, "grad_norm": 2.0176970958709717, "learning_rate": 1.956521739130435e-05, "logits/chosen": 1.7723705768585205, "logits/rejected": 1.846294641494751, "logps/chosen": -183.6702423095703, "logps/rejected": -227.79495239257812, "loss": 0.7198, "rewards/accuracies": 0.5, "rewards/chosen": -0.1414192169904709, "rewards/margins": -0.05180053412914276, "rewards/rejected": -0.08961868286132812, "step": 9 }, { "epoch": 0.044444444444444446, "grad_norm": 2.7427902221679688, "learning_rate": 2.173913043478261e-05, "logits/chosen": 2.143216848373413, "logits/rejected": 2.135941982269287, "logps/chosen": -362.2386169433594, "logps/rejected": -326.8141174316406, "loss": 0.6138, "rewards/accuracies": 1.0, "rewards/chosen": 0.03693237528204918, "rewards/margins": 0.16980285942554474, "rewards/rejected": -0.13287048041820526, "step": 10 }, { "epoch": 0.04888888888888889, "grad_norm": 2.896284580230713, "learning_rate": 2.391304347826087e-05, "logits/chosen": 2.1498055458068848, "logits/rejected": 2.200744152069092, "logps/chosen": -248.22348022460938, "logps/rejected": -351.1915283203125, "loss": 0.6633, "rewards/accuracies": 0.5, "rewards/chosen": -0.033481597900390625, "rewards/margins": 0.09698867797851562, "rewards/rejected": -0.13047027587890625, "step": 11 }, { "epoch": 0.05333333333333334, "grad_norm": 3.3868470191955566, "learning_rate": 2.608695652173913e-05, "logits/chosen": 2.2540273666381836, "logits/rejected": 2.0217299461364746, "logps/chosen": -290.06707763671875, "logps/rejected": -291.1870422363281, "loss": 0.6204, "rewards/accuracies": 1.0, "rewards/chosen": -0.057250212877988815, "rewards/margins": 0.15169525146484375, "rewards/rejected": -0.20894546806812286, "step": 12 }, { "epoch": 0.057777777777777775, "grad_norm": 2.541471004486084, "learning_rate": 2.826086956521739e-05, "logits/chosen": 2.0605201721191406, "logits/rejected": 1.9781224727630615, "logps/chosen": -280.5556335449219, "logps/rejected": -206.835693359375, "loss": 0.6807, "rewards/accuracies": 0.5, "rewards/chosen": 0.02671203762292862, "rewards/margins": 0.0282897986471653, "rewards/rejected": -0.001577761024236679, "step": 13 }, { "epoch": 0.06222222222222222, "grad_norm": 3.5865259170532227, "learning_rate": 3.0434782608695656e-05, "logits/chosen": 2.4662587642669678, "logits/rejected": 2.4787802696228027, "logps/chosen": -318.6181945800781, "logps/rejected": -371.00048828125, "loss": 0.7615, "rewards/accuracies": 0.0, "rewards/chosen": -0.21838226914405823, "rewards/margins": -0.1308029294013977, "rewards/rejected": -0.08757934719324112, "step": 14 }, { "epoch": 0.06666666666666667, "grad_norm": 1.8042571544647217, "learning_rate": 3.260869565217392e-05, "logits/chosen": 1.8834528923034668, "logits/rejected": 1.8412845134735107, "logps/chosen": -160.182861328125, "logps/rejected": -134.62167358398438, "loss": 0.6405, "rewards/accuracies": 1.0, "rewards/chosen": -0.048540499061346054, "rewards/margins": 0.10904045403003693, "rewards/rejected": -0.15758095681667328, "step": 15 }, { "epoch": 0.07111111111111111, "grad_norm": 3.6051087379455566, "learning_rate": 3.478260869565218e-05, "logits/chosen": 2.4191336631774902, "logits/rejected": 2.4424967765808105, "logps/chosen": -330.76373291015625, "logps/rejected": -360.3594970703125, "loss": 0.7277, "rewards/accuracies": 0.5, "rewards/chosen": -0.3086807429790497, "rewards/margins": -0.06518251448869705, "rewards/rejected": -0.24349823594093323, "step": 16 }, { "epoch": 0.07555555555555556, "grad_norm": 2.667231321334839, "learning_rate": 3.695652173913043e-05, "logits/chosen": 2.087791919708252, "logits/rejected": 2.067237615585327, "logps/chosen": -260.45025634765625, "logps/rejected": -310.5743713378906, "loss": 0.6497, "rewards/accuracies": 0.5, "rewards/chosen": -0.17707443237304688, "rewards/margins": 0.09263762831687927, "rewards/rejected": -0.26971206068992615, "step": 17 }, { "epoch": 0.08, "grad_norm": 2.471524477005005, "learning_rate": 3.91304347826087e-05, "logits/chosen": 1.9968055486679077, "logits/rejected": 1.9818394184112549, "logps/chosen": -148.7676239013672, "logps/rejected": -160.41592407226562, "loss": 0.6841, "rewards/accuracies": 0.5, "rewards/chosen": -0.12331848591566086, "rewards/margins": 0.021195977926254272, "rewards/rejected": -0.14451447129249573, "step": 18 }, { "epoch": 0.08444444444444445, "grad_norm": 3.896228790283203, "learning_rate": 4.130434782608696e-05, "logits/chosen": 2.1896002292633057, "logits/rejected": 2.2027523517608643, "logps/chosen": -280.221923828125, "logps/rejected": -345.39849853515625, "loss": 0.6961, "rewards/accuracies": 0.5, "rewards/chosen": -0.23584365844726562, "rewards/margins": -0.005173489451408386, "rewards/rejected": -0.23067016899585724, "step": 19 }, { "epoch": 0.08888888888888889, "grad_norm": 2.9211912155151367, "learning_rate": 4.347826086956522e-05, "logits/chosen": 1.9726223945617676, "logits/rejected": 1.9995529651641846, "logps/chosen": -223.56761169433594, "logps/rejected": -288.2007141113281, "loss": 0.6075, "rewards/accuracies": 1.0, "rewards/chosen": -0.14491653442382812, "rewards/margins": 0.18477173149585724, "rewards/rejected": -0.32968828082084656, "step": 20 }, { "epoch": 0.09333333333333334, "grad_norm": 3.5073137283325195, "learning_rate": 4.565217391304348e-05, "logits/chosen": 2.084686517715454, "logits/rejected": 2.1801323890686035, "logps/chosen": -302.4353332519531, "logps/rejected": -421.34222412109375, "loss": 0.6006, "rewards/accuracies": 1.0, "rewards/chosen": -0.2602279782295227, "rewards/margins": 0.19529570639133453, "rewards/rejected": -0.45552366971969604, "step": 21 }, { "epoch": 0.09777777777777778, "grad_norm": 4.747559070587158, "learning_rate": 4.782608695652174e-05, "logits/chosen": 2.384913682937622, "logits/rejected": 2.304309368133545, "logps/chosen": -486.0771484375, "logps/rejected": -395.3549499511719, "loss": 0.7814, "rewards/accuracies": 0.0, "rewards/chosen": -0.5777496099472046, "rewards/margins": -0.1678207516670227, "rewards/rejected": -0.4099288880825043, "step": 22 }, { "epoch": 0.10222222222222223, "grad_norm": 3.5170698165893555, "learning_rate": 5e-05, "logits/chosen": 2.422769546508789, "logits/rejected": 2.377617597579956, "logps/chosen": -356.8462219238281, "logps/rejected": -335.63311767578125, "loss": 0.7371, "rewards/accuracies": 0.5, "rewards/chosen": -0.5601089596748352, "rewards/margins": -0.08314056694507599, "rewards/rejected": -0.4769684076309204, "step": 23 }, { "epoch": 0.10666666666666667, "grad_norm": 1.9441181421279907, "learning_rate": 4.999932336875371e-05, "logits/chosen": 1.8213062286376953, "logits/rejected": 1.8396917581558228, "logps/chosen": -134.66082763671875, "logps/rejected": -149.367431640625, "loss": 0.65, "rewards/accuracies": 1.0, "rewards/chosen": -0.14266128838062286, "rewards/margins": 0.0897216796875, "rewards/rejected": -0.23238298296928406, "step": 24 }, { "epoch": 0.1111111111111111, "grad_norm": 3.068272352218628, "learning_rate": 4.9997293511641216e-05, "logits/chosen": 2.278895139694214, "logits/rejected": 2.275456190109253, "logps/chosen": -333.77490234375, "logps/rejected": -472.25897216796875, "loss": 0.4571, "rewards/accuracies": 1.0, "rewards/chosen": -0.23233337700366974, "rewards/margins": 0.5613830089569092, "rewards/rejected": -0.7937164306640625, "step": 25 }, { "epoch": 0.11555555555555555, "grad_norm": 1.8619799613952637, "learning_rate": 4.999391053853971e-05, "logits/chosen": 1.8031303882598877, "logits/rejected": 1.8489115238189697, "logps/chosen": -96.40797424316406, "logps/rejected": -161.97528076171875, "loss": 0.5789, "rewards/accuracies": 1.0, "rewards/chosen": -0.05866394191980362, "rewards/margins": 0.2442375123500824, "rewards/rejected": -0.3029014468193054, "step": 26 }, { "epoch": 0.12, "grad_norm": 5.386238098144531, "learning_rate": 4.998917463257121e-05, "logits/chosen": 2.5145506858825684, "logits/rejected": 2.4547314643859863, "logps/chosen": -413.3758239746094, "logps/rejected": -371.0469055175781, "loss": 0.7603, "rewards/accuracies": 0.5, "rewards/chosen": -0.5716583728790283, "rewards/margins": -0.12001956254243851, "rewards/rejected": -0.4516388177871704, "step": 27 }, { "epoch": 0.12444444444444444, "grad_norm": 4.0328145027160645, "learning_rate": 4.998308605009268e-05, "logits/chosen": 1.9604880809783936, "logits/rejected": 1.9736435413360596, "logps/chosen": -294.1231689453125, "logps/rejected": -225.6836700439453, "loss": 0.7918, "rewards/accuracies": 0.5, "rewards/chosen": -0.390707403421402, "rewards/margins": -0.17286226153373718, "rewards/rejected": -0.217845156788826, "step": 28 }, { "epoch": 0.1288888888888889, "grad_norm": 2.2388572692871094, "learning_rate": 4.997564512068212e-05, "logits/chosen": 1.7558441162109375, "logits/rejected": 1.728846549987793, "logps/chosen": -262.6331787109375, "logps/rejected": -276.0820617675781, "loss": 0.5986, "rewards/accuracies": 0.5, "rewards/chosen": -0.36693495512008667, "rewards/margins": 0.3257931172847748, "rewards/rejected": -0.6927281022071838, "step": 29 }, { "epoch": 0.13333333333333333, "grad_norm": 2.462900400161743, "learning_rate": 4.9966852247120764e-05, "logits/chosen": 2.112412214279175, "logits/rejected": 2.067960739135742, "logps/chosen": -276.7638244628906, "logps/rejected": -449.4617919921875, "loss": 0.3334, "rewards/accuracies": 1.0, "rewards/chosen": -0.44448322057724, "rewards/margins": 0.9420753121376038, "rewards/rejected": -1.3865585327148438, "step": 30 }, { "epoch": 0.13777777777777778, "grad_norm": 2.836688280105591, "learning_rate": 4.995670790537125e-05, "logits/chosen": 1.9061617851257324, "logits/rejected": 1.8595614433288574, "logps/chosen": -160.72055053710938, "logps/rejected": -129.44073486328125, "loss": 0.7916, "rewards/accuracies": 0.5, "rewards/chosen": -0.3917423486709595, "rewards/margins": -0.17064018547534943, "rewards/rejected": -0.22110214829444885, "step": 31 }, { "epoch": 0.14222222222222222, "grad_norm": 2.559312343597412, "learning_rate": 4.994521264455187e-05, "logits/chosen": 2.1637351512908936, "logits/rejected": 2.2050771713256836, "logps/chosen": -304.7850341796875, "logps/rejected": -348.8180847167969, "loss": 0.4367, "rewards/accuracies": 1.0, "rewards/chosen": -0.3886520266532898, "rewards/margins": 0.71138995885849, "rewards/rejected": -1.1000419855117798, "step": 32 }, { "epoch": 0.14666666666666667, "grad_norm": 4.237148761749268, "learning_rate": 4.993236708690683e-05, "logits/chosen": 2.109586238861084, "logits/rejected": 2.0614817142486572, "logps/chosen": -375.04638671875, "logps/rejected": -299.8951110839844, "loss": 0.7802, "rewards/accuracies": 0.5, "rewards/chosen": -0.5753310918807983, "rewards/margins": -0.14784467220306396, "rewards/rejected": -0.4274864196777344, "step": 33 }, { "epoch": 0.1511111111111111, "grad_norm": 2.7111451625823975, "learning_rate": 4.991817192777259e-05, "logits/chosen": 2.2655739784240723, "logits/rejected": 2.2684638500213623, "logps/chosen": -313.1495056152344, "logps/rejected": -275.92181396484375, "loss": 0.5089, "rewards/accuracies": 1.0, "rewards/chosen": -0.40856704115867615, "rewards/margins": 0.41301044821739197, "rewards/rejected": -0.8215774893760681, "step": 34 }, { "epoch": 0.15555555555555556, "grad_norm": 3.8092589378356934, "learning_rate": 4.9902627935540205e-05, "logits/chosen": 2.179189682006836, "logits/rejected": 2.120750904083252, "logps/chosen": -387.86395263671875, "logps/rejected": -440.36138916015625, "loss": 0.799, "rewards/accuracies": 0.5, "rewards/chosen": -1.1031348705291748, "rewards/margins": 0.2254989743232727, "rewards/rejected": -1.3286339044570923, "step": 35 }, { "epoch": 0.16, "grad_norm": 3.2711353302001953, "learning_rate": 4.9885735951613745e-05, "logits/chosen": 2.114718198776245, "logits/rejected": 2.0954365730285645, "logps/chosen": -358.84710693359375, "logps/rejected": -392.369384765625, "loss": 0.5344, "rewards/accuracies": 1.0, "rewards/chosen": -0.9632622003555298, "rewards/margins": 0.3675689697265625, "rewards/rejected": -1.3308311700820923, "step": 36 }, { "epoch": 0.16444444444444445, "grad_norm": 5.44711446762085, "learning_rate": 4.9867496890364726e-05, "logits/chosen": 2.1442081928253174, "logits/rejected": 2.1060421466827393, "logps/chosen": -323.090087890625, "logps/rejected": -316.7653503417969, "loss": 0.6798, "rewards/accuracies": 0.5, "rewards/chosen": -0.6822533011436462, "rewards/margins": 0.13051298260688782, "rewards/rejected": -0.8127662539482117, "step": 37 }, { "epoch": 0.1688888888888889, "grad_norm": 3.415454626083374, "learning_rate": 4.984791173908267e-05, "logits/chosen": 2.2119979858398438, "logits/rejected": 2.155428409576416, "logps/chosen": -411.5396728515625, "logps/rejected": -440.5595703125, "loss": 0.3268, "rewards/accuracies": 1.0, "rewards/chosen": -1.0235031843185425, "rewards/margins": 1.1276824474334717, "rewards/rejected": -2.1511855125427246, "step": 38 }, { "epoch": 0.17333333333333334, "grad_norm": 2.106764554977417, "learning_rate": 4.982698155792159e-05, "logits/chosen": 1.676947832107544, "logits/rejected": 1.8609204292297363, "logps/chosen": -207.95248413085938, "logps/rejected": -255.15560913085938, "loss": 0.4656, "rewards/accuracies": 1.0, "rewards/chosen": -0.20581628382205963, "rewards/margins": 0.5979617834091187, "rewards/rejected": -0.8037780523300171, "step": 39 }, { "epoch": 0.17777777777777778, "grad_norm": 2.561739444732666, "learning_rate": 4.980470747984265e-05, "logits/chosen": 1.903275489807129, "logits/rejected": 1.878113031387329, "logps/chosen": -231.56222534179688, "logps/rejected": -203.46058654785156, "loss": 0.4862, "rewards/accuracies": 1.0, "rewards/chosen": -0.5600662231445312, "rewards/margins": 0.49753645062446594, "rewards/rejected": -1.0576026439666748, "step": 40 }, { "epoch": 0.18222222222222223, "grad_norm": 3.4072999954223633, "learning_rate": 4.9781090710552835e-05, "logits/chosen": 2.3119935989379883, "logits/rejected": 2.174755334854126, "logps/chosen": -306.8345947265625, "logps/rejected": -353.8470764160156, "loss": 0.5132, "rewards/accuracies": 1.0, "rewards/chosen": -0.6789085865020752, "rewards/margins": 0.40302202105522156, "rewards/rejected": -1.0819306373596191, "step": 41 }, { "epoch": 0.18666666666666668, "grad_norm": 2.7821035385131836, "learning_rate": 4.975613252843966e-05, "logits/chosen": 1.9496957063674927, "logits/rejected": 1.9538969993591309, "logps/chosen": -228.1984100341797, "logps/rejected": -221.22930908203125, "loss": 0.5304, "rewards/accuracies": 0.5, "rewards/chosen": -0.608843982219696, "rewards/margins": 0.46152499318122864, "rewards/rejected": -1.070369005203247, "step": 42 }, { "epoch": 0.19111111111111112, "grad_norm": 2.4308483600616455, "learning_rate": 4.9729834284501995e-05, "logits/chosen": 1.9656260013580322, "logits/rejected": 1.9733545780181885, "logps/chosen": -214.34481811523438, "logps/rejected": -260.02227783203125, "loss": 0.4267, "rewards/accuracies": 1.0, "rewards/chosen": -0.5842536687850952, "rewards/margins": 0.8209755420684814, "rewards/rejected": -1.4052292108535767, "step": 43 }, { "epoch": 0.19555555555555557, "grad_norm": 3.7162206172943115, "learning_rate": 4.970219740227693e-05, "logits/chosen": 2.2411859035491943, "logits/rejected": 2.2703304290771484, "logps/chosen": -327.88653564453125, "logps/rejected": -440.0887145996094, "loss": 0.4777, "rewards/accuracies": 0.5, "rewards/chosen": -1.2694365978240967, "rewards/margins": 0.9213591814041138, "rewards/rejected": -2.1907958984375, "step": 44 }, { "epoch": 0.2, "grad_norm": 2.9216010570526123, "learning_rate": 4.9673223377762715e-05, "logits/chosen": 2.17927885055542, "logits/rejected": 2.1836795806884766, "logps/chosen": -384.8567199707031, "logps/rejected": -417.8768310546875, "loss": 0.3178, "rewards/accuracies": 1.0, "rewards/chosen": -0.5690780878067017, "rewards/margins": 1.0208160877227783, "rewards/rejected": -1.5898940563201904, "step": 45 }, { "epoch": 0.2, "eval_logits/chosen": 2.1410844326019287, "eval_logits/rejected": 2.0819036960601807, "eval_logps/chosen": -302.23443603515625, "eval_logps/rejected": -335.14215087890625, "eval_loss": 0.590552568435669, "eval_rewards/accuracies": 0.6964285969734192, "eval_rewards/chosen": -1.194185495376587, "eval_rewards/margins": 0.738350510597229, "eval_rewards/rejected": -1.932536244392395, "eval_runtime": 17.8289, "eval_samples_per_second": 2.804, "eval_steps_per_second": 0.393, "step": 45 }, { "epoch": 0.20444444444444446, "grad_norm": 5.059287071228027, "learning_rate": 4.9642913779337757e-05, "logits/chosen": 1.6329092979431152, "logits/rejected": 1.6129851341247559, "logps/chosen": -348.64971923828125, "logps/rejected": -319.07080078125, "loss": 0.6636, "rewards/accuracies": 1.0, "rewards/chosen": -1.3844482898712158, "rewards/margins": 0.060698702931404114, "rewards/rejected": -1.445146918296814, "step": 46 }, { "epoch": 0.2088888888888889, "grad_norm": 2.8684635162353516, "learning_rate": 4.9611270247675776e-05, "logits/chosen": 1.4863775968551636, "logits/rejected": 1.5527057647705078, "logps/chosen": -104.7228775024414, "logps/rejected": -144.18276977539062, "loss": 0.7198, "rewards/accuracies": 0.0, "rewards/chosen": -0.2717903256416321, "rewards/margins": -0.05223694443702698, "rewards/rejected": -0.2195533812046051, "step": 47 }, { "epoch": 0.21333333333333335, "grad_norm": 4.421427249908447, "learning_rate": 4.9578294495656965e-05, "logits/chosen": 2.095689296722412, "logits/rejected": 2.1000843048095703, "logps/chosen": -379.61566162109375, "logps/rejected": -351.688232421875, "loss": 0.5805, "rewards/accuracies": 1.0, "rewards/chosen": -1.1613686084747314, "rewards/margins": 0.24983596801757812, "rewards/rejected": -1.4112045764923096, "step": 48 }, { "epoch": 0.21777777777777776, "grad_norm": 2.3262526988983154, "learning_rate": 4.954398830827524e-05, "logits/chosen": 1.5170578956604004, "logits/rejected": 1.41743004322052, "logps/chosen": -141.56716918945312, "logps/rejected": -152.45985412597656, "loss": 0.6136, "rewards/accuracies": 1.0, "rewards/chosen": -0.250314325094223, "rewards/margins": 0.1659536361694336, "rewards/rejected": -0.4162679612636566, "step": 49 }, { "epoch": 0.2222222222222222, "grad_norm": 1.6063750982284546, "learning_rate": 4.950835354254167e-05, "logits/chosen": 2.113161087036133, "logits/rejected": 2.027552843093872, "logps/chosen": -283.05487060546875, "logps/rejected": -338.911865234375, "loss": 0.202, "rewards/accuracies": 1.0, "rewards/chosen": -0.8912537097930908, "rewards/margins": 1.6376852989196777, "rewards/rejected": -2.5289390087127686, "step": 50 }, { "epoch": 0.22666666666666666, "grad_norm": 1.9353735446929932, "learning_rate": 4.947139212738395e-05, "logits/chosen": 2.0660266876220703, "logits/rejected": 1.9209285974502563, "logps/chosen": -392.48199462890625, "logps/rejected": -411.1744079589844, "loss": 0.1246, "rewards/accuracies": 1.0, "rewards/chosen": -1.723092794418335, "rewards/margins": 2.101870536804199, "rewards/rejected": -3.8249635696411133, "step": 51 }, { "epoch": 0.2311111111111111, "grad_norm": 3.488070011138916, "learning_rate": 4.943310606354192e-05, "logits/chosen": 2.190558433532715, "logits/rejected": 2.1188831329345703, "logps/chosen": -325.7753601074219, "logps/rejected": -416.0865783691406, "loss": 0.4128, "rewards/accuracies": 0.5, "rewards/chosen": -1.3961410522460938, "rewards/margins": 1.5372390747070312, "rewards/rejected": -2.933380126953125, "step": 52 }, { "epoch": 0.23555555555555555, "grad_norm": 9.58780574798584, "learning_rate": 4.9393497423459376e-05, "logits/chosen": 2.139993667602539, "logits/rejected": 2.1976380348205566, "logps/chosen": -336.98150634765625, "logps/rejected": -287.92987060546875, "loss": 1.3144, "rewards/accuracies": 0.5, "rewards/chosen": -1.950069546699524, "rewards/margins": -0.7764908075332642, "rewards/rejected": -1.1735787391662598, "step": 53 }, { "epoch": 0.24, "grad_norm": 2.2417423725128174, "learning_rate": 4.935256835117179e-05, "logits/chosen": 2.2602908611297607, "logits/rejected": 2.23490047454834, "logps/chosen": -410.2571105957031, "logps/rejected": -527.3067016601562, "loss": 0.212, "rewards/accuracies": 1.0, "rewards/chosen": -1.7630027532577515, "rewards/margins": 3.0140035152435303, "rewards/rejected": -4.77700662612915, "step": 54 }, { "epoch": 0.24444444444444444, "grad_norm": 2.4229862689971924, "learning_rate": 4.931032106219029e-05, "logits/chosen": 1.751630425453186, "logits/rejected": 1.79133939743042, "logps/chosen": -251.48580932617188, "logps/rejected": -306.2901611328125, "loss": 0.248, "rewards/accuracies": 1.0, "rewards/chosen": -0.854656994342804, "rewards/margins": 1.2701172828674316, "rewards/rejected": -2.124774217605591, "step": 55 }, { "epoch": 0.24888888888888888, "grad_norm": 7.835116386413574, "learning_rate": 4.926675784338174e-05, "logits/chosen": 1.9964067935943604, "logits/rejected": 2.117382287979126, "logps/chosen": -287.7210388183594, "logps/rejected": -260.3846740722656, "loss": 0.9769, "rewards/accuracies": 0.5, "rewards/chosen": -1.8229798078536987, "rewards/margins": -0.4610947072505951, "rewards/rejected": -1.3618850708007812, "step": 56 }, { "epoch": 0.25333333333333335, "grad_norm": 3.7002341747283936, "learning_rate": 4.922188105284495e-05, "logits/chosen": 2.1905529499053955, "logits/rejected": 2.094609260559082, "logps/chosen": -429.6041259765625, "logps/rejected": -498.65740966796875, "loss": 0.2605, "rewards/accuracies": 1.0, "rewards/chosen": -2.7014801502227783, "rewards/margins": 1.3251266479492188, "rewards/rejected": -4.026606559753418, "step": 57 }, { "epoch": 0.2577777777777778, "grad_norm": 1.7439998388290405, "learning_rate": 4.9175693119783013e-05, "logits/chosen": 1.88455069065094, "logits/rejected": 1.802990198135376, "logps/chosen": -438.767578125, "logps/rejected": -430.30194091796875, "loss": 0.1209, "rewards/accuracies": 1.0, "rewards/chosen": -2.552638292312622, "rewards/margins": 2.1732101440429688, "rewards/rejected": -4.725848197937012, "step": 58 }, { "epoch": 0.26222222222222225, "grad_norm": 2.541234254837036, "learning_rate": 4.912819654437182e-05, "logits/chosen": 2.0551681518554688, "logits/rejected": 1.987473726272583, "logps/chosen": -397.0820617675781, "logps/rejected": -455.72442626953125, "loss": 0.1633, "rewards/accuracies": 1.0, "rewards/chosen": -1.4276413917541504, "rewards/margins": 1.9974074363708496, "rewards/rejected": -3.425048828125, "step": 59 }, { "epoch": 0.26666666666666666, "grad_norm": 2.0597288608551025, "learning_rate": 4.9079393897624745e-05, "logits/chosen": 1.7409842014312744, "logits/rejected": 1.764671802520752, "logps/chosen": -259.36578369140625, "logps/rejected": -400.5860595703125, "loss": 0.226, "rewards/accuracies": 1.0, "rewards/chosen": -0.98876953125, "rewards/margins": 3.264120578765869, "rewards/rejected": -4.252890110015869, "step": 60 }, { "epoch": 0.27111111111111114, "grad_norm": 3.1185505390167236, "learning_rate": 4.9029287821253445e-05, "logits/chosen": 2.0401980876922607, "logits/rejected": 2.0045888423919678, "logps/chosen": -233.93736267089844, "logps/rejected": -272.4693603515625, "loss": 0.4837, "rewards/accuracies": 1.0, "rewards/chosen": -0.8766113519668579, "rewards/margins": 0.49696657061576843, "rewards/rejected": -1.3735778331756592, "step": 61 }, { "epoch": 0.27555555555555555, "grad_norm": 0.503414511680603, "learning_rate": 4.897788102752485e-05, "logits/chosen": 2.0203349590301514, "logits/rejected": 1.9820505380630493, "logps/chosen": -306.28814697265625, "logps/rejected": -438.020263671875, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": -1.1499321460723877, "rewards/margins": 4.276589393615723, "rewards/rejected": -5.426521301269531, "step": 62 }, { "epoch": 0.28, "grad_norm": 5.02221155166626, "learning_rate": 4.8925176299114416e-05, "logits/chosen": 1.7009226083755493, "logits/rejected": 1.5652942657470703, "logps/chosen": -425.44415283203125, "logps/rejected": -476.310546875, "loss": 0.2624, "rewards/accuracies": 1.0, "rewards/chosen": -3.0838661193847656, "rewards/margins": 2.905472993850708, "rewards/rejected": -5.989339351654053, "step": 63 }, { "epoch": 0.28444444444444444, "grad_norm": 10.18476390838623, "learning_rate": 4.8871176488955415e-05, "logits/chosen": 1.9149291515350342, "logits/rejected": 1.852341651916504, "logps/chosen": -378.48138427734375, "logps/rejected": -236.2688751220703, "loss": 2.0384, "rewards/accuracies": 0.5, "rewards/chosen": -3.0230088233947754, "rewards/margins": -1.4521996974945068, "rewards/rejected": -1.5708091259002686, "step": 64 }, { "epoch": 0.28888888888888886, "grad_norm": 2.800845146179199, "learning_rate": 4.881588452008456e-05, "logits/chosen": 1.3878483772277832, "logits/rejected": 1.4146528244018555, "logps/chosen": -115.78246307373047, "logps/rejected": -137.14707946777344, "loss": 0.6799, "rewards/accuracies": 0.5, "rewards/chosen": -0.47142869234085083, "rewards/margins": 0.036671459674835205, "rewards/rejected": -0.508100152015686, "step": 65 }, { "epoch": 0.29333333333333333, "grad_norm": 4.6413726806640625, "learning_rate": 4.875930338548376e-05, "logits/chosen": 1.9736688137054443, "logits/rejected": 1.8488330841064453, "logps/chosen": -233.1080322265625, "logps/rejected": -270.9902038574219, "loss": 0.4431, "rewards/accuracies": 1.0, "rewards/chosen": -1.331383466720581, "rewards/margins": 0.5846099853515625, "rewards/rejected": -1.9159934520721436, "step": 66 }, { "epoch": 0.29777777777777775, "grad_norm": 8.721001625061035, "learning_rate": 4.87014361479181e-05, "logits/chosen": 1.8805785179138184, "logits/rejected": 1.8359558582305908, "logps/chosen": -321.511962890625, "logps/rejected": -275.58209228515625, "loss": 1.3317, "rewards/accuracies": 0.5, "rewards/chosen": -3.167226552963257, "rewards/margins": -0.7222648859024048, "rewards/rejected": -2.4449615478515625, "step": 67 }, { "epoch": 0.3022222222222222, "grad_norm": 3.038731098175049, "learning_rate": 4.864228593977006e-05, "logits/chosen": 2.2448015213012695, "logits/rejected": 2.306088447570801, "logps/chosen": -347.45849609375, "logps/rejected": -423.48297119140625, "loss": 0.229, "rewards/accuracies": 1.0, "rewards/chosen": -2.2222633361816406, "rewards/margins": 2.896373748779297, "rewards/rejected": -5.1186370849609375, "step": 68 }, { "epoch": 0.30666666666666664, "grad_norm": 4.552579879760742, "learning_rate": 4.858185596286997e-05, "logits/chosen": 2.1001484394073486, "logits/rejected": 1.979229211807251, "logps/chosen": -295.586181640625, "logps/rejected": -303.2032470703125, "loss": 0.3926, "rewards/accuracies": 0.5, "rewards/chosen": -1.715203881263733, "rewards/margins": 2.0468697547912598, "rewards/rejected": -3.762073516845703, "step": 69 }, { "epoch": 0.3111111111111111, "grad_norm": 3.6529836654663086, "learning_rate": 4.852014948832268e-05, "logits/chosen": 1.9788322448730469, "logits/rejected": 1.9580605030059814, "logps/chosen": -213.86343383789062, "logps/rejected": -261.7784118652344, "loss": 0.524, "rewards/accuracies": 1.0, "rewards/chosen": -1.2392014265060425, "rewards/margins": 0.37295836210250854, "rewards/rejected": -1.6121597290039062, "step": 70 }, { "epoch": 0.31555555555555553, "grad_norm": 1.9879947900772095, "learning_rate": 4.8457169856330485e-05, "logits/chosen": 1.942040205001831, "logits/rejected": 1.8756356239318848, "logps/chosen": -313.9153747558594, "logps/rejected": -427.9259948730469, "loss": 0.1422, "rewards/accuracies": 1.0, "rewards/chosen": -1.7393081188201904, "rewards/margins": 4.251686096191406, "rewards/rejected": -5.990994453430176, "step": 71 }, { "epoch": 0.32, "grad_norm": 5.4608001708984375, "learning_rate": 4.839292047601234e-05, "logits/chosen": 1.7308683395385742, "logits/rejected": 1.7762730121612549, "logps/chosen": -293.4880065917969, "logps/rejected": -250.67381286621094, "loss": 0.6266, "rewards/accuracies": 0.5, "rewards/chosen": -1.6677329540252686, "rewards/margins": 1.1602835655212402, "rewards/rejected": -2.828016757965088, "step": 72 }, { "epoch": 0.3244444444444444, "grad_norm": 11.236379623413086, "learning_rate": 4.832740482521931e-05, "logits/chosen": 1.5850169658660889, "logits/rejected": 1.6753277778625488, "logps/chosen": -275.2799072265625, "logps/rejected": -174.08926391601562, "loss": 1.7558, "rewards/accuracies": 0.5, "rewards/chosen": -3.16182279586792, "rewards/margins": -1.245647668838501, "rewards/rejected": -1.916175127029419, "step": 73 }, { "epoch": 0.3288888888888889, "grad_norm": 1.0129520893096924, "learning_rate": 4.826062645034631e-05, "logits/chosen": 2.147963047027588, "logits/rejected": 2.13932728767395, "logps/chosen": -533.4431762695312, "logps/rejected": -664.0446166992188, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": -4.084846496582031, "rewards/margins": 4.350230693817139, "rewards/rejected": -8.435077667236328, "step": 74 }, { "epoch": 0.3333333333333333, "grad_norm": 8.0421724319458, "learning_rate": 4.819258896614014e-05, "logits/chosen": 2.038822650909424, "logits/rejected": 2.02886962890625, "logps/chosen": -347.89373779296875, "logps/rejected": -322.3008728027344, "loss": 0.4447, "rewards/accuracies": 1.0, "rewards/chosen": -2.955258369445801, "rewards/margins": 0.5941513180732727, "rewards/rejected": -3.5494096279144287, "step": 75 }, { "epoch": 0.3377777777777778, "grad_norm": 10.355673789978027, "learning_rate": 4.812329605550381e-05, "logits/chosen": 2.0920519828796387, "logits/rejected": 2.0875403881073, "logps/chosen": -335.3114318847656, "logps/rejected": -350.8553466796875, "loss": 0.9773, "rewards/accuracies": 0.5, "rewards/chosen": -3.6420540809631348, "rewards/margins": 0.05045384168624878, "rewards/rejected": -3.6925079822540283, "step": 76 }, { "epoch": 0.3422222222222222, "grad_norm": 0.42018255591392517, "learning_rate": 4.805275146929721e-05, "logits/chosen": 2.0620903968811035, "logits/rejected": 2.108494281768799, "logps/chosen": -342.378662109375, "logps/rejected": -450.7765808105469, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -2.201615810394287, "rewards/margins": 4.498736381530762, "rewards/rejected": -6.700352668762207, "step": 77 }, { "epoch": 0.3466666666666667, "grad_norm": 0.4926978349685669, "learning_rate": 4.7980959026134044e-05, "logits/chosen": 1.9305293560028076, "logits/rejected": 2.000296115875244, "logps/chosen": -292.07354736328125, "logps/rejected": -431.2196044921875, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": -1.1881355047225952, "rewards/margins": 3.963468074798584, "rewards/rejected": -5.151603698730469, "step": 78 }, { "epoch": 0.3511111111111111, "grad_norm": 4.756207466125488, "learning_rate": 4.790792261217512e-05, "logits/chosen": 2.1176319122314453, "logits/rejected": 2.1045117378234863, "logps/chosen": -309.7745666503906, "logps/rejected": -279.74969482421875, "loss": 0.5587, "rewards/accuracies": 0.5, "rewards/chosen": -1.1116164922714233, "rewards/margins": 0.3822830319404602, "rewards/rejected": -1.4938995838165283, "step": 79 }, { "epoch": 0.35555555555555557, "grad_norm": 0.1574506163597107, "learning_rate": 4.783364618091803e-05, "logits/chosen": 2.228512763977051, "logits/rejected": 2.239095687866211, "logps/chosen": -452.6519470214844, "logps/rejected": -519.6800537109375, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -2.023036241531372, "rewards/margins": 5.381681442260742, "rewards/rejected": -7.404717922210693, "step": 80 }, { "epoch": 0.36, "grad_norm": 17.396940231323242, "learning_rate": 4.7758133752983135e-05, "logits/chosen": 2.2299280166625977, "logits/rejected": 2.275631904602051, "logps/chosen": -481.376708984375, "logps/rejected": -424.80029296875, "loss": 1.0849, "rewards/accuracies": 0.5, "rewards/chosen": -4.431848049163818, "rewards/margins": -0.2735259532928467, "rewards/rejected": -4.158322334289551, "step": 81 }, { "epoch": 0.36444444444444446, "grad_norm": 2.288203477859497, "learning_rate": 4.7681389415895864e-05, "logits/chosen": 1.9121689796447754, "logits/rejected": 1.9124395847320557, "logps/chosen": -365.42669677734375, "logps/rejected": -389.65118408203125, "loss": 0.3228, "rewards/accuracies": 1.0, "rewards/chosen": -2.7858033180236816, "rewards/margins": 1.5727282762527466, "rewards/rejected": -4.358531475067139, "step": 82 }, { "epoch": 0.3688888888888889, "grad_norm": 17.783615112304688, "learning_rate": 4.7603417323865547e-05, "logits/chosen": 2.15109920501709, "logits/rejected": 2.273561954498291, "logps/chosen": -492.3847351074219, "logps/rejected": -423.0718994140625, "loss": 2.6581, "rewards/accuracies": 0.5, "rewards/chosen": -4.891546726226807, "rewards/margins": -1.6526780128479004, "rewards/rejected": -3.2388687133789062, "step": 83 }, { "epoch": 0.37333333333333335, "grad_norm": 3.1931591033935547, "learning_rate": 4.752422169756048e-05, "logits/chosen": 2.216590404510498, "logits/rejected": 2.2690138816833496, "logps/chosen": -395.58953857421875, "logps/rejected": -360.23663330078125, "loss": 0.2526, "rewards/accuracies": 1.0, "rewards/chosen": -1.509881615638733, "rewards/margins": 2.4663939476013184, "rewards/rejected": -3.976275682449341, "step": 84 }, { "epoch": 0.37777777777777777, "grad_norm": 0.4893577992916107, "learning_rate": 4.74438068238795e-05, "logits/chosen": 2.0717084407806396, "logits/rejected": 2.0398921966552734, "logps/chosen": -313.4667053222656, "logps/rejected": -517.9127197265625, "loss": 0.023, "rewards/accuracies": 1.0, "rewards/chosen": -1.5163437128067017, "rewards/margins": 4.198652744293213, "rewards/rejected": -5.714996337890625, "step": 85 }, { "epoch": 0.38222222222222224, "grad_norm": 12.34611988067627, "learning_rate": 4.736217705571989e-05, "logits/chosen": 1.8056581020355225, "logits/rejected": 1.906313180923462, "logps/chosen": -328.50921630859375, "logps/rejected": -252.91552734375, "loss": 1.6628, "rewards/accuracies": 0.0, "rewards/chosen": -3.34942626953125, "rewards/margins": -1.445077657699585, "rewards/rejected": -1.9043487310409546, "step": 86 }, { "epoch": 0.38666666666666666, "grad_norm": 11.105749130249023, "learning_rate": 4.7279336811741806e-05, "logits/chosen": 2.4057044982910156, "logits/rejected": 2.336398124694824, "logps/chosen": -602.2092895507812, "logps/rejected": -557.8479614257812, "loss": 0.6199, "rewards/accuracies": 0.5, "rewards/chosen": -3.7743942737579346, "rewards/margins": 0.2018601894378662, "rewards/rejected": -3.976254463195801, "step": 87 }, { "epoch": 0.39111111111111113, "grad_norm": 17.08523941040039, "learning_rate": 4.7195290576129034e-05, "logits/chosen": 2.213070869445801, "logits/rejected": 2.195730686187744, "logps/chosen": -453.27703857421875, "logps/rejected": -573.7431640625, "loss": 2.1029, "rewards/accuracies": 0.5, "rewards/chosen": -4.91035795211792, "rewards/margins": 0.73905348777771, "rewards/rejected": -5.649411201477051, "step": 88 }, { "epoch": 0.39555555555555555, "grad_norm": 1.3783494234085083, "learning_rate": 4.711004289834632e-05, "logits/chosen": 2.123533248901367, "logits/rejected": 2.0941734313964844, "logps/chosen": -282.4661865234375, "logps/rejected": -387.58892822265625, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": -1.8440461158752441, "rewards/margins": 2.256805419921875, "rewards/rejected": -4.100851535797119, "step": 89 }, { "epoch": 0.4, "grad_norm": 2.8574769496917725, "learning_rate": 4.702359839289306e-05, "logits/chosen": 2.0068724155426025, "logits/rejected": 2.0709903240203857, "logps/chosen": -362.0110168457031, "logps/rejected": -384.2983093261719, "loss": 0.2747, "rewards/accuracies": 1.0, "rewards/chosen": -2.201084852218628, "rewards/margins": 1.2602746486663818, "rewards/rejected": -3.4613595008850098, "step": 90 }, { "epoch": 0.4, "eval_logits/chosen": 2.161973237991333, "eval_logits/rejected": 2.1175014972686768, "eval_logps/chosen": -310.35650634765625, "eval_logps/rejected": -351.593994140625, "eval_loss": 0.5963193774223328, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -2.006390333175659, "eval_rewards/margins": 1.5713260173797607, "eval_rewards/rejected": -3.57771635055542, "eval_runtime": 17.4029, "eval_samples_per_second": 2.873, "eval_steps_per_second": 0.402, "step": 90 }, { "epoch": 0.40444444444444444, "grad_norm": 12.335380554199219, "learning_rate": 4.693596173905352e-05, "logits/chosen": 2.364140272140503, "logits/rejected": 2.410036563873291, "logps/chosen": -285.2010192871094, "logps/rejected": -307.5989074707031, "loss": 1.0673, "rewards/accuracies": 0.5, "rewards/chosen": -2.113780975341797, "rewards/margins": -0.5538902282714844, "rewards/rejected": -1.5598907470703125, "step": 91 }, { "epoch": 0.4088888888888889, "grad_norm": 15.816446304321289, "learning_rate": 4.684713768064357e-05, "logits/chosen": 1.9842954874038696, "logits/rejected": 2.057584762573242, "logps/chosen": -406.9105224609375, "logps/rejected": -411.98211669921875, "loss": 1.7238, "rewards/accuracies": 0.0, "rewards/chosen": -5.087683200836182, "rewards/margins": -1.399601697921753, "rewards/rejected": -3.6880815029144287, "step": 92 }, { "epoch": 0.41333333333333333, "grad_norm": 9.741644859313965, "learning_rate": 4.6757131025753886e-05, "logits/chosen": 1.788228988647461, "logits/rejected": 1.8304262161254883, "logps/chosen": -257.06671142578125, "logps/rejected": -334.83636474609375, "loss": 0.9151, "rewards/accuracies": 0.5, "rewards/chosen": -2.2510857582092285, "rewards/margins": 0.16967010498046875, "rewards/rejected": -2.4207558631896973, "step": 93 }, { "epoch": 0.4177777777777778, "grad_norm": 2.8402011394500732, "learning_rate": 4.666594664648965e-05, "logits/chosen": 2.0854671001434326, "logits/rejected": 2.104097366333008, "logps/chosen": -244.27151489257812, "logps/rejected": -314.5588073730469, "loss": 0.3846, "rewards/accuracies": 0.5, "rewards/chosen": -0.4888412654399872, "rewards/margins": 1.8905991315841675, "rewards/rejected": -2.3794403076171875, "step": 94 }, { "epoch": 0.4222222222222222, "grad_norm": 8.729580879211426, "learning_rate": 4.657358947870691e-05, "logits/chosen": 2.1040725708007812, "logits/rejected": 2.0335569381713867, "logps/chosen": -278.30023193359375, "logps/rejected": -244.13815307617188, "loss": 1.8136, "rewards/accuracies": 0.5, "rewards/chosen": -2.41619873046875, "rewards/margins": -0.721272349357605, "rewards/rejected": -1.6949265003204346, "step": 95 }, { "epoch": 0.4266666666666667, "grad_norm": 10.822171211242676, "learning_rate": 4.648006452174529e-05, "logits/chosen": 2.428173542022705, "logits/rejected": 2.208796977996826, "logps/chosen": -400.2300720214844, "logps/rejected": -370.8153381347656, "loss": 1.0031, "rewards/accuracies": 0.0, "rewards/chosen": -3.485565185546875, "rewards/margins": -0.5405601859092712, "rewards/rejected": -2.945004940032959, "step": 96 }, { "epoch": 0.4311111111111111, "grad_norm": 0.6067838668823242, "learning_rate": 4.638537683815744e-05, "logits/chosen": 2.0516138076782227, "logits/rejected": 2.128382682800293, "logps/chosen": -282.2442321777344, "logps/rejected": -405.89276123046875, "loss": 0.0315, "rewards/accuracies": 1.0, "rewards/chosen": -0.78936767578125, "rewards/margins": 3.77711820602417, "rewards/rejected": -4.56648588180542, "step": 97 }, { "epoch": 0.43555555555555553, "grad_norm": 13.154459953308105, "learning_rate": 4.628953155343499e-05, "logits/chosen": 2.1049790382385254, "logits/rejected": 1.929673433303833, "logps/chosen": -305.3583984375, "logps/rejected": -173.13394165039062, "loss": 0.9443, "rewards/accuracies": 0.0, "rewards/chosen": -2.3848648071289062, "rewards/margins": -0.4350753426551819, "rewards/rejected": -1.9497895240783691, "step": 98 }, { "epoch": 0.44, "grad_norm": 14.330009460449219, "learning_rate": 4.6192533855731114e-05, "logits/chosen": 2.194329261779785, "logits/rejected": 2.2239904403686523, "logps/chosen": -411.20849609375, "logps/rejected": -397.444580078125, "loss": 1.1319, "rewards/accuracies": 0.5, "rewards/chosen": -3.436021566390991, "rewards/margins": 0.39365994930267334, "rewards/rejected": -3.829681396484375, "step": 99 }, { "epoch": 0.4444444444444444, "grad_norm": 7.170289993286133, "learning_rate": 4.609438899557964e-05, "logits/chosen": 2.3336665630340576, "logits/rejected": 2.3739962577819824, "logps/chosen": -443.4410095214844, "logps/rejected": -623.44384765625, "loss": 0.3145, "rewards/accuracies": 1.0, "rewards/chosen": -3.313551425933838, "rewards/margins": 2.143171787261963, "rewards/rejected": -5.456723213195801, "step": 100 }, { "epoch": 0.4488888888888889, "grad_norm": 1.4472028017044067, "learning_rate": 4.5995102285610906e-05, "logits/chosen": 2.0881364345550537, "logits/rejected": 2.0009138584136963, "logps/chosen": -430.8775634765625, "logps/rejected": -477.1756591796875, "loss": 0.0688, "rewards/accuracies": 1.0, "rewards/chosen": -2.0700042247772217, "rewards/margins": 3.3247146606445312, "rewards/rejected": -5.394719123840332, "step": 101 }, { "epoch": 0.4533333333333333, "grad_norm": 4.1629252433776855, "learning_rate": 4.589467910026411e-05, "logits/chosen": 1.530840516090393, "logits/rejected": 1.5147547721862793, "logps/chosen": -123.47633361816406, "logps/rejected": -148.20376586914062, "loss": 0.837, "rewards/accuracies": 0.5, "rewards/chosen": -0.8232139348983765, "rewards/margins": -0.18887022137641907, "rewards/rejected": -0.6343437433242798, "step": 102 }, { "epoch": 0.4577777777777778, "grad_norm": 2.2841246128082275, "learning_rate": 4.579312487549649e-05, "logits/chosen": 1.9281361103057861, "logits/rejected": 2.022286891937256, "logps/chosen": -349.215576171875, "logps/rejected": -505.2444763183594, "loss": 0.1805, "rewards/accuracies": 1.0, "rewards/chosen": -1.0184952020645142, "rewards/margins": 3.793022394180298, "rewards/rejected": -4.811517715454102, "step": 103 }, { "epoch": 0.4622222222222222, "grad_norm": 5.752498149871826, "learning_rate": 4.5690445108488964e-05, "logits/chosen": 2.1871137619018555, "logits/rejected": 2.213275909423828, "logps/chosen": -212.89849853515625, "logps/rejected": -298.1326599121094, "loss": 0.4131, "rewards/accuracies": 1.0, "rewards/chosen": -1.0786117315292358, "rewards/margins": 0.7388886213302612, "rewards/rejected": -1.817500352859497, "step": 104 }, { "epoch": 0.4666666666666667, "grad_norm": 2.0671677589416504, "learning_rate": 4.5586645357348636e-05, "logits/chosen": 1.9253795146942139, "logits/rejected": 1.9396390914916992, "logps/chosen": -283.0425720214844, "logps/rejected": -399.41876220703125, "loss": 0.2595, "rewards/accuracies": 1.0, "rewards/chosen": -0.7986934781074524, "rewards/margins": 3.9111409187316895, "rewards/rejected": -4.709834098815918, "step": 105 }, { "epoch": 0.4711111111111111, "grad_norm": 2.6245131492614746, "learning_rate": 4.548173124080789e-05, "logits/chosen": 2.175868511199951, "logits/rejected": 2.2268357276916504, "logps/chosen": -401.7663269042969, "logps/rejected": -339.01263427734375, "loss": 0.2061, "rewards/accuracies": 1.0, "rewards/chosen": -1.636662244796753, "rewards/margins": 2.116255283355713, "rewards/rejected": -3.752917528152466, "step": 106 }, { "epoch": 0.47555555555555556, "grad_norm": 6.8939642906188965, "learning_rate": 4.5375708437920284e-05, "logits/chosen": 2.150783061981201, "logits/rejected": 2.192080020904541, "logps/chosen": -327.8550720214844, "logps/rejected": -408.950439453125, "loss": 0.4908, "rewards/accuracies": 0.5, "rewards/chosen": -2.347644090652466, "rewards/margins": 1.237553358078003, "rewards/rejected": -3.5851974487304688, "step": 107 }, { "epoch": 0.48, "grad_norm": 3.782555103302002, "learning_rate": 4.526858268775313e-05, "logits/chosen": 1.875314712524414, "logits/rejected": 1.9068584442138672, "logps/chosen": -265.00701904296875, "logps/rejected": -323.212890625, "loss": 0.2026, "rewards/accuracies": 1.0, "rewards/chosen": -1.4442001581192017, "rewards/margins": 2.3625869750976562, "rewards/rejected": -3.8067870140075684, "step": 108 }, { "epoch": 0.48444444444444446, "grad_norm": 0.7733494639396667, "learning_rate": 4.516035978907681e-05, "logits/chosen": 1.999725103378296, "logits/rejected": 1.9631330966949463, "logps/chosen": -347.4098815917969, "logps/rejected": -423.82171630859375, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": -1.3635421991348267, "rewards/margins": 3.0967469215393066, "rewards/rejected": -4.460289001464844, "step": 109 }, { "epoch": 0.4888888888888889, "grad_norm": 2.2969586849212646, "learning_rate": 4.50510456000509e-05, "logits/chosen": 1.6264564990997314, "logits/rejected": 1.6307601928710938, "logps/chosen": -256.4902648925781, "logps/rejected": -350.5125427246094, "loss": 0.2928, "rewards/accuracies": 1.0, "rewards/chosen": -1.814366102218628, "rewards/margins": 3.2768468856811523, "rewards/rejected": -5.091212749481201, "step": 110 }, { "epoch": 0.49333333333333335, "grad_norm": 7.435751914978027, "learning_rate": 4.494064603790708e-05, "logits/chosen": 1.9676257371902466, "logits/rejected": 1.9142203330993652, "logps/chosen": -340.17498779296875, "logps/rejected": -344.9024658203125, "loss": 0.648, "rewards/accuracies": 0.5, "rewards/chosen": -1.670006513595581, "rewards/margins": 0.3124961853027344, "rewards/rejected": -1.9825026988983154, "step": 111 }, { "epoch": 0.49777777777777776, "grad_norm": 2.219264268875122, "learning_rate": 4.482916707862884e-05, "logits/chosen": 2.203705310821533, "logits/rejected": 2.050464153289795, "logps/chosen": -281.28857421875, "logps/rejected": -352.7940673828125, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": -0.9629112482070923, "rewards/margins": 2.2336831092834473, "rewards/rejected": -3.19659423828125, "step": 112 }, { "epoch": 0.5022222222222222, "grad_norm": 18.2978515625, "learning_rate": 4.471661475662792e-05, "logits/chosen": 1.856745719909668, "logits/rejected": 1.9189677238464355, "logps/chosen": -463.925048828125, "logps/rejected": -392.9632873535156, "loss": 2.6401, "rewards/accuracies": 0.5, "rewards/chosen": -5.859647750854492, "rewards/margins": -2.113811731338501, "rewards/rejected": -3.745835781097412, "step": 113 }, { "epoch": 0.5066666666666667, "grad_norm": 7.751908779144287, "learning_rate": 4.460299516441777e-05, "logits/chosen": 2.136542797088623, "logits/rejected": 2.011000156402588, "logps/chosen": -279.3929138183594, "logps/rejected": -225.48109436035156, "loss": 0.8985, "rewards/accuracies": 0.5, "rewards/chosen": -1.4863853454589844, "rewards/margins": 0.2738412022590637, "rewards/rejected": -1.7602264881134033, "step": 114 }, { "epoch": 0.5111111111111111, "grad_norm": 4.417598724365234, "learning_rate": 4.4488314452283675e-05, "logits/chosen": 1.400985836982727, "logits/rejected": 1.5235412120819092, "logps/chosen": -130.8572235107422, "logps/rejected": -137.48133850097656, "loss": 0.6302, "rewards/accuracies": 0.5, "rewards/chosen": -0.7593280673027039, "rewards/margins": 0.35355114936828613, "rewards/rejected": -1.1128792762756348, "step": 115 }, { "epoch": 0.5155555555555555, "grad_norm": 3.6468398571014404, "learning_rate": 4.437257882794991e-05, "logits/chosen": 2.243985414505005, "logits/rejected": 2.1406588554382324, "logps/chosen": -485.3035583496094, "logps/rejected": -437.01959228515625, "loss": 0.308, "rewards/accuracies": 1.0, "rewards/chosen": -2.8883087635040283, "rewards/margins": 1.3152587413787842, "rewards/rejected": -4.2035675048828125, "step": 116 }, { "epoch": 0.52, "grad_norm": 3.9299352169036865, "learning_rate": 4.425579455624364e-05, "logits/chosen": 1.9169459342956543, "logits/rejected": 1.82602858543396, "logps/chosen": -202.2696075439453, "logps/rejected": -183.041015625, "loss": 0.5871, "rewards/accuracies": 0.5, "rewards/chosen": -0.4907638430595398, "rewards/margins": 0.28335878252983093, "rewards/rejected": -0.7741226553916931, "step": 117 }, { "epoch": 0.5244444444444445, "grad_norm": 5.041739463806152, "learning_rate": 4.413796795875586e-05, "logits/chosen": 1.7944426536560059, "logits/rejected": 1.8221161365509033, "logps/chosen": -212.9854278564453, "logps/rejected": -245.502197265625, "loss": 0.6075, "rewards/accuracies": 0.5, "rewards/chosen": -1.522188663482666, "rewards/margins": 0.5173491835594177, "rewards/rejected": -2.0395379066467285, "step": 118 }, { "epoch": 0.5288888888888889, "grad_norm": 2.375946044921875, "learning_rate": 4.4019105413499164e-05, "logits/chosen": 2.1719205379486084, "logits/rejected": 2.069880962371826, "logps/chosen": -417.11004638671875, "logps/rejected": -380.632080078125, "loss": 0.1328, "rewards/accuracies": 1.0, "rewards/chosen": -2.728323459625244, "rewards/margins": 2.0067780017852783, "rewards/rejected": -4.735101699829102, "step": 119 }, { "epoch": 0.5333333333333333, "grad_norm": 4.835366725921631, "learning_rate": 4.389921335456253e-05, "logits/chosen": 2.228755474090576, "logits/rejected": 2.121194839477539, "logps/chosen": -456.9483337402344, "logps/rejected": -443.54644775390625, "loss": 0.4242, "rewards/accuracies": 0.5, "rewards/chosen": -1.9684982299804688, "rewards/margins": 3.314952850341797, "rewards/rejected": -5.283451080322266, "step": 120 }, { "epoch": 0.5377777777777778, "grad_norm": 9.487476348876953, "learning_rate": 4.3778298271762995e-05, "logits/chosen": 1.9492430686950684, "logits/rejected": 1.8923718929290771, "logps/chosen": -367.70208740234375, "logps/rejected": -292.9391174316406, "loss": 1.1745, "rewards/accuracies": 0.5, "rewards/chosen": -2.854750156402588, "rewards/margins": -0.488888680934906, "rewards/rejected": -2.365861415863037, "step": 121 }, { "epoch": 0.5422222222222223, "grad_norm": 8.035406112670898, "learning_rate": 4.365636671029445e-05, "logits/chosen": 1.636220097541809, "logits/rejected": 1.6882154941558838, "logps/chosen": -239.27474975585938, "logps/rejected": -248.72625732421875, "loss": 0.7769, "rewards/accuracies": 0.5, "rewards/chosen": -1.7846871614456177, "rewards/margins": 0.6797889471054077, "rewards/rejected": -2.4644761085510254, "step": 122 }, { "epoch": 0.5466666666666666, "grad_norm": 5.362706184387207, "learning_rate": 4.3533425270373216e-05, "logits/chosen": 2.0839271545410156, "logits/rejected": 2.0972325801849365, "logps/chosen": -387.6666564941406, "logps/rejected": -393.9215393066406, "loss": 0.2218, "rewards/accuracies": 1.0, "rewards/chosen": -2.9992005825042725, "rewards/margins": 3.244002342224121, "rewards/rejected": -6.243203163146973, "step": 123 }, { "epoch": 0.5511111111111111, "grad_norm": 2.3104119300842285, "learning_rate": 4.340948060688088e-05, "logits/chosen": 1.8295419216156006, "logits/rejected": 1.8262109756469727, "logps/chosen": -224.88113403320312, "logps/rejected": -227.4783477783203, "loss": 0.3689, "rewards/accuracies": 1.0, "rewards/chosen": -0.5778324604034424, "rewards/margins": 0.8512080907821655, "rewards/rejected": -1.429040551185608, "step": 124 }, { "epoch": 0.5555555555555556, "grad_norm": 4.099146366119385, "learning_rate": 4.328453942900402e-05, "logits/chosen": 1.97019362449646, "logits/rejected": 1.9800690412521362, "logps/chosen": -287.9169921875, "logps/rejected": -343.14788818359375, "loss": 0.3836, "rewards/accuracies": 0.5, "rewards/chosen": -0.4385543763637543, "rewards/margins": 1.3104095458984375, "rewards/rejected": -1.7489639520645142, "step": 125 }, { "epoch": 0.56, "grad_norm": 4.081305503845215, "learning_rate": 4.3158608499871024e-05, "logits/chosen": 2.1010217666625977, "logits/rejected": 2.029930830001831, "logps/chosen": -325.297119140625, "logps/rejected": -344.98284912109375, "loss": 0.342, "rewards/accuracies": 1.0, "rewards/chosen": -1.5290114879608154, "rewards/margins": 0.9010803699493408, "rewards/rejected": -2.4300918579101562, "step": 126 }, { "epoch": 0.5644444444444444, "grad_norm": 0.4260007441043854, "learning_rate": 4.3031694636186e-05, "logits/chosen": 2.314997434616089, "logits/rejected": 2.244847059249878, "logps/chosen": -407.2716369628906, "logps/rejected": -440.58770751953125, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -0.8063247799873352, "rewards/margins": 4.322688579559326, "rewards/rejected": -5.1290130615234375, "step": 127 }, { "epoch": 0.5688888888888889, "grad_norm": 22.247989654541016, "learning_rate": 4.2903804707859835e-05, "logits/chosen": 1.9879422187805176, "logits/rejected": 2.056591749191284, "logps/chosen": -241.29771423339844, "logps/rejected": -261.1890563964844, "loss": 1.5621, "rewards/accuracies": 0.5, "rewards/chosen": -2.3791258335113525, "rewards/margins": -1.075968861579895, "rewards/rejected": -1.3031569719314575, "step": 128 }, { "epoch": 0.5733333333333334, "grad_norm": 3.094059705734253, "learning_rate": 4.2774945637638236e-05, "logits/chosen": 2.2621870040893555, "logits/rejected": 2.235694408416748, "logps/chosen": -403.2272644042969, "logps/rejected": -458.10858154296875, "loss": 0.1943, "rewards/accuracies": 1.0, "rewards/chosen": -1.8431869745254517, "rewards/margins": 2.3183135986328125, "rewards/rejected": -4.161500453948975, "step": 129 }, { "epoch": 0.5777777777777777, "grad_norm": 6.682619094848633, "learning_rate": 4.2645124400727074e-05, "logits/chosen": 1.864232063293457, "logits/rejected": 1.8359177112579346, "logps/chosen": -260.30364990234375, "logps/rejected": -291.6693420410156, "loss": 1.0362, "rewards/accuracies": 0.5, "rewards/chosen": -1.559638261795044, "rewards/margins": 0.7007129788398743, "rewards/rejected": -2.2603511810302734, "step": 130 }, { "epoch": 0.5822222222222222, "grad_norm": 4.2206597328186035, "learning_rate": 4.251434802441476e-05, "logits/chosen": 2.0464115142822266, "logits/rejected": 2.0911216735839844, "logps/chosen": -243.08462524414062, "logps/rejected": -254.49130249023438, "loss": 0.5624, "rewards/accuracies": 1.0, "rewards/chosen": 0.2900955379009247, "rewards/margins": 0.2913353145122528, "rewards/rejected": -0.001239776611328125, "step": 131 }, { "epoch": 0.5866666666666667, "grad_norm": 3.9296364784240723, "learning_rate": 4.238262358769192e-05, "logits/chosen": 2.281747341156006, "logits/rejected": 2.3363983631134033, "logps/chosen": -285.22998046875, "logps/rejected": -372.3949279785156, "loss": 0.2539, "rewards/accuracies": 1.0, "rewards/chosen": -0.6220627427101135, "rewards/margins": 1.3760398626327515, "rewards/rejected": -1.9981026649475098, "step": 132 }, { "epoch": 0.5911111111111111, "grad_norm": 2.2730822563171387, "learning_rate": 4.224995822086812e-05, "logits/chosen": 2.2451581954956055, "logits/rejected": 2.2252092361450195, "logps/chosen": -394.37017822265625, "logps/rejected": -501.9224853515625, "loss": 0.1271, "rewards/accuracies": 1.0, "rewards/chosen": 0.13392946124076843, "rewards/margins": 4.8956298828125, "rewards/rejected": -4.76170015335083, "step": 133 }, { "epoch": 0.5955555555555555, "grad_norm": 2.509901523590088, "learning_rate": 4.211635910518595e-05, "logits/chosen": 1.6302995681762695, "logits/rejected": 1.689335823059082, "logps/chosen": -152.21542358398438, "logps/rejected": -137.0872802734375, "loss": 0.4905, "rewards/accuracies": 1.0, "rewards/chosen": -0.1873771846294403, "rewards/margins": 0.48279035091400146, "rewards/rejected": -0.6701675653457642, "step": 134 }, { "epoch": 0.6, "grad_norm": 5.595450401306152, "learning_rate": 4.198183347243233e-05, "logits/chosen": 2.134934663772583, "logits/rejected": 2.091696262359619, "logps/chosen": -333.56829833984375, "logps/rejected": -360.50970458984375, "loss": 0.3094, "rewards/accuracies": 1.0, "rewards/chosen": -0.9313689470291138, "rewards/margins": 2.534543037414551, "rewards/rejected": -3.465911865234375, "step": 135 }, { "epoch": 0.6, "eval_logits/chosen": 2.2205488681793213, "eval_logits/rejected": 2.174258232116699, "eval_logps/chosen": -303.35302734375, "eval_logps/rejected": -344.1379089355469, "eval_loss": 0.43841180205345154, "eval_rewards/accuracies": 0.8035714030265808, "eval_rewards/chosen": -1.3060392141342163, "eval_rewards/margins": 1.5260727405548096, "eval_rewards/rejected": -2.8321120738983154, "eval_runtime": 17.3865, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.403, "step": 135 }, { "epoch": 0.6044444444444445, "grad_norm": 1.6538218259811401, "learning_rate": 4.184638860454696e-05, "logits/chosen": 1.9310147762298584, "logits/rejected": 1.8604496717453003, "logps/chosen": -251.001708984375, "logps/rejected": -290.62005615234375, "loss": 0.1732, "rewards/accuracies": 1.0, "rewards/chosen": -0.33168870210647583, "rewards/margins": 1.7937004566192627, "rewards/rejected": -2.1253890991210938, "step": 136 }, { "epoch": 0.6088888888888889, "grad_norm": 3.1406407356262207, "learning_rate": 4.1710031833228225e-05, "logits/chosen": 1.7651350498199463, "logits/rejected": 1.8405566215515137, "logps/chosen": -175.06227111816406, "logps/rejected": -272.6947326660156, "loss": 0.1498, "rewards/accuracies": 1.0, "rewards/chosen": -0.38391417264938354, "rewards/margins": 1.8765029907226562, "rewards/rejected": -2.2604172229766846, "step": 137 }, { "epoch": 0.6133333333333333, "grad_norm": 4.962210178375244, "learning_rate": 4.157277053953631e-05, "logits/chosen": 2.104128837585449, "logits/rejected": 2.078892230987549, "logps/chosen": -259.95330810546875, "logps/rejected": -243.87139892578125, "loss": 0.3842, "rewards/accuracies": 1.0, "rewards/chosen": -0.8645896911621094, "rewards/margins": 1.2094483375549316, "rewards/rejected": -2.074038028717041, "step": 138 }, { "epoch": 0.6177777777777778, "grad_norm": 5.0981316566467285, "learning_rate": 4.143461215349361e-05, "logits/chosen": 2.3514866828918457, "logits/rejected": 2.3009910583496094, "logps/chosen": -429.4484558105469, "logps/rejected": -535.6287231445312, "loss": 0.2313, "rewards/accuracies": 1.0, "rewards/chosen": -2.3444290161132812, "rewards/margins": 2.3630645275115967, "rewards/rejected": -4.707493782043457, "step": 139 }, { "epoch": 0.6222222222222222, "grad_norm": 4.2106523513793945, "learning_rate": 4.129556415368261e-05, "logits/chosen": 2.048675060272217, "logits/rejected": 2.0247228145599365, "logps/chosen": -283.77264404296875, "logps/rejected": -262.07989501953125, "loss": 0.4932, "rewards/accuracies": 0.5, "rewards/chosen": -1.2498573064804077, "rewards/margins": 0.9712372422218323, "rewards/rejected": -2.2210946083068848, "step": 140 }, { "epoch": 0.6266666666666667, "grad_norm": 10.738434791564941, "learning_rate": 4.115563406684103e-05, "logits/chosen": 2.148074150085449, "logits/rejected": 2.117837905883789, "logps/chosen": -340.57489013671875, "logps/rejected": -366.0396423339844, "loss": 1.6829, "rewards/accuracies": 0.5, "rewards/chosen": -0.95953369140625, "rewards/margins": 0.4566100835800171, "rewards/rejected": -1.416143774986267, "step": 141 }, { "epoch": 0.6311111111111111, "grad_norm": 8.303956985473633, "learning_rate": 4.101482946745439e-05, "logits/chosen": 2.478304386138916, "logits/rejected": 2.3817453384399414, "logps/chosen": -477.30059814453125, "logps/rejected": -419.9468994140625, "loss": 0.4244, "rewards/accuracies": 1.0, "rewards/chosen": -2.896254062652588, "rewards/margins": 0.6844373941421509, "rewards/rejected": -3.5806915760040283, "step": 142 }, { "epoch": 0.6355555555555555, "grad_norm": 5.196935653686523, "learning_rate": 4.0873157977346e-05, "logits/chosen": 2.295231342315674, "logits/rejected": 2.320071220397949, "logps/chosen": -327.35858154296875, "logps/rejected": -336.46234130859375, "loss": 0.3213, "rewards/accuracies": 1.0, "rewards/chosen": 0.11004638671875, "rewards/margins": 1.7110825777053833, "rewards/rejected": -1.6010361909866333, "step": 143 }, { "epoch": 0.64, "grad_norm": 0.837842583656311, "learning_rate": 4.073062726526443e-05, "logits/chosen": 2.3723278045654297, "logits/rejected": 2.1718640327453613, "logps/chosen": -361.47161865234375, "logps/rejected": -342.56048583984375, "loss": 0.0281, "rewards/accuracies": 1.0, "rewards/chosen": -0.03143996000289917, "rewards/margins": 3.5597052574157715, "rewards/rejected": -3.5911452770233154, "step": 144 }, { "epoch": 0.6444444444444445, "grad_norm": 1.8959003686904907, "learning_rate": 4.058724504646834e-05, "logits/chosen": 2.1700668334960938, "logits/rejected": 2.0942935943603516, "logps/chosen": -257.6955261230469, "logps/rejected": -254.59393310546875, "loss": 0.2177, "rewards/accuracies": 1.0, "rewards/chosen": 0.6744873523712158, "rewards/margins": 1.8073105812072754, "rewards/rejected": -1.13282310962677, "step": 145 }, { "epoch": 0.6488888888888888, "grad_norm": 3.8877060413360596, "learning_rate": 4.044301908230889e-05, "logits/chosen": 2.336484909057617, "logits/rejected": 2.250220775604248, "logps/chosen": -329.42779541015625, "logps/rejected": -461.0723876953125, "loss": 0.2504, "rewards/accuracies": 1.0, "rewards/chosen": 1.0788795948028564, "rewards/margins": 5.8840203285217285, "rewards/rejected": -4.805140972137451, "step": 146 }, { "epoch": 0.6533333333333333, "grad_norm": 4.329055309295654, "learning_rate": 4.0297957179809586e-05, "logits/chosen": 1.7940289974212646, "logits/rejected": 1.8224772214889526, "logps/chosen": -218.82870483398438, "logps/rejected": -236.94589233398438, "loss": 0.6222, "rewards/accuracies": 0.5, "rewards/chosen": -1.5779021978378296, "rewards/margins": 0.4461887776851654, "rewards/rejected": -2.0240910053253174, "step": 147 }, { "epoch": 0.6577777777777778, "grad_norm": 1.4359842538833618, "learning_rate": 4.0152067191243696e-05, "logits/chosen": 1.9685239791870117, "logits/rejected": 2.0288798809051514, "logps/chosen": -357.14801025390625, "logps/rejected": -412.6529541015625, "loss": 0.0852, "rewards/accuracies": 1.0, "rewards/chosen": -2.2776429653167725, "rewards/margins": 2.4739623069763184, "rewards/rejected": -4.75160551071167, "step": 148 }, { "epoch": 0.6622222222222223, "grad_norm": 12.442399978637695, "learning_rate": 4.000535701370921e-05, "logits/chosen": 1.7401182651519775, "logits/rejected": 1.621551752090454, "logps/chosen": -320.3838195800781, "logps/rejected": -198.91424560546875, "loss": 1.6628, "rewards/accuracies": 0.5, "rewards/chosen": -2.767268419265747, "rewards/margins": -0.9375503659248352, "rewards/rejected": -1.8297181129455566, "step": 149 }, { "epoch": 0.6666666666666666, "grad_norm": 17.23033332824707, "learning_rate": 3.985783458870134e-05, "logits/chosen": 2.1593716144561768, "logits/rejected": 2.1524195671081543, "logps/chosen": -375.254150390625, "logps/rejected": -289.0406494140625, "loss": 1.0413, "rewards/accuracies": 0.0, "rewards/chosen": -2.9647598266601562, "rewards/margins": -0.5915945768356323, "rewards/rejected": -2.3731651306152344, "step": 150 }, { "epoch": 0.6711111111111111, "grad_norm": 0.4952673614025116, "learning_rate": 3.9709507901682675e-05, "logits/chosen": 2.376957893371582, "logits/rejected": 2.328672409057617, "logps/chosen": -484.62518310546875, "logps/rejected": -505.002197265625, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": -0.708740234375, "rewards/margins": 4.536779403686523, "rewards/rejected": -5.245519638061523, "step": 151 }, { "epoch": 0.6755555555555556, "grad_norm": 4.280153751373291, "learning_rate": 3.95603849816509e-05, "logits/chosen": 2.2534637451171875, "logits/rejected": 2.347764015197754, "logps/chosen": -311.7674560546875, "logps/rejected": -343.17584228515625, "loss": 0.2535, "rewards/accuracies": 1.0, "rewards/chosen": -0.9193252325057983, "rewards/margins": 1.2428758144378662, "rewards/rejected": -2.162200927734375, "step": 152 }, { "epoch": 0.68, "grad_norm": 9.200690269470215, "learning_rate": 3.941047390070419e-05, "logits/chosen": 2.2525882720947266, "logits/rejected": 2.196587324142456, "logps/chosen": -419.79833984375, "logps/rejected": -368.85174560546875, "loss": 0.8223, "rewards/accuracies": 0.5, "rewards/chosen": -1.9374847412109375, "rewards/margins": 0.6514175534248352, "rewards/rejected": -2.588902235031128, "step": 153 }, { "epoch": 0.6844444444444444, "grad_norm": 1.1576966047286987, "learning_rate": 3.925978277360428e-05, "logits/chosen": 2.2419962882995605, "logits/rejected": 2.240370750427246, "logps/chosen": -354.83514404296875, "logps/rejected": -398.2762451171875, "loss": 0.0467, "rewards/accuracies": 1.0, "rewards/chosen": -1.2429847717285156, "rewards/margins": 3.4657950401306152, "rewards/rejected": -4.708779811859131, "step": 154 }, { "epoch": 0.6888888888888889, "grad_norm": 4.058935642242432, "learning_rate": 3.910831975733717e-05, "logits/chosen": 2.4752016067504883, "logits/rejected": 2.4061837196350098, "logps/chosen": -374.433349609375, "logps/rejected": -460.99163818359375, "loss": 0.1247, "rewards/accuracies": 1.0, "rewards/chosen": -1.6075196266174316, "rewards/margins": 2.1309003829956055, "rewards/rejected": -3.738420009613037, "step": 155 }, { "epoch": 0.6933333333333334, "grad_norm": 10.329422950744629, "learning_rate": 3.895609305067162e-05, "logits/chosen": 2.1816141605377197, "logits/rejected": 2.1901464462280273, "logps/chosen": -354.7869873046875, "logps/rejected": -334.8106384277344, "loss": 1.1709, "rewards/accuracies": 0.0, "rewards/chosen": -1.6891334056854248, "rewards/margins": -0.787255048751831, "rewards/rejected": -0.9018783569335938, "step": 156 }, { "epoch": 0.6977777777777778, "grad_norm": 2.0047101974487305, "learning_rate": 3.8803110893715334e-05, "logits/chosen": 2.076343536376953, "logits/rejected": 2.1251060962677, "logps/chosen": -226.72457885742188, "logps/rejected": -348.7533874511719, "loss": 0.1027, "rewards/accuracies": 1.0, "rewards/chosen": -0.06249618902802467, "rewards/margins": 5.404012680053711, "rewards/rejected": -5.4665093421936035, "step": 157 }, { "epoch": 0.7022222222222222, "grad_norm": 8.143394470214844, "learning_rate": 3.864938156746891e-05, "logits/chosen": 2.237619400024414, "logits/rejected": 2.3040781021118164, "logps/chosen": -429.0187683105469, "logps/rejected": -341.12957763671875, "loss": 0.6455, "rewards/accuracies": 0.5, "rewards/chosen": -1.7430390119552612, "rewards/margins": 1.6202683448791504, "rewards/rejected": -3.363307237625122, "step": 158 }, { "epoch": 0.7066666666666667, "grad_norm": 2.048825979232788, "learning_rate": 3.849491339337758e-05, "logits/chosen": 2.246427297592163, "logits/rejected": 2.1864523887634277, "logps/chosen": -253.68792724609375, "logps/rejected": -255.00852966308594, "loss": 0.1813, "rewards/accuracies": 1.0, "rewards/chosen": -1.2091705799102783, "rewards/margins": 1.7127196788787842, "rewards/rejected": -2.9218902587890625, "step": 159 }, { "epoch": 0.7111111111111111, "grad_norm": 9.93562126159668, "learning_rate": 3.833971473288084e-05, "logits/chosen": 2.260481357574463, "logits/rejected": 2.2601776123046875, "logps/chosen": -375.98193359375, "logps/rejected": -420.81494140625, "loss": 0.6181, "rewards/accuracies": 1.0, "rewards/chosen": -3.0803894996643066, "rewards/margins": 0.15620207786560059, "rewards/rejected": -3.2365915775299072, "step": 160 }, { "epoch": 0.7155555555555555, "grad_norm": 3.1551766395568848, "learning_rate": 3.818379398695969e-05, "logits/chosen": 1.9815887212753296, "logits/rejected": 1.9442949295043945, "logps/chosen": -316.49468994140625, "logps/rejected": -369.946044921875, "loss": 0.132, "rewards/accuracies": 1.0, "rewards/chosen": -0.701190173625946, "rewards/margins": 2.1365509033203125, "rewards/rejected": -2.8377411365509033, "step": 161 }, { "epoch": 0.72, "grad_norm": 13.20207405090332, "learning_rate": 3.802715959568205e-05, "logits/chosen": 2.195608377456665, "logits/rejected": 2.118527889251709, "logps/chosen": -396.55755615234375, "logps/rejected": -423.4131164550781, "loss": 0.8681, "rewards/accuracies": 0.5, "rewards/chosen": -2.12845778465271, "rewards/margins": 2.663003444671631, "rewards/rejected": -4.791460990905762, "step": 162 }, { "epoch": 0.7244444444444444, "grad_norm": 1.2311620712280273, "learning_rate": 3.7869820037745776e-05, "logits/chosen": 2.1139190196990967, "logits/rejected": 2.142258644104004, "logps/chosen": -275.6441955566406, "logps/rejected": -338.9689636230469, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": -1.2249420881271362, "rewards/margins": 3.1651391983032227, "rewards/rejected": -4.39008092880249, "step": 163 }, { "epoch": 0.7288888888888889, "grad_norm": 18.346521377563477, "learning_rate": 3.771178383001976e-05, "logits/chosen": 2.330061435699463, "logits/rejected": 2.222029685974121, "logps/chosen": -468.13232421875, "logps/rejected": -425.3597412109375, "loss": 0.6944, "rewards/accuracies": 0.5, "rewards/chosen": -3.2604997158050537, "rewards/margins": 1.2873930931091309, "rewards/rejected": -4.5478925704956055, "step": 164 }, { "epoch": 0.7333333333333333, "grad_norm": 4.46051025390625, "learning_rate": 3.7553059527082913e-05, "logits/chosen": 2.2368054389953613, "logits/rejected": 2.2174384593963623, "logps/chosen": -287.6416931152344, "logps/rejected": -241.71742248535156, "loss": 0.4803, "rewards/accuracies": 0.5, "rewards/chosen": -1.1810874938964844, "rewards/margins": 0.794731855392456, "rewards/rejected": -1.9758193492889404, "step": 165 }, { "epoch": 0.7377777777777778, "grad_norm": 5.725604057312012, "learning_rate": 3.739365572076105e-05, "logits/chosen": 2.309138536453247, "logits/rejected": 2.297959327697754, "logps/chosen": -313.10382080078125, "logps/rejected": -440.99212646484375, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": -1.8803879022598267, "rewards/margins": 1.21652090549469, "rewards/rejected": -3.0969088077545166, "step": 166 }, { "epoch": 0.7422222222222222, "grad_norm": 13.70535945892334, "learning_rate": 3.7233581039661874e-05, "logits/chosen": 2.021416187286377, "logits/rejected": 2.0485005378723145, "logps/chosen": -308.900146484375, "logps/rejected": -373.67474365234375, "loss": 0.6832, "rewards/accuracies": 0.5, "rewards/chosen": -0.8083343505859375, "rewards/margins": 1.9991533756256104, "rewards/rejected": -2.807487726211548, "step": 167 }, { "epoch": 0.7466666666666667, "grad_norm": 5.6754560470581055, "learning_rate": 3.707284414870786e-05, "logits/chosen": 2.3587806224823, "logits/rejected": 2.424814224243164, "logps/chosen": -379.08697509765625, "logps/rejected": -437.82147216796875, "loss": 0.2897, "rewards/accuracies": 1.0, "rewards/chosen": -1.100062608718872, "rewards/margins": 1.1164734363555908, "rewards/rejected": -2.216536045074463, "step": 168 }, { "epoch": 0.7511111111111111, "grad_norm": 13.687298774719238, "learning_rate": 3.691145374866723e-05, "logits/chosen": 2.0991220474243164, "logits/rejected": 2.0921735763549805, "logps/chosen": -251.90283203125, "logps/rejected": -294.06097412109375, "loss": 1.2515, "rewards/accuracies": 0.5, "rewards/chosen": -3.1877260208129883, "rewards/margins": -0.7983794212341309, "rewards/rejected": -2.3893463611602783, "step": 169 }, { "epoch": 0.7555555555555555, "grad_norm": 2.953883647918701, "learning_rate": 3.6749418575683e-05, "logits/chosen": 1.9750038385391235, "logits/rejected": 1.9700895547866821, "logps/chosen": -258.9554748535156, "logps/rejected": -268.0426940917969, "loss": 0.107, "rewards/accuracies": 1.0, "rewards/chosen": -1.537645697593689, "rewards/margins": 2.264012336730957, "rewards/rejected": -3.8016581535339355, "step": 170 }, { "epoch": 0.76, "grad_norm": 2.0586588382720947, "learning_rate": 3.658674740080004e-05, "logits/chosen": 2.249845504760742, "logits/rejected": 2.1530356407165527, "logps/chosen": -373.0096740722656, "logps/rejected": -384.4673767089844, "loss": 0.147, "rewards/accuracies": 1.0, "rewards/chosen": 0.49631041288375854, "rewards/margins": 3.1010565757751465, "rewards/rejected": -2.6047463417053223, "step": 171 }, { "epoch": 0.7644444444444445, "grad_norm": 9.671707153320312, "learning_rate": 3.642344902949034e-05, "logits/chosen": 2.1441969871520996, "logits/rejected": 2.0829548835754395, "logps/chosen": -375.480224609375, "logps/rejected": -306.5086975097656, "loss": 0.558, "rewards/accuracies": 0.5, "rewards/chosen": -2.187718152999878, "rewards/margins": 0.452168345451355, "rewards/rejected": -2.6398866176605225, "step": 172 }, { "epoch": 0.7688888888888888, "grad_norm": 0.8413982391357422, "learning_rate": 3.6259532301176335e-05, "logits/chosen": 1.6508468389511108, "logits/rejected": 1.683258056640625, "logps/chosen": -302.1081848144531, "logps/rejected": -320.01141357421875, "loss": 0.1217, "rewards/accuracies": 1.0, "rewards/chosen": -0.08628615736961365, "rewards/margins": 3.430840492248535, "rewards/rejected": -3.5171265602111816, "step": 173 }, { "epoch": 0.7733333333333333, "grad_norm": 11.386160850524902, "learning_rate": 3.6095006088752447e-05, "logits/chosen": 2.2143354415893555, "logits/rejected": 2.283841848373413, "logps/chosen": -428.9851379394531, "logps/rejected": -506.7236328125, "loss": 0.4424, "rewards/accuracies": 1.0, "rewards/chosen": -3.3689346313476562, "rewards/margins": 0.6975066661834717, "rewards/rejected": -4.066441535949707, "step": 174 }, { "epoch": 0.7777777777777778, "grad_norm": 2.112983226776123, "learning_rate": 3.592987929810476e-05, "logits/chosen": 1.9571995735168457, "logits/rejected": 1.922455072402954, "logps/chosen": -289.2620849609375, "logps/rejected": -418.0646057128906, "loss": 0.1977, "rewards/accuracies": 1.0, "rewards/chosen": -0.3730583190917969, "rewards/margins": 4.838759422302246, "rewards/rejected": -5.211817741394043, "step": 175 }, { "epoch": 0.7822222222222223, "grad_norm": 2.702089786529541, "learning_rate": 3.576416086762896e-05, "logits/chosen": 1.9095101356506348, "logits/rejected": 1.9017338752746582, "logps/chosen": -258.8391418457031, "logps/rejected": -258.28948974609375, "loss": 0.1737, "rewards/accuracies": 1.0, "rewards/chosen": -0.08884277939796448, "rewards/margins": 2.2755606174468994, "rewards/rejected": -2.364403486251831, "step": 176 }, { "epoch": 0.7866666666666666, "grad_norm": 3.3134331703186035, "learning_rate": 3.5597859767746524e-05, "logits/chosen": 2.0989699363708496, "logits/rejected": 2.059138774871826, "logps/chosen": -253.71551513671875, "logps/rejected": -245.14794921875, "loss": 0.2049, "rewards/accuracies": 1.0, "rewards/chosen": -2.4205713272094727, "rewards/margins": 1.4892207384109497, "rewards/rejected": -3.909791946411133, "step": 177 }, { "epoch": 0.7911111111111111, "grad_norm": 0.9361621141433716, "learning_rate": 3.543098500041906e-05, "logits/chosen": 2.1984782218933105, "logits/rejected": 2.0851023197174072, "logps/chosen": -280.2938537597656, "logps/rejected": -311.8728332519531, "loss": 0.0635, "rewards/accuracies": 1.0, "rewards/chosen": 0.518353283405304, "rewards/margins": 2.7252793312072754, "rewards/rejected": -2.206925868988037, "step": 178 }, { "epoch": 0.7955555555555556, "grad_norm": 35.653995513916016, "learning_rate": 3.526354559866113e-05, "logits/chosen": 2.0546839237213135, "logits/rejected": 2.004641532897949, "logps/chosen": -270.5155334472656, "logps/rejected": -285.1980285644531, "loss": 3.7422, "rewards/accuracies": 0.5, "rewards/chosen": -3.489393711090088, "rewards/margins": -3.35947322845459, "rewards/rejected": -0.1299205720424652, "step": 179 }, { "epoch": 0.8, "grad_norm": 21.546016693115234, "learning_rate": 3.509555062605121e-05, "logits/chosen": 2.0889358520507812, "logits/rejected": 2.1452298164367676, "logps/chosen": -404.2633056640625, "logps/rejected": -504.6707458496094, "loss": 0.8305, "rewards/accuracies": 0.5, "rewards/chosen": -3.7308120727539062, "rewards/margins": 0.4282197952270508, "rewards/rejected": -4.159031867980957, "step": 180 }, { "epoch": 0.8, "eval_logits/chosen": 2.1556396484375, "eval_logits/rejected": 2.1094348430633545, "eval_logps/chosen": -310.0359191894531, "eval_logps/rejected": -357.9092712402344, "eval_loss": 0.4349474012851715, "eval_rewards/accuracies": 0.7857142686843872, "eval_rewards/chosen": -1.9743303060531616, "eval_rewards/margins": 2.2349183559417725, "eval_rewards/rejected": -4.2092485427856445, "eval_runtime": 17.3856, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.403, "step": 180 }, { "epoch": 0.8044444444444444, "grad_norm": 1.8578166961669922, "learning_rate": 3.492700917624113e-05, "logits/chosen": 1.87994384765625, "logits/rejected": 1.8280099630355835, "logps/chosen": -235.28530883789062, "logps/rejected": -234.61431884765625, "loss": 0.1254, "rewards/accuracies": 1.0, "rewards/chosen": -0.8087150454521179, "rewards/margins": 2.258847236633301, "rewards/rejected": -3.0675621032714844, "step": 181 }, { "epoch": 0.8088888888888889, "grad_norm": 1.5471972227096558, "learning_rate": 3.4757930372463775e-05, "logits/chosen": 1.9977684020996094, "logits/rejected": 1.8153365850448608, "logps/chosen": -311.0814208984375, "logps/rejected": -285.3901672363281, "loss": 0.0704, "rewards/accuracies": 1.0, "rewards/chosen": 0.35354921221733093, "rewards/margins": 2.717729091644287, "rewards/rejected": -2.364180088043213, "step": 182 }, { "epoch": 0.8133333333333334, "grad_norm": 32.15199279785156, "learning_rate": 3.458832336703929e-05, "logits/chosen": 1.8522934913635254, "logits/rejected": 1.7468159198760986, "logps/chosen": -580.458740234375, "logps/rejected": -361.63299560546875, "loss": 4.9968, "rewards/accuracies": 0.5, "rewards/chosen": -10.116351127624512, "rewards/margins": -4.255195617675781, "rewards/rejected": -5.8611555099487305, "step": 183 }, { "epoch": 0.8177777777777778, "grad_norm": 21.2011661529541, "learning_rate": 3.4418197340879635e-05, "logits/chosen": 2.1630630493164062, "logits/rejected": 2.1617729663848877, "logps/chosen": -496.76910400390625, "logps/rejected": -401.410888671875, "loss": 2.7916, "rewards/accuracies": 0.0, "rewards/chosen": -6.106187343597412, "rewards/margins": -2.5850629806518555, "rewards/rejected": -3.5211243629455566, "step": 184 }, { "epoch": 0.8222222222222222, "grad_norm": 3.49141263961792, "learning_rate": 3.4247561502991604e-05, "logits/chosen": 2.1068267822265625, "logits/rejected": 2.1182143688201904, "logps/chosen": -385.7279052734375, "logps/rejected": -520.2858276367188, "loss": 0.2494, "rewards/accuracies": 1.0, "rewards/chosen": -3.5602035522460938, "rewards/margins": 3.892549991607666, "rewards/rejected": -7.45275354385376, "step": 185 }, { "epoch": 0.8266666666666667, "grad_norm": 10.206978797912598, "learning_rate": 3.407642508997838e-05, "logits/chosen": 1.9911150932312012, "logits/rejected": 1.9793498516082764, "logps/chosen": -355.60675048828125, "logps/rejected": -286.39141845703125, "loss": 0.5621, "rewards/accuracies": 1.0, "rewards/chosen": -3.6045563220977783, "rewards/margins": 0.29973304271698, "rewards/rejected": -3.9042892456054688, "step": 186 }, { "epoch": 0.8311111111111111, "grad_norm": 3.9811224937438965, "learning_rate": 3.3904797365539514e-05, "logits/chosen": 1.9419519901275635, "logits/rejected": 1.943634033203125, "logps/chosen": -345.35076904296875, "logps/rejected": -388.5447998046875, "loss": 0.3847, "rewards/accuracies": 0.5, "rewards/chosen": 0.5699470639228821, "rewards/margins": 2.5326507091522217, "rewards/rejected": -1.9627037048339844, "step": 187 }, { "epoch": 0.8355555555555556, "grad_norm": 7.058280944824219, "learning_rate": 3.37326876199695e-05, "logits/chosen": 2.377598524093628, "logits/rejected": 2.3797459602355957, "logps/chosen": -341.9767150878906, "logps/rejected": -513.3931884765625, "loss": 0.197, "rewards/accuracies": 1.0, "rewards/chosen": -1.686824083328247, "rewards/margins": 2.547370672225952, "rewards/rejected": -4.234194755554199, "step": 188 }, { "epoch": 0.84, "grad_norm": 1.5856622457504272, "learning_rate": 3.356010516965486e-05, "logits/chosen": 1.8028912544250488, "logits/rejected": 1.8407230377197266, "logps/chosen": -206.43429565429688, "logps/rejected": -305.61212158203125, "loss": 0.1631, "rewards/accuracies": 1.0, "rewards/chosen": 0.5213749408721924, "rewards/margins": 4.116602420806885, "rewards/rejected": -3.5952274799346924, "step": 189 }, { "epoch": 0.8444444444444444, "grad_norm": 9.480084419250488, "learning_rate": 3.3387059356569875e-05, "logits/chosen": 2.0444135665893555, "logits/rejected": 2.0769548416137695, "logps/chosen": -260.6351318359375, "logps/rejected": -255.31646728515625, "loss": 1.054, "rewards/accuracies": 0.5, "rewards/chosen": 0.5492042303085327, "rewards/margins": -0.4221389889717102, "rewards/rejected": 0.9713432192802429, "step": 190 }, { "epoch": 0.8488888888888889, "grad_norm": 5.375828742980957, "learning_rate": 3.321355954777087e-05, "logits/chosen": 2.0929622650146484, "logits/rejected": 2.0118932723999023, "logps/chosen": -271.47088623046875, "logps/rejected": -324.151123046875, "loss": 0.3697, "rewards/accuracies": 1.0, "rewards/chosen": -0.7393569946289062, "rewards/margins": 1.4251999855041504, "rewards/rejected": -2.1645569801330566, "step": 191 }, { "epoch": 0.8533333333333334, "grad_norm": 11.164929389953613, "learning_rate": 3.3039615134889206e-05, "logits/chosen": 2.168374538421631, "logits/rejected": 2.050518035888672, "logps/chosen": -425.24169921875, "logps/rejected": -436.30914306640625, "loss": 0.9201, "rewards/accuracies": 0.5, "rewards/chosen": -2.633542060852051, "rewards/margins": -0.2216278314590454, "rewards/rejected": -2.411914110183716, "step": 192 }, { "epoch": 0.8577777777777778, "grad_norm": 12.873815536499023, "learning_rate": 3.286523553362287e-05, "logits/chosen": 2.097388505935669, "logits/rejected": 2.0157864093780518, "logps/chosen": -274.7909851074219, "logps/rejected": -251.3322296142578, "loss": 1.2622, "rewards/accuracies": 0.5, "rewards/chosen": -1.8827362060546875, "rewards/margins": 0.5675584077835083, "rewards/rejected": -2.4502944946289062, "step": 193 }, { "epoch": 0.8622222222222222, "grad_norm": 8.37449836730957, "learning_rate": 3.269043018322681e-05, "logits/chosen": 2.1862282752990723, "logits/rejected": 2.087134838104248, "logps/chosen": -295.71875, "logps/rejected": -301.2806396484375, "loss": 0.4823, "rewards/accuracies": 0.5, "rewards/chosen": -0.7563506960868835, "rewards/margins": 2.514920234680176, "rewards/rejected": -3.271270751953125, "step": 194 }, { "epoch": 0.8666666666666667, "grad_norm": 6.452545642852783, "learning_rate": 3.2515208546002e-05, "logits/chosen": 1.9793057441711426, "logits/rejected": 2.004866600036621, "logps/chosen": -242.0079803466797, "logps/rejected": -277.5906677246094, "loss": 0.4756, "rewards/accuracies": 0.5, "rewards/chosen": -1.457282304763794, "rewards/margins": 1.3829689025878906, "rewards/rejected": -2.8402512073516846, "step": 195 }, { "epoch": 0.8711111111111111, "grad_norm": 25.117042541503906, "learning_rate": 3.233958010678322e-05, "logits/chosen": 2.0711257457733154, "logits/rejected": 2.1235086917877197, "logps/chosen": -509.37188720703125, "logps/rejected": -579.4296875, "loss": 0.7721, "rewards/accuracies": 0.5, "rewards/chosen": -4.672909736633301, "rewards/margins": 1.30540931224823, "rewards/rejected": -5.97831916809082, "step": 196 }, { "epoch": 0.8755555555555555, "grad_norm": 2.4943253993988037, "learning_rate": 3.216355437242564e-05, "logits/chosen": 2.0617835521698, "logits/rejected": 2.025505781173706, "logps/chosen": -224.05841064453125, "logps/rejected": -297.93768310546875, "loss": 0.1517, "rewards/accuracies": 1.0, "rewards/chosen": -0.27078327536582947, "rewards/margins": 1.8932006359100342, "rewards/rejected": -2.1639838218688965, "step": 197 }, { "epoch": 0.88, "grad_norm": 1.4619039297103882, "learning_rate": 3.1987140871290236e-05, "logits/chosen": 1.9786646366119385, "logits/rejected": 1.9430062770843506, "logps/chosen": -194.2051544189453, "logps/rejected": -184.22048950195312, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": 0.5335159301757812, "rewards/margins": 2.0488877296447754, "rewards/rejected": -1.5153717994689941, "step": 198 }, { "epoch": 0.8844444444444445, "grad_norm": 7.211589336395264, "learning_rate": 3.181034915272797e-05, "logits/chosen": 2.10782527923584, "logits/rejected": 2.206753730773926, "logps/chosen": -336.33038330078125, "logps/rejected": -462.14654541015625, "loss": 0.8607, "rewards/accuracies": 0.5, "rewards/chosen": -0.9421745538711548, "rewards/margins": 3.117570161819458, "rewards/rejected": -4.059744358062744, "step": 199 }, { "epoch": 0.8888888888888888, "grad_norm": 2.614577054977417, "learning_rate": 3.1633188786562914e-05, "logits/chosen": 1.9760353565216064, "logits/rejected": 1.902787685394287, "logps/chosen": -248.97714233398438, "logps/rejected": -256.932861328125, "loss": 0.282, "rewards/accuracies": 1.0, "rewards/chosen": 1.1982636451721191, "rewards/margins": 2.316314697265625, "rewards/rejected": -1.1180511713027954, "step": 200 }, { "epoch": 0.8933333333333333, "grad_norm": 10.176268577575684, "learning_rate": 3.1455669362574214e-05, "logits/chosen": 1.9834389686584473, "logits/rejected": 1.794731855392456, "logps/chosen": -350.74102783203125, "logps/rejected": -293.0185241699219, "loss": 0.6404, "rewards/accuracies": 0.5, "rewards/chosen": -1.7628203630447388, "rewards/margins": 2.335862159729004, "rewards/rejected": -4.098682403564453, "step": 201 }, { "epoch": 0.8977777777777778, "grad_norm": 1.9432324171066284, "learning_rate": 3.1277800489977e-05, "logits/chosen": 1.688536286354065, "logits/rejected": 1.688530445098877, "logps/chosen": -238.6612548828125, "logps/rejected": -331.52606201171875, "loss": 0.1152, "rewards/accuracies": 1.0, "rewards/chosen": 0.12506788969039917, "rewards/margins": 3.6676583290100098, "rewards/rejected": -3.542590618133545, "step": 202 }, { "epoch": 0.9022222222222223, "grad_norm": 1.8283239603042603, "learning_rate": 3.1099591796902215e-05, "logits/chosen": 2.159648895263672, "logits/rejected": 2.12233567237854, "logps/chosen": -421.9817199707031, "logps/rejected": -420.61663818359375, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": -3.661656141281128, "rewards/margins": 2.6688404083251953, "rewards/rejected": -6.330496311187744, "step": 203 }, { "epoch": 0.9066666666666666, "grad_norm": 17.021015167236328, "learning_rate": 3.092105292987548e-05, "logits/chosen": 1.7743968963623047, "logits/rejected": 1.7986412048339844, "logps/chosen": -164.3032989501953, "logps/rejected": -203.1534881591797, "loss": 1.8414, "rewards/accuracies": 0.0, "rewards/chosen": -3.1501376628875732, "rewards/margins": -1.529021143913269, "rewards/rejected": -1.6211166381835938, "step": 204 }, { "epoch": 0.9111111111111111, "grad_norm": 12.92226791381836, "learning_rate": 3.07421935532949e-05, "logits/chosen": 1.701080083847046, "logits/rejected": 1.747081995010376, "logps/chosen": -134.5420379638672, "logps/rejected": -180.4077911376953, "loss": 0.7422, "rewards/accuracies": 0.5, "rewards/chosen": -1.9285032749176025, "rewards/margins": 0.21163922548294067, "rewards/rejected": -2.1401424407958984, "step": 205 }, { "epoch": 0.9155555555555556, "grad_norm": 12.34039306640625, "learning_rate": 3.056302334890786e-05, "logits/chosen": 2.1032023429870605, "logits/rejected": 2.1539621353149414, "logps/chosen": -291.208984375, "logps/rejected": -335.7510986328125, "loss": 0.5075, "rewards/accuracies": 0.5, "rewards/chosen": 0.06739044934511185, "rewards/margins": 1.4255584478378296, "rewards/rejected": -1.3581680059432983, "step": 206 }, { "epoch": 0.92, "grad_norm": 0.44443604350090027, "learning_rate": 3.03835520152871e-05, "logits/chosen": 2.3766026496887207, "logits/rejected": 2.2064499855041504, "logps/chosen": -357.09075927734375, "logps/rejected": -521.0325927734375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -0.4761963188648224, "rewards/margins": 5.920969009399414, "rewards/rejected": -6.397165298461914, "step": 207 }, { "epoch": 0.9244444444444444, "grad_norm": 1.3140870332717896, "learning_rate": 3.0203789267305567e-05, "logits/chosen": 2.188861131668091, "logits/rejected": 2.1991310119628906, "logps/chosen": -318.3990783691406, "logps/rejected": -407.42578125, "loss": 0.179, "rewards/accuracies": 1.0, "rewards/chosen": -1.08726167678833, "rewards/margins": 3.1875786781311035, "rewards/rejected": -4.274840354919434, "step": 208 }, { "epoch": 0.9288888888888889, "grad_norm": 12.631479263305664, "learning_rate": 3.002374483561064e-05, "logits/chosen": 2.1044745445251465, "logits/rejected": 2.079122543334961, "logps/chosen": -422.732421875, "logps/rejected": -583.081298828125, "loss": 0.3081, "rewards/accuracies": 1.0, "rewards/chosen": -2.848828077316284, "rewards/margins": 4.261569499969482, "rewards/rejected": -7.1103973388671875, "step": 209 }, { "epoch": 0.9333333333333333, "grad_norm": 17.813030242919922, "learning_rate": 2.9843428466097385e-05, "logits/chosen": 2.1924643516540527, "logits/rejected": 2.1515285968780518, "logps/chosen": -404.7247314453125, "logps/rejected": -385.24407958984375, "loss": 1.7449, "rewards/accuracies": 0.5, "rewards/chosen": -3.990626335144043, "rewards/margins": -0.6515921354293823, "rewards/rejected": -3.33903431892395, "step": 210 }, { "epoch": 0.9377777777777778, "grad_norm": 3.6667134761810303, "learning_rate": 2.9662849919380976e-05, "logits/chosen": 1.874267339706421, "logits/rejected": 1.896430492401123, "logps/chosen": -279.3658447265625, "logps/rejected": -271.2750244140625, "loss": 0.4278, "rewards/accuracies": 0.5, "rewards/chosen": -0.9690204858779907, "rewards/margins": 2.050572633743286, "rewards/rejected": -3.0195930004119873, "step": 211 }, { "epoch": 0.9422222222222222, "grad_norm": 21.304107666015625, "learning_rate": 2.9482018970268393e-05, "logits/chosen": 2.03654408454895, "logits/rejected": 2.1858012676239014, "logps/chosen": -307.8687744140625, "logps/rejected": -392.89447021484375, "loss": 1.358, "rewards/accuracies": 0.0, "rewards/chosen": -3.445587158203125, "rewards/margins": -1.0587692260742188, "rewards/rejected": -2.3868179321289062, "step": 212 }, { "epoch": 0.9466666666666667, "grad_norm": 2.9452733993530273, "learning_rate": 2.930094540722927e-05, "logits/chosen": 2.1484179496765137, "logits/rejected": 2.2047171592712402, "logps/chosen": -240.11737060546875, "logps/rejected": -360.74237060546875, "loss": 0.1765, "rewards/accuracies": 1.0, "rewards/chosen": -1.6234557628631592, "rewards/margins": 1.6575775146484375, "rewards/rejected": -3.2810332775115967, "step": 213 }, { "epoch": 0.9511111111111111, "grad_norm": 22.092178344726562, "learning_rate": 2.911963903186606e-05, "logits/chosen": 2.0055017471313477, "logits/rejected": 1.9135019779205322, "logps/chosen": -241.86856079101562, "logps/rejected": -239.4849395751953, "loss": 1.1702, "rewards/accuracies": 0.5, "rewards/chosen": -1.504570722579956, "rewards/margins": -0.4858473837375641, "rewards/rejected": -1.0187233686447144, "step": 214 }, { "epoch": 0.9555555555555556, "grad_norm": 0.4408101439476013, "learning_rate": 2.8938109658383454e-05, "logits/chosen": 2.263948440551758, "logits/rejected": 2.16243314743042, "logps/chosen": -397.47772216796875, "logps/rejected": -574.7552490234375, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -1.030035376548767, "rewards/margins": 8.122578620910645, "rewards/rejected": -9.152613639831543, "step": 215 }, { "epoch": 0.96, "grad_norm": 0.1885669231414795, "learning_rate": 2.8756367113057148e-05, "logits/chosen": 2.174750566482544, "logits/rejected": 2.1712284088134766, "logps/chosen": -379.27142333984375, "logps/rejected": -556.3659057617188, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.10250397026538849, "rewards/margins": 6.272984504699707, "rewards/rejected": -6.170480728149414, "step": 216 }, { "epoch": 0.9644444444444444, "grad_norm": 2.534093141555786, "learning_rate": 2.857442123370195e-05, "logits/chosen": 1.986964225769043, "logits/rejected": 1.985724925994873, "logps/chosen": -314.1033020019531, "logps/rejected": -273.2991943359375, "loss": 0.3304, "rewards/accuracies": 1.0, "rewards/chosen": -1.2605462074279785, "rewards/margins": 1.8220291137695312, "rewards/rejected": -3.0825753211975098, "step": 217 }, { "epoch": 0.9688888888888889, "grad_norm": 9.718185424804688, "learning_rate": 2.8392281869139213e-05, "logits/chosen": 2.056429386138916, "logits/rejected": 2.060886859893799, "logps/chosen": -310.82916259765625, "logps/rejected": -361.8159484863281, "loss": 0.8212, "rewards/accuracies": 0.5, "rewards/chosen": -2.2504210472106934, "rewards/margins": 2.31430721282959, "rewards/rejected": -4.564728736877441, "step": 218 }, { "epoch": 0.9733333333333334, "grad_norm": 13.960983276367188, "learning_rate": 2.8209958878663778e-05, "logits/chosen": 2.2462310791015625, "logits/rejected": 2.28263521194458, "logps/chosen": -465.261474609375, "logps/rejected": -421.1733093261719, "loss": 0.728, "rewards/accuracies": 0.5, "rewards/chosen": -5.611042022705078, "rewards/margins": -0.04042929410934448, "rewards/rejected": -5.570612907409668, "step": 219 }, { "epoch": 0.9777777777777777, "grad_norm": 9.203478813171387, "learning_rate": 2.8027462131510208e-05, "logits/chosen": 1.9416842460632324, "logits/rejected": 1.7857491970062256, "logps/chosen": -345.6712646484375, "logps/rejected": -251.47076416015625, "loss": 0.8191, "rewards/accuracies": 0.5, "rewards/chosen": -3.3168740272521973, "rewards/margins": -0.017238736152648926, "rewards/rejected": -3.299635410308838, "step": 220 }, { "epoch": 0.9822222222222222, "grad_norm": 1.7355222702026367, "learning_rate": 2.7844801506318617e-05, "logits/chosen": 2.240471363067627, "logits/rejected": 2.2063498497009277, "logps/chosen": -328.5857238769531, "logps/rejected": -420.4921875, "loss": 0.0914, "rewards/accuracies": 1.0, "rewards/chosen": -1.8121414184570312, "rewards/margins": 2.7087008953094482, "rewards/rejected": -4.5208420753479, "step": 221 }, { "epoch": 0.9866666666666667, "grad_norm": 1.7263237237930298, "learning_rate": 2.7661986890599943e-05, "logits/chosen": 1.7395219802856445, "logits/rejected": 1.784384846687317, "logps/chosen": -214.15451049804688, "logps/rejected": -284.48638916015625, "loss": 0.3769, "rewards/accuracies": 0.5, "rewards/chosen": -0.4917289614677429, "rewards/margins": 2.53422212600708, "rewards/rejected": -3.0259511470794678, "step": 222 }, { "epoch": 0.9911111111111112, "grad_norm": 2.090808629989624, "learning_rate": 2.747902818020067e-05, "logits/chosen": 2.032662868499756, "logits/rejected": 1.8991634845733643, "logps/chosen": -398.57904052734375, "logps/rejected": -394.4957275390625, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": -3.1145386695861816, "rewards/margins": 2.4051780700683594, "rewards/rejected": -5.519716739654541, "step": 223 }, { "epoch": 0.9955555555555555, "grad_norm": 0.48346611857414246, "learning_rate": 2.7295935278767233e-05, "logits/chosen": 2.2755024433135986, "logits/rejected": 2.3426759243011475, "logps/chosen": -392.48272705078125, "logps/rejected": -447.3723449707031, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -1.353735327720642, "rewards/margins": 4.4421586990356445, "rewards/rejected": -5.795893669128418, "step": 224 }, { "epoch": 1.0, "grad_norm": 19.339488983154297, "learning_rate": 2.711271809720986e-05, "logits/chosen": 2.270242214202881, "logits/rejected": 2.0706443786621094, "logps/chosen": -485.1645202636719, "logps/rejected": -381.00018310546875, "loss": 1.2152, "rewards/accuracies": 0.0, "rewards/chosen": -3.213038682937622, "rewards/margins": -0.8432999849319458, "rewards/rejected": -2.369738817214966, "step": 225 }, { "epoch": 1.0, "eval_logits/chosen": 2.1288914680480957, "eval_logits/rejected": 2.083587408065796, "eval_logps/chosen": -313.3813781738281, "eval_logps/rejected": -365.7991027832031, "eval_loss": 0.4423667788505554, "eval_rewards/accuracies": 0.8214285969734192, "eval_rewards/chosen": -2.3088743686676025, "eval_rewards/margins": 2.689358949661255, "eval_rewards/rejected": -4.998233318328857, "eval_runtime": 17.388, "eval_samples_per_second": 2.876, "eval_steps_per_second": 0.403, "step": 225 }, { "epoch": 1.0044444444444445, "grad_norm": 1.6202038526535034, "learning_rate": 2.6929386553166164e-05, "logits/chosen": 2.034777879714966, "logits/rejected": 1.9399088621139526, "logps/chosen": -270.7170104980469, "logps/rejected": -321.8533020019531, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": -0.10045319050550461, "rewards/margins": 4.125720977783203, "rewards/rejected": -4.226174354553223, "step": 226 }, { "epoch": 1.008888888888889, "grad_norm": 0.899365246295929, "learning_rate": 2.6745950570464212e-05, "logits/chosen": 1.856791377067566, "logits/rejected": 1.838975429534912, "logps/chosen": -187.313720703125, "logps/rejected": -206.15573120117188, "loss": 0.1061, "rewards/accuracies": 1.0, "rewards/chosen": -0.18259316682815552, "rewards/margins": 2.7951650619506836, "rewards/rejected": -2.9777581691741943, "step": 227 }, { "epoch": 1.0133333333333334, "grad_norm": 24.894893646240234, "learning_rate": 2.6562420078585433e-05, "logits/chosen": 2.2783782482147217, "logits/rejected": 2.3907415866851807, "logps/chosen": -562.4542846679688, "logps/rejected": -450.45098876953125, "loss": 2.5241, "rewards/accuracies": 0.5, "rewards/chosen": -8.394607543945312, "rewards/margins": -0.29980039596557617, "rewards/rejected": -8.094807624816895, "step": 228 }, { "epoch": 1.0177777777777777, "grad_norm": 10.114426612854004, "learning_rate": 2.637880501212705e-05, "logits/chosen": 2.3603391647338867, "logits/rejected": 2.353978157043457, "logps/chosen": -342.9703674316406, "logps/rejected": -352.3290710449219, "loss": 0.623, "rewards/accuracies": 0.5, "rewards/chosen": -1.0490471124649048, "rewards/margins": 2.3047902584075928, "rewards/rejected": -3.353837490081787, "step": 229 }, { "epoch": 1.0222222222222221, "grad_norm": 0.2897517681121826, "learning_rate": 2.619511531026436e-05, "logits/chosen": 2.2957711219787598, "logits/rejected": 2.2823054790496826, "logps/chosen": -380.21844482421875, "logps/rejected": -504.15045166015625, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -0.6248611211776733, "rewards/margins": 6.897383213043213, "rewards/rejected": -7.522244453430176, "step": 230 }, { "epoch": 1.0266666666666666, "grad_norm": 1.0437899827957153, "learning_rate": 2.6011360916212734e-05, "logits/chosen": 2.2502756118774414, "logits/rejected": 2.169267177581787, "logps/chosen": -248.31495666503906, "logps/rejected": -254.9483642578125, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": 0.568634033203125, "rewards/margins": 2.8449950218200684, "rewards/rejected": -2.2763609886169434, "step": 231 }, { "epoch": 1.031111111111111, "grad_norm": 2.103546142578125, "learning_rate": 2.5827551776689323e-05, "logits/chosen": 1.5411741733551025, "logits/rejected": 1.4888989925384521, "logps/chosen": -151.44955444335938, "logps/rejected": -160.1112518310547, "loss": 0.14, "rewards/accuracies": 1.0, "rewards/chosen": -0.6025131344795227, "rewards/margins": 2.4829578399658203, "rewards/rejected": -3.0854709148406982, "step": 232 }, { "epoch": 1.0355555555555556, "grad_norm": 2.739473581314087, "learning_rate": 2.564369784137472e-05, "logits/chosen": 1.7263026237487793, "logits/rejected": 1.783945083618164, "logps/chosen": -234.08197021484375, "logps/rejected": -273.28668212890625, "loss": 0.2093, "rewards/accuracies": 1.0, "rewards/chosen": -0.8093013763427734, "rewards/margins": 2.4584481716156006, "rewards/rejected": -3.267749547958374, "step": 233 }, { "epoch": 1.04, "grad_norm": 3.7832846641540527, "learning_rate": 2.54598090623743e-05, "logits/chosen": 1.8052504062652588, "logits/rejected": 1.766016960144043, "logps/chosen": -296.1597900390625, "logps/rejected": -256.9058532714844, "loss": 0.3296, "rewards/accuracies": 1.0, "rewards/chosen": -1.5597388744354248, "rewards/margins": 0.9551147222518921, "rewards/rejected": -2.5148537158966064, "step": 234 }, { "epoch": 1.0444444444444445, "grad_norm": 2.446428060531616, "learning_rate": 2.527589539367956e-05, "logits/chosen": 2.325028896331787, "logits/rejected": 2.232288360595703, "logps/chosen": -376.904052734375, "logps/rejected": -400.5955505371094, "loss": 0.0538, "rewards/accuracies": 1.0, "rewards/chosen": -3.687849521636963, "rewards/margins": 3.5299012660980225, "rewards/rejected": -7.217750549316406, "step": 235 }, { "epoch": 1.048888888888889, "grad_norm": 0.11488201469182968, "learning_rate": 2.50919667906293e-05, "logits/chosen": 1.7712184190750122, "logits/rejected": 1.815232515335083, "logps/chosen": -246.863037109375, "logps/rejected": -351.25238037109375, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.2323249876499176, "rewards/margins": 5.702582359313965, "rewards/rejected": -5.93490743637085, "step": 236 }, { "epoch": 1.0533333333333332, "grad_norm": 0.46934670209884644, "learning_rate": 2.4908033209370705e-05, "logits/chosen": 2.1205997467041016, "logits/rejected": 2.0082976818084717, "logps/chosen": -443.21771240234375, "logps/rejected": -437.1765441894531, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -0.834454357624054, "rewards/margins": 6.396831512451172, "rewards/rejected": -7.23128604888916, "step": 237 }, { "epoch": 1.0577777777777777, "grad_norm": 0.6756075620651245, "learning_rate": 2.4724104606320445e-05, "logits/chosen": 2.1717934608459473, "logits/rejected": 2.1605606079101562, "logps/chosen": -349.8464050292969, "logps/rejected": -456.8037109375, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": -0.42097780108451843, "rewards/margins": 6.1533660888671875, "rewards/rejected": -6.574343681335449, "step": 238 }, { "epoch": 1.0622222222222222, "grad_norm": 0.5172697901725769, "learning_rate": 2.4540190937625708e-05, "logits/chosen": 2.2337419986724854, "logits/rejected": 2.2302417755126953, "logps/chosen": -269.57037353515625, "logps/rejected": -465.9024658203125, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -1.4147706031799316, "rewards/margins": 4.075361728668213, "rewards/rejected": -5.4901323318481445, "step": 239 }, { "epoch": 1.0666666666666667, "grad_norm": 0.2905597984790802, "learning_rate": 2.4356302158625288e-05, "logits/chosen": 2.1833200454711914, "logits/rejected": 2.207943916320801, "logps/chosen": -326.02178955078125, "logps/rejected": -411.3348388671875, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 0.16143493354320526, "rewards/margins": 5.066305637359619, "rewards/rejected": -4.904870510101318, "step": 240 }, { "epoch": 1.0711111111111111, "grad_norm": 0.2205185443162918, "learning_rate": 2.4172448223310682e-05, "logits/chosen": 1.7023603916168213, "logits/rejected": 1.5953627824783325, "logps/chosen": -170.35665893554688, "logps/rejected": -241.34259033203125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -1.1372658014297485, "rewards/margins": 4.4668803215026855, "rewards/rejected": -5.6041460037231445, "step": 241 }, { "epoch": 1.0755555555555556, "grad_norm": 0.5335175395011902, "learning_rate": 2.3988639083787272e-05, "logits/chosen": 1.9782161712646484, "logits/rejected": 1.9465917348861694, "logps/chosen": -299.92816162109375, "logps/rejected": -304.3218994140625, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": 0.17397230863571167, "rewards/margins": 3.7744734287261963, "rewards/rejected": -3.600501298904419, "step": 242 }, { "epoch": 1.08, "grad_norm": 9.141175270080566, "learning_rate": 2.3804884689735642e-05, "logits/chosen": 2.1661500930786133, "logits/rejected": 2.1720974445343018, "logps/chosen": -267.519775390625, "logps/rejected": -319.2179260253906, "loss": 0.4535, "rewards/accuracies": 0.5, "rewards/chosen": -1.1500840187072754, "rewards/margins": 3.123685598373413, "rewards/rejected": -4.273769378662109, "step": 243 }, { "epoch": 1.0844444444444445, "grad_norm": 0.07122190296649933, "learning_rate": 2.3621194987872955e-05, "logits/chosen": 2.0959739685058594, "logits/rejected": 1.9889600276947021, "logps/chosen": -352.41363525390625, "logps/rejected": -451.14453125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.7250168323516846, "rewards/margins": 7.163854598999023, "rewards/rejected": -6.438838005065918, "step": 244 }, { "epoch": 1.0888888888888888, "grad_norm": 6.129761695861816, "learning_rate": 2.3437579921414573e-05, "logits/chosen": 2.127330780029297, "logits/rejected": 1.9886295795440674, "logps/chosen": -403.7293701171875, "logps/rejected": -486.6906433105469, "loss": 0.2336, "rewards/accuracies": 1.0, "rewards/chosen": -0.07745209336280823, "rewards/margins": 5.993661403656006, "rewards/rejected": -6.071113586425781, "step": 245 }, { "epoch": 1.0933333333333333, "grad_norm": 32.1196174621582, "learning_rate": 2.325404942953579e-05, "logits/chosen": 1.87540864944458, "logits/rejected": 1.9732717275619507, "logps/chosen": -407.6719665527344, "logps/rejected": -368.49078369140625, "loss": 5.0416, "rewards/accuracies": 0.0, "rewards/chosen": -7.963120937347412, "rewards/margins": -4.95430850982666, "rewards/rejected": -3.00881290435791, "step": 246 }, { "epoch": 1.0977777777777777, "grad_norm": 0.005643940530717373, "learning_rate": 2.3070613446833842e-05, "logits/chosen": 2.0901176929473877, "logits/rejected": 2.1729612350463867, "logps/chosen": -399.19500732421875, "logps/rejected": -582.9959716796875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.139540195465088, "rewards/margins": 9.500133514404297, "rewards/rejected": -11.639673233032227, "step": 247 }, { "epoch": 1.1022222222222222, "grad_norm": 13.798872947692871, "learning_rate": 2.288728190279014e-05, "logits/chosen": 2.2860894203186035, "logits/rejected": 2.2638816833496094, "logps/chosen": -482.7290954589844, "logps/rejected": -440.7649230957031, "loss": 1.9496, "rewards/accuracies": 0.5, "rewards/chosen": -2.7536072731018066, "rewards/margins": 3.122256278991699, "rewards/rejected": -5.875863552093506, "step": 248 }, { "epoch": 1.1066666666666667, "grad_norm": 0.43348434567451477, "learning_rate": 2.270406472123277e-05, "logits/chosen": 2.0960116386413574, "logits/rejected": 2.1119589805603027, "logps/chosen": -236.1478271484375, "logps/rejected": -304.01947021484375, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": -0.2858787775039673, "rewards/margins": 3.6734910011291504, "rewards/rejected": -3.959369659423828, "step": 249 }, { "epoch": 1.1111111111111112, "grad_norm": 3.9212396144866943, "learning_rate": 2.2520971819799328e-05, "logits/chosen": 2.0135841369628906, "logits/rejected": 1.9526537656784058, "logps/chosen": -181.21868896484375, "logps/rejected": -253.9673614501953, "loss": 0.2664, "rewards/accuracies": 1.0, "rewards/chosen": -0.40475767850875854, "rewards/margins": 1.227912187576294, "rewards/rejected": -1.6326699256896973, "step": 250 }, { "epoch": 1.1155555555555556, "grad_norm": 1.6736501455307007, "learning_rate": 2.2338013109400056e-05, "logits/chosen": 2.3246517181396484, "logits/rejected": 2.323885679244995, "logps/chosen": -365.00140380859375, "logps/rejected": -421.6229553222656, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 0.9462188482284546, "rewards/margins": 4.564491271972656, "rewards/rejected": -3.618272542953491, "step": 251 }, { "epoch": 1.12, "grad_norm": 20.39493179321289, "learning_rate": 2.215519849368138e-05, "logits/chosen": 2.3385281562805176, "logits/rejected": 2.2685837745666504, "logps/chosen": -561.4131469726562, "logps/rejected": -440.8316955566406, "loss": 0.8448, "rewards/accuracies": 0.5, "rewards/chosen": -7.6562395095825195, "rewards/margins": -0.15807795524597168, "rewards/rejected": -7.498161315917969, "step": 252 }, { "epoch": 1.1244444444444444, "grad_norm": 0.006911400239914656, "learning_rate": 2.1972537868489797e-05, "logits/chosen": 2.0427744388580322, "logits/rejected": 2.068326473236084, "logps/chosen": -424.22393798828125, "logps/rejected": -637.6040649414062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.107600450515747, "rewards/margins": 11.686300277709961, "rewards/rejected": -13.793901443481445, "step": 253 }, { "epoch": 1.1288888888888888, "grad_norm": 9.414591789245605, "learning_rate": 2.1790041121336225e-05, "logits/chosen": 1.8931891918182373, "logits/rejected": 1.7515565156936646, "logps/chosen": -474.2149353027344, "logps/rejected": -366.8948059082031, "loss": 0.4807, "rewards/accuracies": 0.5, "rewards/chosen": -4.580873012542725, "rewards/margins": 0.6344245672225952, "rewards/rejected": -5.215297698974609, "step": 254 }, { "epoch": 1.1333333333333333, "grad_norm": 0.31954923272132874, "learning_rate": 2.1607718130860782e-05, "logits/chosen": 2.2394070625305176, "logits/rejected": 2.203941822052002, "logps/chosen": -317.05987548828125, "logps/rejected": -365.3947448730469, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -0.9734389781951904, "rewards/margins": 4.665194988250732, "rewards/rejected": -5.638634204864502, "step": 255 }, { "epoch": 1.1377777777777778, "grad_norm": 0.7436378598213196, "learning_rate": 2.142557876629805e-05, "logits/chosen": 2.0395288467407227, "logits/rejected": 2.0879290103912354, "logps/chosen": -368.682373046875, "logps/rejected": -483.10504150390625, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": -2.940394639968872, "rewards/margins": 4.465216636657715, "rewards/rejected": -7.405611038208008, "step": 256 }, { "epoch": 1.1422222222222222, "grad_norm": 0.5857129096984863, "learning_rate": 2.124363288694285e-05, "logits/chosen": 2.189476251602173, "logits/rejected": 2.2042782306671143, "logps/chosen": -297.9048767089844, "logps/rejected": -462.4341125488281, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -2.4767074584960938, "rewards/margins": 5.004980564117432, "rewards/rejected": -7.481688022613525, "step": 257 }, { "epoch": 1.1466666666666667, "grad_norm": 1.8212025165557861, "learning_rate": 2.1061890341616558e-05, "logits/chosen": 2.0773448944091797, "logits/rejected": 1.953477144241333, "logps/chosen": -466.6849670410156, "logps/rejected": -428.0595703125, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": -5.071170330047607, "rewards/margins": 3.6165356636047363, "rewards/rejected": -8.687705993652344, "step": 258 }, { "epoch": 1.1511111111111112, "grad_norm": 0.49154847860336304, "learning_rate": 2.0880360968133954e-05, "logits/chosen": 1.9941173791885376, "logits/rejected": 1.903878927230835, "logps/chosen": -443.1854553222656, "logps/rejected": -469.02886962890625, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -2.136099338531494, "rewards/margins": 5.983541965484619, "rewards/rejected": -8.119641304016113, "step": 259 }, { "epoch": 1.1555555555555554, "grad_norm": 0.7263330817222595, "learning_rate": 2.0699054592770737e-05, "logits/chosen": 2.341273307800293, "logits/rejected": 2.2972702980041504, "logps/chosen": -383.4603271484375, "logps/rejected": -436.1636962890625, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -0.9301009774208069, "rewards/margins": 4.749177932739258, "rewards/rejected": -5.67927885055542, "step": 260 }, { "epoch": 1.16, "grad_norm": 1.4579602479934692, "learning_rate": 2.0517981029731616e-05, "logits/chosen": 2.224546432495117, "logits/rejected": 2.069706916809082, "logps/chosen": -441.466064453125, "logps/rejected": -530.3856201171875, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": -0.12378081679344177, "rewards/margins": 5.265193462371826, "rewards/rejected": -5.388974189758301, "step": 261 }, { "epoch": 1.1644444444444444, "grad_norm": 0.380569189786911, "learning_rate": 2.0337150080619033e-05, "logits/chosen": 1.961578130722046, "logits/rejected": 1.9327723979949951, "logps/chosen": -419.30328369140625, "logps/rejected": -416.2418212890625, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -1.4250718355178833, "rewards/margins": 6.134527683258057, "rewards/rejected": -7.55959939956665, "step": 262 }, { "epoch": 1.1688888888888889, "grad_norm": 13.600119590759277, "learning_rate": 2.0156571533902627e-05, "logits/chosen": 1.979763388633728, "logits/rejected": 1.8556675910949707, "logps/chosen": -295.2734069824219, "logps/rejected": -243.45123291015625, "loss": 0.5916, "rewards/accuracies": 0.5, "rewards/chosen": -3.295403242111206, "rewards/margins": 0.9111607074737549, "rewards/rejected": -4.206563949584961, "step": 263 }, { "epoch": 1.1733333333333333, "grad_norm": 16.93592071533203, "learning_rate": 1.997625516438937e-05, "logits/chosen": 2.4186158180236816, "logits/rejected": 2.3498375415802, "logps/chosen": -611.62841796875, "logps/rejected": -590.39306640625, "loss": 0.8758, "rewards/accuracies": 0.5, "rewards/chosen": -9.509602546691895, "rewards/margins": 1.1703383922576904, "rewards/rejected": -10.679941177368164, "step": 264 }, { "epoch": 1.1777777777777778, "grad_norm": 2.826754331588745, "learning_rate": 1.9796210732694442e-05, "logits/chosen": 1.9747998714447021, "logits/rejected": 1.9943749904632568, "logps/chosen": -265.30975341796875, "logps/rejected": -372.72296142578125, "loss": 0.0875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4182487726211548, "rewards/margins": 4.4431986808776855, "rewards/rejected": -5.861447334289551, "step": 265 }, { "epoch": 1.1822222222222223, "grad_norm": 5.334752082824707, "learning_rate": 1.9616447984712914e-05, "logits/chosen": 1.6797964572906494, "logits/rejected": 1.7042231559753418, "logps/chosen": -150.200439453125, "logps/rejected": -205.3989715576172, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": -1.8328521251678467, "rewards/margins": 1.797693133354187, "rewards/rejected": -3.630545139312744, "step": 266 }, { "epoch": 1.1866666666666668, "grad_norm": 4.4336323738098145, "learning_rate": 1.9436976651092144e-05, "logits/chosen": 1.883796215057373, "logits/rejected": 1.7938141822814941, "logps/chosen": -158.01724243164062, "logps/rejected": -183.39508056640625, "loss": 0.3805, "rewards/accuracies": 1.0, "rewards/chosen": 0.2773910462856293, "rewards/margins": 0.9651764035224915, "rewards/rejected": -0.6877853870391846, "step": 267 }, { "epoch": 1.1911111111111112, "grad_norm": 9.777835845947266, "learning_rate": 1.9257806446705116e-05, "logits/chosen": 2.2978739738464355, "logits/rejected": 2.2143282890319824, "logps/chosen": -346.0021667480469, "logps/rejected": -279.994384765625, "loss": 0.4109, "rewards/accuracies": 0.5, "rewards/chosen": -0.30205535888671875, "rewards/margins": 2.2577598094940186, "rewards/rejected": -2.5598151683807373, "step": 268 }, { "epoch": 1.1955555555555555, "grad_norm": 0.7445770502090454, "learning_rate": 1.9078947070124523e-05, "logits/chosen": 1.995645523071289, "logits/rejected": 2.0005688667297363, "logps/chosen": -301.2586669921875, "logps/rejected": -353.81927490234375, "loss": 0.0387, "rewards/accuracies": 1.0, "rewards/chosen": -0.5910667181015015, "rewards/margins": 3.9083282947540283, "rewards/rejected": -4.499395370483398, "step": 269 }, { "epoch": 1.2, "grad_norm": 7.762486457824707, "learning_rate": 1.8900408203097787e-05, "logits/chosen": 1.905322551727295, "logits/rejected": 1.7590628862380981, "logps/chosen": -250.0103759765625, "logps/rejected": -283.5964050292969, "loss": 0.2447, "rewards/accuracies": 1.0, "rewards/chosen": -1.9317550659179688, "rewards/margins": 3.9607110023498535, "rewards/rejected": -5.892466068267822, "step": 270 }, { "epoch": 1.2, "eval_logits/chosen": 2.1097989082336426, "eval_logits/rejected": 2.0639405250549316, "eval_logps/chosen": -318.4396057128906, "eval_logps/rejected": -374.51708984375, "eval_loss": 0.4267149865627289, "eval_rewards/accuracies": 0.8214285969734192, "eval_rewards/chosen": -2.8146941661834717, "eval_rewards/margins": 3.0553336143493652, "eval_rewards/rejected": -5.8700270652771, "eval_runtime": 17.407, "eval_samples_per_second": 2.872, "eval_steps_per_second": 0.402, "step": 270 }, { "epoch": 1.2044444444444444, "grad_norm": 1.0685392618179321, "learning_rate": 1.8722199510023012e-05, "logits/chosen": 2.049793004989624, "logits/rejected": 2.014737606048584, "logps/chosen": -357.0670166015625, "logps/rejected": -461.2732238769531, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": -4.979574680328369, "rewards/margins": 3.4011411666870117, "rewards/rejected": -8.380716323852539, "step": 271 }, { "epoch": 1.208888888888889, "grad_norm": 0.4621816575527191, "learning_rate": 1.854433063742579e-05, "logits/chosen": 2.1473259925842285, "logits/rejected": 2.1311964988708496, "logps/chosen": -233.38954162597656, "logps/rejected": -311.93170166015625, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": 1.1365326642990112, "rewards/margins": 3.9022364616394043, "rewards/rejected": -2.7657036781311035, "step": 272 }, { "epoch": 1.2133333333333334, "grad_norm": 3.8542513847351074, "learning_rate": 1.8366811213437092e-05, "logits/chosen": 2.036423921585083, "logits/rejected": 2.026139974594116, "logps/chosen": -301.99395751953125, "logps/rejected": -330.7635498046875, "loss": 0.1548, "rewards/accuracies": 1.0, "rewards/chosen": -1.5693366527557373, "rewards/margins": 1.7907044887542725, "rewards/rejected": -3.3600411415100098, "step": 273 }, { "epoch": 1.2177777777777778, "grad_norm": 0.18561817705631256, "learning_rate": 1.8189650847272037e-05, "logits/chosen": 2.12514066696167, "logits/rejected": 2.1645846366882324, "logps/chosen": -372.3150939941406, "logps/rejected": -337.253173828125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.3940017223358154, "rewards/margins": 5.315286159515381, "rewards/rejected": -8.709287643432617, "step": 274 }, { "epoch": 1.2222222222222223, "grad_norm": 9.204042434692383, "learning_rate": 1.8012859128709766e-05, "logits/chosen": 1.8135225772857666, "logits/rejected": 1.8564603328704834, "logps/chosen": -192.2001953125, "logps/rejected": -243.14703369140625, "loss": 0.5688, "rewards/accuracies": 0.5, "rewards/chosen": -0.7113640308380127, "rewards/margins": 0.32638704776763916, "rewards/rejected": -1.0377510786056519, "step": 275 }, { "epoch": 1.2266666666666666, "grad_norm": 0.10035301744937897, "learning_rate": 1.783644562757436e-05, "logits/chosen": 2.467790126800537, "logits/rejected": 2.2855076789855957, "logps/chosen": -284.48785400390625, "logps/rejected": -418.7071533203125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.9228286743164062, "rewards/margins": 6.6048784255981445, "rewards/rejected": -5.682049751281738, "step": 276 }, { "epoch": 1.231111111111111, "grad_norm": 0.995993435382843, "learning_rate": 1.7660419893216785e-05, "logits/chosen": 2.3109967708587646, "logits/rejected": 2.2218995094299316, "logps/chosen": -331.196044921875, "logps/rejected": -288.77044677734375, "loss": 0.0384, "rewards/accuracies": 1.0, "rewards/chosen": 0.583350419998169, "rewards/margins": 3.377068519592285, "rewards/rejected": -2.793717861175537, "step": 277 }, { "epoch": 1.2355555555555555, "grad_norm": 0.09387421607971191, "learning_rate": 1.7484791453998006e-05, "logits/chosen": 2.2291605472564697, "logits/rejected": 2.205029249191284, "logps/chosen": -329.593994140625, "logps/rejected": -481.59332275390625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.18573302030563354, "rewards/margins": 7.1214494705200195, "rewards/rejected": -7.307182312011719, "step": 278 }, { "epoch": 1.24, "grad_norm": 1.2010304927825928, "learning_rate": 1.7309569816773193e-05, "logits/chosen": 1.4943195581436157, "logits/rejected": 1.5665395259857178, "logps/chosen": -118.16926574707031, "logps/rejected": -229.98049926757812, "loss": 0.0459, "rewards/accuracies": 1.0, "rewards/chosen": 0.5613498687744141, "rewards/margins": 3.4735331535339355, "rewards/rejected": -2.9121835231781006, "step": 279 }, { "epoch": 1.2444444444444445, "grad_norm": 9.927587509155273, "learning_rate": 1.7134764466377136e-05, "logits/chosen": 2.1885547637939453, "logits/rejected": 2.239022731781006, "logps/chosen": -398.05059814453125, "logps/rejected": -413.83624267578125, "loss": 0.4304, "rewards/accuracies": 1.0, "rewards/chosen": -0.5941482782363892, "rewards/margins": 0.6228576898574829, "rewards/rejected": -1.217005968093872, "step": 280 }, { "epoch": 1.248888888888889, "grad_norm": 5.716561794281006, "learning_rate": 1.69603848651108e-05, "logits/chosen": 1.9646629095077515, "logits/rejected": 1.9000844955444336, "logps/chosen": -269.88067626953125, "logps/rejected": -253.2309112548828, "loss": 0.3016, "rewards/accuracies": 1.0, "rewards/chosen": -2.193873643875122, "rewards/margins": 1.1279609203338623, "rewards/rejected": -3.3218345642089844, "step": 281 }, { "epoch": 1.2533333333333334, "grad_norm": 0.3801816999912262, "learning_rate": 1.6786440452229134e-05, "logits/chosen": 2.2147722244262695, "logits/rejected": 2.1242868900299072, "logps/chosen": -451.5334167480469, "logps/rejected": -462.0749206542969, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -3.0167465209960938, "rewards/margins": 5.644117832183838, "rewards/rejected": -8.660863876342773, "step": 282 }, { "epoch": 1.2577777777777777, "grad_norm": 0.1913336217403412, "learning_rate": 1.6612940643430138e-05, "logits/chosen": 2.109816551208496, "logits/rejected": 2.2030255794525146, "logps/chosen": -255.25289916992188, "logps/rejected": -452.1776123046875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -1.056928277015686, "rewards/margins": 7.02421760559082, "rewards/rejected": -8.081146240234375, "step": 283 }, { "epoch": 1.2622222222222224, "grad_norm": 12.739026069641113, "learning_rate": 1.6439894830345143e-05, "logits/chosen": 1.5907762050628662, "logits/rejected": 1.58561372756958, "logps/chosen": -197.10760498046875, "logps/rejected": -260.8538818359375, "loss": 0.6363, "rewards/accuracies": 0.5, "rewards/chosen": -2.7191286087036133, "rewards/margins": 1.830248236656189, "rewards/rejected": -4.549376487731934, "step": 284 }, { "epoch": 1.2666666666666666, "grad_norm": 1.540250539779663, "learning_rate": 1.6267312380030506e-05, "logits/chosen": 2.0246505737304688, "logits/rejected": 1.983656883239746, "logps/chosen": -283.9218444824219, "logps/rejected": -365.05535888671875, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": -0.7960922122001648, "rewards/margins": 5.497363567352295, "rewards/rejected": -6.293455600738525, "step": 285 }, { "epoch": 1.271111111111111, "grad_norm": 3.0418801307678223, "learning_rate": 1.609520263446049e-05, "logits/chosen": 2.055178165435791, "logits/rejected": 2.165806293487549, "logps/chosen": -276.8818054199219, "logps/rejected": -422.906005859375, "loss": 0.2023, "rewards/accuracies": 1.0, "rewards/chosen": -2.977742910385132, "rewards/margins": 3.4164116382598877, "rewards/rejected": -6.3941545486450195, "step": 286 }, { "epoch": 1.2755555555555556, "grad_norm": 1.8675150871276855, "learning_rate": 1.5923574910021624e-05, "logits/chosen": 1.7561115026474, "logits/rejected": 1.7464289665222168, "logps/chosen": -180.07662963867188, "logps/rejected": -193.69613647460938, "loss": 0.1926, "rewards/accuracies": 1.0, "rewards/chosen": -0.2910804748535156, "rewards/margins": 3.408679246902466, "rewards/rejected": -3.6997597217559814, "step": 287 }, { "epoch": 1.28, "grad_norm": 11.50502872467041, "learning_rate": 1.5752438497008405e-05, "logits/chosen": 1.795478343963623, "logits/rejected": 1.8433971405029297, "logps/chosen": -314.85888671875, "logps/rejected": -392.9442138671875, "loss": 0.4887, "rewards/accuracies": 0.5, "rewards/chosen": -1.7009613513946533, "rewards/margins": 4.940203666687012, "rewards/rejected": -6.641165256500244, "step": 288 }, { "epoch": 1.2844444444444445, "grad_norm": 0.6544823050498962, "learning_rate": 1.558180265912037e-05, "logits/chosen": 2.1740856170654297, "logits/rejected": 2.0556159019470215, "logps/chosen": -291.9125671386719, "logps/rejected": -370.18865966796875, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -1.0329957008361816, "rewards/margins": 4.283731460571289, "rewards/rejected": -5.3167266845703125, "step": 289 }, { "epoch": 1.2888888888888888, "grad_norm": 1.4096264839172363, "learning_rate": 1.5411676632960713e-05, "logits/chosen": 1.9917266368865967, "logits/rejected": 2.0029103755950928, "logps/chosen": -215.18414306640625, "logps/rejected": -224.6077117919922, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": 1.029798150062561, "rewards/margins": 3.0806381702423096, "rewards/rejected": -2.050839900970459, "step": 290 }, { "epoch": 1.2933333333333334, "grad_norm": 3.500157117843628, "learning_rate": 1.5242069627536225e-05, "logits/chosen": 2.0910866260528564, "logits/rejected": 2.094388484954834, "logps/chosen": -268.5745544433594, "logps/rejected": -347.7729797363281, "loss": 0.1554, "rewards/accuracies": 1.0, "rewards/chosen": -0.5284897089004517, "rewards/margins": 1.9156463146209717, "rewards/rejected": -2.444136142730713, "step": 291 }, { "epoch": 1.2977777777777777, "grad_norm": 7.339416027069092, "learning_rate": 1.5072990823758871e-05, "logits/chosen": 2.043335437774658, "logits/rejected": 1.9694390296936035, "logps/chosen": -267.2714538574219, "logps/rejected": -318.1759338378906, "loss": 0.5384, "rewards/accuracies": 0.5, "rewards/chosen": -1.948584794998169, "rewards/margins": 0.3823028802871704, "rewards/rejected": -2.33088755607605, "step": 292 }, { "epoch": 1.3022222222222222, "grad_norm": 2.8766441345214844, "learning_rate": 1.490444937394879e-05, "logits/chosen": 1.6397064924240112, "logits/rejected": 1.58877432346344, "logps/chosen": -206.1599578857422, "logps/rejected": -258.51251220703125, "loss": 0.2138, "rewards/accuracies": 1.0, "rewards/chosen": -0.5400612354278564, "rewards/margins": 1.5494178533554077, "rewards/rejected": -2.0894789695739746, "step": 293 }, { "epoch": 1.3066666666666666, "grad_norm": 0.5003223419189453, "learning_rate": 1.4736454401338872e-05, "logits/chosen": 2.242143154144287, "logits/rejected": 2.218358039855957, "logps/chosen": -492.30926513671875, "logps/rejected": -563.084716796875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -5.345100402832031, "rewards/margins": 8.439008712768555, "rewards/rejected": -13.784109115600586, "step": 294 }, { "epoch": 1.3111111111111111, "grad_norm": 9.198110580444336, "learning_rate": 1.4569014999580937e-05, "logits/chosen": 2.083486795425415, "logits/rejected": 2.0191855430603027, "logps/chosen": -514.994384765625, "logps/rejected": -621.652587890625, "loss": 0.1112, "rewards/accuracies": 1.0, "rewards/chosen": -8.739448547363281, "rewards/margins": 2.7005691528320312, "rewards/rejected": -11.440017700195312, "step": 295 }, { "epoch": 1.3155555555555556, "grad_norm": 18.878862380981445, "learning_rate": 1.4402140232253486e-05, "logits/chosen": 2.4532670974731445, "logits/rejected": 2.3905553817749023, "logps/chosen": -382.56829833984375, "logps/rejected": -470.17926025390625, "loss": 0.5653, "rewards/accuracies": 0.5, "rewards/chosen": -3.797785758972168, "rewards/margins": 4.899234771728516, "rewards/rejected": -8.697020530700684, "step": 296 }, { "epoch": 1.32, "grad_norm": 0.9160619974136353, "learning_rate": 1.4235839132371038e-05, "logits/chosen": 2.1893036365509033, "logits/rejected": 2.259230136871338, "logps/chosen": -349.79779052734375, "logps/rejected": -420.09796142578125, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": -1.3005684614181519, "rewards/margins": 5.341613292694092, "rewards/rejected": -6.642181396484375, "step": 297 }, { "epoch": 1.3244444444444445, "grad_norm": 1.0921589136123657, "learning_rate": 1.407012070189524e-05, "logits/chosen": 1.6390105485916138, "logits/rejected": 1.6804091930389404, "logps/chosen": -323.83782958984375, "logps/rejected": -502.33782958984375, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": -4.742315769195557, "rewards/margins": 5.594854831695557, "rewards/rejected": -10.337170600891113, "step": 298 }, { "epoch": 1.3288888888888888, "grad_norm": 0.014698788523674011, "learning_rate": 1.3904993911247561e-05, "logits/chosen": 2.3741211891174316, "logits/rejected": 2.227184295654297, "logps/chosen": -381.3302307128906, "logps/rejected": -436.189697265625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.0916748046875, "rewards/margins": 8.013049125671387, "rewards/rejected": -7.921374797821045, "step": 299 }, { "epoch": 1.3333333333333333, "grad_norm": 12.252896308898926, "learning_rate": 1.3740467698823662e-05, "logits/chosen": 2.037738800048828, "logits/rejected": 1.979736566543579, "logps/chosen": -299.84228515625, "logps/rejected": -390.47296142578125, "loss": 0.8992, "rewards/accuracies": 0.5, "rewards/chosen": -3.8193671703338623, "rewards/margins": 2.5206613540649414, "rewards/rejected": -6.340028762817383, "step": 300 }, { "epoch": 1.3377777777777777, "grad_norm": 12.054088592529297, "learning_rate": 1.3576550970509666e-05, "logits/chosen": 1.7213351726531982, "logits/rejected": 1.7385635375976562, "logps/chosen": -322.126708984375, "logps/rejected": -534.2659912109375, "loss": 0.7611, "rewards/accuracies": 0.5, "rewards/chosen": -3.1543197631835938, "rewards/margins": 7.169881343841553, "rewards/rejected": -10.324201583862305, "step": 301 }, { "epoch": 1.3422222222222222, "grad_norm": 1.8875739574432373, "learning_rate": 1.341325259919996e-05, "logits/chosen": 2.0664215087890625, "logits/rejected": 2.0522894859313965, "logps/chosen": -221.20150756835938, "logps/rejected": -283.83770751953125, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": -0.671368420124054, "rewards/margins": 2.8214011192321777, "rewards/rejected": -3.492769718170166, "step": 302 }, { "epoch": 1.3466666666666667, "grad_norm": 11.853067398071289, "learning_rate": 1.325058142431701e-05, "logits/chosen": 1.6087274551391602, "logits/rejected": 1.6155368089675903, "logps/chosen": -160.04257202148438, "logps/rejected": -224.98550415039062, "loss": 0.6955, "rewards/accuracies": 0.5, "rewards/chosen": -0.0727493166923523, "rewards/margins": 2.172374725341797, "rewards/rejected": -2.245124101638794, "step": 303 }, { "epoch": 1.3511111111111112, "grad_norm": 2.025550603866577, "learning_rate": 1.3088546251332772e-05, "logits/chosen": 1.9365671873092651, "logits/rejected": 1.9474778175354004, "logps/chosen": -421.8943786621094, "logps/rejected": -382.8135070800781, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": -1.756591796875, "rewards/margins": 5.394895076751709, "rewards/rejected": -7.151486873626709, "step": 304 }, { "epoch": 1.3555555555555556, "grad_norm": 0.24477213621139526, "learning_rate": 1.2927155851292145e-05, "logits/chosen": 1.8736495971679688, "logits/rejected": 1.8653249740600586, "logps/chosen": -244.5578155517578, "logps/rejected": -318.7236328125, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -1.7046654224395752, "rewards/margins": 4.947877883911133, "rewards/rejected": -6.652543067932129, "step": 305 }, { "epoch": 1.3599999999999999, "grad_norm": 53.01292037963867, "learning_rate": 1.2766418960338128e-05, "logits/chosen": 2.017364025115967, "logits/rejected": 1.9819977283477783, "logps/chosen": -376.64581298828125, "logps/rejected": -330.441162109375, "loss": 2.673, "rewards/accuracies": 0.0, "rewards/chosen": -9.418815612792969, "rewards/margins": -2.5403449535369873, "rewards/rejected": -6.878470420837402, "step": 306 }, { "epoch": 1.3644444444444446, "grad_norm": 6.39654541015625, "learning_rate": 1.260634427923896e-05, "logits/chosen": 1.2513247728347778, "logits/rejected": 1.2842328548431396, "logps/chosen": -137.0913543701172, "logps/rejected": -154.89610290527344, "loss": 0.3182, "rewards/accuracies": 1.0, "rewards/chosen": -0.5993289947509766, "rewards/margins": 2.3029263019561768, "rewards/rejected": -2.9022552967071533, "step": 307 }, { "epoch": 1.3688888888888888, "grad_norm": 0.30323663353919983, "learning_rate": 1.2446940472917099e-05, "logits/chosen": 2.068608283996582, "logits/rejected": 2.0424392223358154, "logps/chosen": -323.708740234375, "logps/rejected": -347.22998046875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -2.0544846057891846, "rewards/margins": 4.629805088043213, "rewards/rejected": -6.684289932250977, "step": 308 }, { "epoch": 1.3733333333333333, "grad_norm": 1.1456687450408936, "learning_rate": 1.2288216169980243e-05, "logits/chosen": 1.7509403228759766, "logits/rejected": 1.8240234851837158, "logps/chosen": -181.7119903564453, "logps/rejected": -259.93548583984375, "loss": 0.067, "rewards/accuracies": 1.0, "rewards/chosen": -1.8926734924316406, "rewards/margins": 3.159923553466797, "rewards/rejected": -5.0525970458984375, "step": 309 }, { "epoch": 1.3777777777777778, "grad_norm": 22.11775779724121, "learning_rate": 1.213017996225424e-05, "logits/chosen": 1.9033875465393066, "logits/rejected": 1.7332472801208496, "logps/chosen": -356.8746337890625, "logps/rejected": -316.99420166015625, "loss": 1.4625, "rewards/accuracies": 0.5, "rewards/chosen": -4.758178234100342, "rewards/margins": 3.5830559730529785, "rewards/rejected": -8.34123420715332, "step": 310 }, { "epoch": 1.3822222222222222, "grad_norm": 7.521746635437012, "learning_rate": 1.1972840404317961e-05, "logits/chosen": 2.111452102661133, "logits/rejected": 1.976496934890747, "logps/chosen": -428.1273193359375, "logps/rejected": -545.6732788085938, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": -10.942473411560059, "rewards/margins": 3.5422022342681885, "rewards/rejected": -14.484675407409668, "step": 311 }, { "epoch": 1.3866666666666667, "grad_norm": 0.11550098657608032, "learning_rate": 1.1816206013040313e-05, "logits/chosen": 1.9739415645599365, "logits/rejected": 1.9305256605148315, "logps/chosen": -288.68017578125, "logps/rejected": -353.60418701171875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.820286512374878, "rewards/margins": 5.378190040588379, "rewards/rejected": -7.198476791381836, "step": 312 }, { "epoch": 1.3911111111111112, "grad_norm": 19.025943756103516, "learning_rate": 1.1660285267119167e-05, "logits/chosen": 2.469484806060791, "logits/rejected": 2.470264196395874, "logps/chosen": -707.8921508789062, "logps/rejected": -753.548828125, "loss": 0.4269, "rewards/accuracies": 0.5, "rewards/chosen": -6.678134441375732, "rewards/margins": 6.853320121765137, "rewards/rejected": -13.531454086303711, "step": 313 }, { "epoch": 1.3955555555555557, "grad_norm": 5.186333179473877, "learning_rate": 1.150508660662242e-05, "logits/chosen": 1.8456952571868896, "logits/rejected": 1.8528308868408203, "logps/chosen": -483.7690734863281, "logps/rejected": -454.6048583984375, "loss": 0.1185, "rewards/accuracies": 1.0, "rewards/chosen": -7.390696048736572, "rewards/margins": 2.1679461002349854, "rewards/rejected": -9.558642387390137, "step": 314 }, { "epoch": 1.4, "grad_norm": 2.9729480743408203, "learning_rate": 1.1350618432531098e-05, "logits/chosen": 1.75775945186615, "logits/rejected": 1.7443618774414062, "logps/chosen": -313.0162048339844, "logps/rejected": -333.1780090332031, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": -5.031980991363525, "rewards/margins": 3.3457300662994385, "rewards/rejected": -8.377711296081543, "step": 315 }, { "epoch": 1.4, "eval_logits/chosen": 2.042860269546509, "eval_logits/rejected": 1.9980798959732056, "eval_logps/chosen": -329.1581726074219, "eval_logps/rejected": -387.9140930175781, "eval_loss": 0.446563184261322, "eval_rewards/accuracies": 0.8035714030265808, "eval_rewards/chosen": -3.886552572250366, "eval_rewards/margins": 3.3231773376464844, "eval_rewards/rejected": -7.2097296714782715, "eval_runtime": 17.4013, "eval_samples_per_second": 2.873, "eval_steps_per_second": 0.402, "step": 315 }, { "epoch": 1.4044444444444444, "grad_norm": 11.154594421386719, "learning_rate": 1.1196889106284669e-05, "logits/chosen": 1.755511999130249, "logits/rejected": 1.762006163597107, "logps/chosen": -291.4748229980469, "logps/rejected": -280.0130615234375, "loss": 0.4381, "rewards/accuracies": 0.5, "rewards/chosen": -5.212803840637207, "rewards/margins": 3.228024959564209, "rewards/rejected": -8.440828323364258, "step": 316 }, { "epoch": 1.4088888888888889, "grad_norm": 1.097583532333374, "learning_rate": 1.1043906949328387e-05, "logits/chosen": 1.9886606931686401, "logits/rejected": 1.96701979637146, "logps/chosen": -252.5841827392578, "logps/rejected": -349.20745849609375, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/chosen": -2.4393486976623535, "rewards/margins": 3.46529221534729, "rewards/rejected": -5.904641151428223, "step": 317 }, { "epoch": 1.4133333333333333, "grad_norm": 0.9752767086029053, "learning_rate": 1.0891680242662835e-05, "logits/chosen": 2.0138909816741943, "logits/rejected": 1.918421983718872, "logps/chosen": -300.9414978027344, "logps/rejected": -353.33404541015625, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": -5.531005859375, "rewards/margins": 3.710988759994507, "rewards/rejected": -9.241994857788086, "step": 318 }, { "epoch": 1.4177777777777778, "grad_norm": 16.44363784790039, "learning_rate": 1.0740217226395724e-05, "logits/chosen": 2.0399329662323, "logits/rejected": 1.9319369792938232, "logps/chosen": -391.0609130859375, "logps/rejected": -363.857177734375, "loss": 0.4585, "rewards/accuracies": 0.5, "rewards/chosen": -4.746470928192139, "rewards/margins": 1.3634958267211914, "rewards/rejected": -6.10996675491333, "step": 319 }, { "epoch": 1.4222222222222223, "grad_norm": 33.33434295654297, "learning_rate": 1.0589526099295816e-05, "logits/chosen": 2.1048226356506348, "logits/rejected": 1.9768327474594116, "logps/chosen": -581.8522338867188, "logps/rejected": -463.3633728027344, "loss": 2.6066, "rewards/accuracies": 0.5, "rewards/chosen": -9.7820405960083, "rewards/margins": -0.41506481170654297, "rewards/rejected": -9.366975784301758, "step": 320 }, { "epoch": 1.4266666666666667, "grad_norm": 0.01582302711904049, "learning_rate": 1.0439615018349109e-05, "logits/chosen": 1.9011285305023193, "logits/rejected": 1.9159646034240723, "logps/chosen": -358.68157958984375, "logps/rejected": -534.3988037109375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.9317246675491333, "rewards/margins": 9.084466934204102, "rewards/rejected": -11.016191482543945, "step": 321 }, { "epoch": 1.431111111111111, "grad_norm": 6.929481506347656, "learning_rate": 1.029049209831733e-05, "logits/chosen": 1.9835437536239624, "logits/rejected": 1.9601792097091675, "logps/chosen": -285.4648132324219, "logps/rejected": -420.031005859375, "loss": 0.3011, "rewards/accuracies": 1.0, "rewards/chosen": -1.0451195240020752, "rewards/margins": 8.181748390197754, "rewards/rejected": -9.22686767578125, "step": 322 }, { "epoch": 1.4355555555555555, "grad_norm": 12.37388801574707, "learning_rate": 1.0142165411298662e-05, "logits/chosen": 2.131269693374634, "logits/rejected": 2.1731009483337402, "logps/chosen": -299.0472717285156, "logps/rejected": -404.6293029785156, "loss": 0.3707, "rewards/accuracies": 0.5, "rewards/chosen": -1.4244904518127441, "rewards/margins": 3.208432197570801, "rewards/rejected": -4.632922649383545, "step": 323 }, { "epoch": 1.44, "grad_norm": 0.8624144196510315, "learning_rate": 9.994642986290797e-06, "logits/chosen": 2.057706832885742, "logits/rejected": 2.074605941772461, "logps/chosen": -324.3763427734375, "logps/rejected": -441.5903015136719, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -3.426907539367676, "rewards/margins": 5.264835834503174, "rewards/rejected": -8.691743850708008, "step": 324 }, { "epoch": 1.4444444444444444, "grad_norm": 5.4227118492126465, "learning_rate": 9.847932808756308e-06, "logits/chosen": 2.1998391151428223, "logits/rejected": 2.201568126678467, "logps/chosen": -312.435791015625, "logps/rejected": -431.78179931640625, "loss": 0.1895, "rewards/accuracies": 1.0, "rewards/chosen": -1.3775376081466675, "rewards/margins": 3.8356902599334717, "rewards/rejected": -5.213228225708008, "step": 325 }, { "epoch": 1.448888888888889, "grad_norm": 2.2142040729522705, "learning_rate": 9.702042820190415e-06, "logits/chosen": 1.5558602809906006, "logits/rejected": 1.7116918563842773, "logps/chosen": -197.79367065429688, "logps/rejected": -246.2130126953125, "loss": 0.0913, "rewards/accuracies": 1.0, "rewards/chosen": -1.3813637495040894, "rewards/margins": 2.347956418991089, "rewards/rejected": -3.7293200492858887, "step": 326 }, { "epoch": 1.4533333333333334, "grad_norm": 8.604325294494629, "learning_rate": 9.556980917691116e-06, "logits/chosen": 1.6613447666168213, "logits/rejected": 1.7617850303649902, "logps/chosen": -347.57830810546875, "logps/rejected": -387.5290222167969, "loss": 0.4048, "rewards/accuracies": 1.0, "rewards/chosen": -3.3771653175354004, "rewards/margins": 0.8042678833007812, "rewards/rejected": -4.181433200836182, "step": 327 }, { "epoch": 1.4577777777777778, "grad_norm": 0.06135905534029007, "learning_rate": 9.412754953531663e-06, "logits/chosen": 2.1208975315093994, "logits/rejected": 1.9472355842590332, "logps/chosen": -416.2113342285156, "logps/rejected": -469.04803466796875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -3.4611542224884033, "rewards/margins": 6.7728729248046875, "rewards/rejected": -10.234027862548828, "step": 328 }, { "epoch": 1.462222222222222, "grad_norm": 4.409871578216553, "learning_rate": 9.269372734735577e-06, "logits/chosen": 1.9324915409088135, "logits/rejected": 1.8667106628417969, "logps/chosen": -224.12960815429688, "logps/rejected": -257.76385498046875, "loss": 0.1307, "rewards/accuracies": 1.0, "rewards/chosen": -4.500388145446777, "rewards/margins": 2.151371955871582, "rewards/rejected": -6.651760101318359, "step": 329 }, { "epoch": 1.4666666666666668, "grad_norm": 40.46012496948242, "learning_rate": 9.126842022654003e-06, "logits/chosen": 2.013392686843872, "logits/rejected": 2.085439682006836, "logps/chosen": -343.09381103515625, "logps/rejected": -382.8746337890625, "loss": 1.476, "rewards/accuracies": 0.5, "rewards/chosen": -7.1528754234313965, "rewards/margins": 1.4587280750274658, "rewards/rejected": -8.611603736877441, "step": 330 }, { "epoch": 1.471111111111111, "grad_norm": 20.39794158935547, "learning_rate": 8.985170532545622e-06, "logits/chosen": 2.2019968032836914, "logits/rejected": 2.255478858947754, "logps/chosen": -461.25250244140625, "logps/rejected": -522.5560302734375, "loss": 0.6261, "rewards/accuracies": 0.5, "rewards/chosen": -7.226644515991211, "rewards/margins": 1.7861032485961914, "rewards/rejected": -9.012747764587402, "step": 331 }, { "epoch": 1.4755555555555555, "grad_norm": 0.8205786347389221, "learning_rate": 8.844365933158973e-06, "logits/chosen": 2.0666050910949707, "logits/rejected": 2.1347484588623047, "logps/chosen": -518.1484375, "logps/rejected": -621.5133666992188, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -6.620497703552246, "rewards/margins": 7.697022914886475, "rewards/rejected": -14.317520141601562, "step": 332 }, { "epoch": 1.48, "grad_norm": 2.1303653717041016, "learning_rate": 8.704435846317386e-06, "logits/chosen": 1.9880008697509766, "logits/rejected": 2.010342836380005, "logps/chosen": -331.64874267578125, "logps/rejected": -382.2865905761719, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": -1.6854767799377441, "rewards/margins": 3.8334319591522217, "rewards/rejected": -5.518908500671387, "step": 333 }, { "epoch": 1.4844444444444445, "grad_norm": 5.076652526855469, "learning_rate": 8.565387846506395e-06, "logits/chosen": 1.8777854442596436, "logits/rejected": 1.8597569465637207, "logps/chosen": -285.7859191894531, "logps/rejected": -302.11083984375, "loss": 0.2184, "rewards/accuracies": 1.0, "rewards/chosen": -4.237748146057129, "rewards/margins": 1.4745651483535767, "rewards/rejected": -5.712313175201416, "step": 334 }, { "epoch": 1.488888888888889, "grad_norm": 1.7670209407806396, "learning_rate": 8.427229460463696e-06, "logits/chosen": 2.0296010971069336, "logits/rejected": 2.046407461166382, "logps/chosen": -446.6829528808594, "logps/rejected": -450.63531494140625, "loss": 0.0681, "rewards/accuracies": 1.0, "rewards/chosen": -2.3315367698669434, "rewards/margins": 3.1756088733673096, "rewards/rejected": -5.507145881652832, "step": 335 }, { "epoch": 1.4933333333333334, "grad_norm": 7.628573894500732, "learning_rate": 8.28996816677177e-06, "logits/chosen": 1.8448824882507324, "logits/rejected": 1.8389427661895752, "logps/chosen": -414.1676025390625, "logps/rejected": -435.0848083496094, "loss": 0.1649, "rewards/accuracies": 1.0, "rewards/chosen": -5.346246242523193, "rewards/margins": 4.283938407897949, "rewards/rejected": -9.6301851272583, "step": 336 }, { "epoch": 1.4977777777777779, "grad_norm": 5.303137302398682, "learning_rate": 8.153611395453045e-06, "logits/chosen": 1.9505963325500488, "logits/rejected": 1.9452285766601562, "logps/chosen": -294.60015869140625, "logps/rejected": -381.62030029296875, "loss": 0.2634, "rewards/accuracies": 1.0, "rewards/chosen": -2.9013266563415527, "rewards/margins": 1.7531030178070068, "rewards/rejected": -4.654429912567139, "step": 337 }, { "epoch": 1.5022222222222221, "grad_norm": 11.043461799621582, "learning_rate": 8.018166527567672e-06, "logits/chosen": 2.0403127670288086, "logits/rejected": 1.9792909622192383, "logps/chosen": -424.71881103515625, "logps/rejected": -502.0084228515625, "loss": 0.4432, "rewards/accuracies": 0.5, "rewards/chosen": -7.235692024230957, "rewards/margins": 4.200654983520508, "rewards/rejected": -11.436347007751465, "step": 338 }, { "epoch": 1.5066666666666668, "grad_norm": 0.34782156348228455, "learning_rate": 7.883640894814043e-06, "logits/chosen": 2.0710644721984863, "logits/rejected": 2.027409553527832, "logps/chosen": -235.57655334472656, "logps/rejected": -296.014892578125, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": -0.5666137337684631, "rewards/margins": 5.824798107147217, "rewards/rejected": -6.391411781311035, "step": 339 }, { "epoch": 1.511111111111111, "grad_norm": 3.823324203491211, "learning_rate": 7.75004177913188e-06, "logits/chosen": 2.1038994789123535, "logits/rejected": 2.085219383239746, "logps/chosen": -384.879638671875, "logps/rejected": -339.5116882324219, "loss": 0.074, "rewards/accuracies": 1.0, "rewards/chosen": -2.644735813140869, "rewards/margins": 4.097072601318359, "rewards/rejected": -6.7418084144592285, "step": 340 }, { "epoch": 1.5155555555555555, "grad_norm": 0.00015221821377053857, "learning_rate": 7.617376412308083e-06, "logits/chosen": 2.0240237712860107, "logits/rejected": 1.9871121644973755, "logps/chosen": -351.7294616699219, "logps/rejected": -592.8782958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.147392749786377, "rewards/margins": 13.006977081298828, "rewards/rejected": -15.154369354248047, "step": 341 }, { "epoch": 1.52, "grad_norm": 1.3263826370239258, "learning_rate": 7.485651975585236e-06, "logits/chosen": 1.8890652656555176, "logits/rejected": 1.8711776733398438, "logps/chosen": -324.95245361328125, "logps/rejected": -423.5935974121094, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": -6.109809875488281, "rewards/margins": 5.546566009521484, "rewards/rejected": -11.656375885009766, "step": 342 }, { "epoch": 1.5244444444444445, "grad_norm": 4.467953205108643, "learning_rate": 7.354875599272928e-06, "logits/chosen": 1.321131944656372, "logits/rejected": 1.298929214477539, "logps/chosen": -137.98204040527344, "logps/rejected": -102.74874877929688, "loss": 0.3132, "rewards/accuracies": 1.0, "rewards/chosen": -1.748739242553711, "rewards/margins": 1.1912882328033447, "rewards/rejected": -2.9400274753570557, "step": 343 }, { "epoch": 1.528888888888889, "grad_norm": 2.1642391681671143, "learning_rate": 7.2250543623617685e-06, "logits/chosen": 2.2403130531311035, "logits/rejected": 2.209939956665039, "logps/chosen": -361.478759765625, "logps/rejected": -535.7804565429688, "loss": 0.0569, "rewards/accuracies": 1.0, "rewards/chosen": -2.326800584793091, "rewards/margins": 6.651968479156494, "rewards/rejected": -8.978769302368164, "step": 344 }, { "epoch": 1.5333333333333332, "grad_norm": 0.018650932237505913, "learning_rate": 7.096195292140173e-06, "logits/chosen": 1.889040470123291, "logits/rejected": 1.954929232597351, "logps/chosen": -331.3338623046875, "logps/rejected": -661.5016479492188, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.6049957275390625, "rewards/margins": 11.392202377319336, "rewards/rejected": -13.997198104858398, "step": 345 }, { "epoch": 1.537777777777778, "grad_norm": 0.07977497577667236, "learning_rate": 6.968305363814001e-06, "logits/chosen": 2.1549904346466064, "logits/rejected": 2.1628024578094482, "logps/chosen": -401.5853271484375, "logps/rejected": -551.8875732421875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -4.6251935958862305, "rewards/margins": 7.631624221801758, "rewards/rejected": -12.256817817687988, "step": 346 }, { "epoch": 1.5422222222222222, "grad_norm": 9.609312057495117, "learning_rate": 6.841391500128982e-06, "logits/chosen": 1.9556026458740234, "logits/rejected": 2.000077247619629, "logps/chosen": -265.30108642578125, "logps/rejected": -381.78399658203125, "loss": 0.1607, "rewards/accuracies": 1.0, "rewards/chosen": -4.087409973144531, "rewards/margins": 1.7995681762695312, "rewards/rejected": -5.8869781494140625, "step": 347 }, { "epoch": 1.5466666666666666, "grad_norm": 0.00032588234171271324, "learning_rate": 6.715460570995988e-06, "logits/chosen": 2.1855061054229736, "logits/rejected": 2.1387851238250732, "logps/chosen": -434.9403076171875, "logps/rejected": -738.8302612304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.6018309593200684, "rewards/margins": 11.849609375, "rewards/rejected": -14.451440811157227, "step": 348 }, { "epoch": 1.551111111111111, "grad_norm": 1.5105094909667969, "learning_rate": 6.5905193931191235e-06, "logits/chosen": 2.143610715866089, "logits/rejected": 2.2100303173065186, "logps/chosen": -380.15618896484375, "logps/rejected": -462.9521789550781, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": -1.9106452465057373, "rewards/margins": 4.415860176086426, "rewards/rejected": -6.326505661010742, "step": 349 }, { "epoch": 1.5555555555555556, "grad_norm": 0.15542344748973846, "learning_rate": 6.46657472962679e-06, "logits/chosen": 2.022047758102417, "logits/rejected": 1.9033942222595215, "logps/chosen": -345.0733642578125, "logps/rejected": -438.015380859375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.09320831298828125, "rewards/margins": 8.476028442382812, "rewards/rejected": -8.569236755371094, "step": 350 }, { "epoch": 1.56, "grad_norm": 3.5860283374786377, "learning_rate": 6.343633289705555e-06, "logits/chosen": 1.9373621940612793, "logits/rejected": 1.8331228494644165, "logps/chosen": -333.8984069824219, "logps/rejected": -243.1856231689453, "loss": 0.0951, "rewards/accuracies": 1.0, "rewards/chosen": -3.346085548400879, "rewards/margins": 2.6990599632263184, "rewards/rejected": -6.045145511627197, "step": 351 }, { "epoch": 1.5644444444444443, "grad_norm": 1.0526074171066284, "learning_rate": 6.221701728237009e-06, "logits/chosen": 1.999690055847168, "logits/rejected": 2.0342698097229004, "logps/chosen": -347.6802978515625, "logps/rejected": -321.7969970703125, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -4.819159507751465, "rewards/margins": 3.3115005493164062, "rewards/rejected": -8.130660057067871, "step": 352 }, { "epoch": 1.568888888888889, "grad_norm": 0.5887879133224487, "learning_rate": 6.100786645437481e-06, "logits/chosen": 1.1116806268692017, "logits/rejected": 1.1577059030532837, "logps/chosen": -76.88727569580078, "logps/rejected": -119.89225006103516, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": 0.09421806037425995, "rewards/margins": 2.9204256534576416, "rewards/rejected": -2.8262076377868652, "step": 353 }, { "epoch": 1.5733333333333333, "grad_norm": 0.1103566437959671, "learning_rate": 5.980894586500841e-06, "logits/chosen": 2.108466625213623, "logits/rejected": 2.086857795715332, "logps/chosen": -387.1605224609375, "logps/rejected": -562.6422119140625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -5.905645847320557, "rewards/margins": 7.762504577636719, "rewards/rejected": -13.668149948120117, "step": 354 }, { "epoch": 1.5777777777777777, "grad_norm": 8.797289848327637, "learning_rate": 5.8620320412441475e-06, "logits/chosen": 1.8724584579467773, "logits/rejected": 1.9823896884918213, "logps/chosen": -301.8065185546875, "logps/rejected": -380.43853759765625, "loss": 0.4206, "rewards/accuracies": 0.5, "rewards/chosen": -2.267813205718994, "rewards/margins": 1.2750152349472046, "rewards/rejected": -3.542828321456909, "step": 355 }, { "epoch": 1.5822222222222222, "grad_norm": 6.784573078155518, "learning_rate": 5.744205443756364e-06, "logits/chosen": 1.9750076532363892, "logits/rejected": 2.0933682918548584, "logps/chosen": -423.0281677246094, "logps/rejected": -526.6314697265625, "loss": 0.2207, "rewards/accuracies": 1.0, "rewards/chosen": -7.2557196617126465, "rewards/margins": 1.4171913862228394, "rewards/rejected": -8.672910690307617, "step": 356 }, { "epoch": 1.5866666666666667, "grad_norm": 2.033442258834839, "learning_rate": 5.627421172050096e-06, "logits/chosen": 1.7873187065124512, "logits/rejected": 1.7788472175598145, "logps/chosen": -231.81149291992188, "logps/rejected": -276.6201171875, "loss": 0.179, "rewards/accuracies": 1.0, "rewards/chosen": -1.4262077808380127, "rewards/margins": 4.1126708984375, "rewards/rejected": -5.538878440856934, "step": 357 }, { "epoch": 1.5911111111111111, "grad_norm": 0.03710145130753517, "learning_rate": 5.511685547716328e-06, "logits/chosen": 2.091726064682007, "logits/rejected": 2.0658438205718994, "logps/chosen": -469.0487060546875, "logps/rejected": -553.4110107421875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.09385085105896, "rewards/margins": 8.56124496459961, "rewards/rejected": -11.655096054077148, "step": 358 }, { "epoch": 1.5955555555555554, "grad_norm": 0.9721232056617737, "learning_rate": 5.397004835582242e-06, "logits/chosen": 2.050297498703003, "logits/rejected": 2.0740513801574707, "logps/chosen": -394.7935485839844, "logps/rejected": -656.4100341796875, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -3.355221748352051, "rewards/margins": 11.149545669555664, "rewards/rejected": -14.504767417907715, "step": 359 }, { "epoch": 1.6, "grad_norm": 0.1482057273387909, "learning_rate": 5.2833852433720855e-06, "logits/chosen": 2.2320728302001953, "logits/rejected": 2.227717876434326, "logps/chosen": -449.0965881347656, "logps/rejected": -448.2038879394531, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -6.938412666320801, "rewards/margins": 5.938919544219971, "rewards/rejected": -12.87733268737793, "step": 360 }, { "epoch": 1.6, "eval_logits/chosen": 2.020325183868408, "eval_logits/rejected": 1.97914719581604, "eval_logps/chosen": -332.6748352050781, "eval_logps/rejected": -392.21588134765625, "eval_loss": 0.41243118047714233, "eval_rewards/accuracies": 0.8035714030265808, "eval_rewards/chosen": -4.238221645355225, "eval_rewards/margins": 3.401686429977417, "eval_rewards/rejected": -7.639908313751221, "eval_runtime": 17.4022, "eval_samples_per_second": 2.873, "eval_steps_per_second": 0.402, "step": 360 }, { "epoch": 1.6044444444444443, "grad_norm": 26.702436447143555, "learning_rate": 5.170832921371163e-06, "logits/chosen": 2.1601366996765137, "logits/rejected": 2.0925962924957275, "logps/chosen": -737.0338134765625, "logps/rejected": -734.032958984375, "loss": 0.6978, "rewards/accuracies": 0.5, "rewards/chosen": -11.603326797485352, "rewards/margins": 4.569097995758057, "rewards/rejected": -16.17242431640625, "step": 361 }, { "epoch": 1.608888888888889, "grad_norm": 4.281423091888428, "learning_rate": 5.059353962092917e-06, "logits/chosen": 1.8992071151733398, "logits/rejected": 1.9108917713165283, "logps/chosen": -202.74957275390625, "logps/rejected": -191.141357421875, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": -0.23720017075538635, "rewards/margins": 1.9029862880706787, "rewards/rejected": -2.140186309814453, "step": 362 }, { "epoch": 1.6133333333333333, "grad_norm": 0.12904126942157745, "learning_rate": 4.9489543999491045e-06, "logits/chosen": 2.1836905479431152, "logits/rejected": 2.0876262187957764, "logps/chosen": -383.00787353515625, "logps/rejected": -463.5958251953125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.0018112175166606903, "rewards/margins": 6.877252578735352, "rewards/rejected": -6.879063606262207, "step": 363 }, { "epoch": 1.6177777777777778, "grad_norm": 0.3242693245410919, "learning_rate": 4.839640210923197e-06, "logits/chosen": 1.981348991394043, "logits/rejected": 1.8337197303771973, "logps/chosen": -231.2086944580078, "logps/rejected": -257.2821044921875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -1.9566452503204346, "rewards/margins": 4.5026397705078125, "rewards/rejected": -6.459284782409668, "step": 364 }, { "epoch": 1.6222222222222222, "grad_norm": 0.7935138940811157, "learning_rate": 4.731417312246877e-06, "logits/chosen": 1.637596607208252, "logits/rejected": 1.6671159267425537, "logps/chosen": -181.33856201171875, "logps/rejected": -310.20806884765625, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": -1.1557204723358154, "rewards/margins": 6.343451023101807, "rewards/rejected": -7.499171257019043, "step": 365 }, { "epoch": 1.6266666666666667, "grad_norm": 0.17146991193294525, "learning_rate": 4.624291562079719e-06, "logits/chosen": 1.5095144510269165, "logits/rejected": 1.552412748336792, "logps/chosen": -283.5166015625, "logps/rejected": -318.7493591308594, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -3.3418242931365967, "rewards/margins": 5.029508590698242, "rewards/rejected": -8.371332168579102, "step": 366 }, { "epoch": 1.6311111111111112, "grad_norm": 10.096573829650879, "learning_rate": 4.518268759192115e-06, "logits/chosen": 2.28369402885437, "logits/rejected": 2.316972255706787, "logps/chosen": -435.752685546875, "logps/rejected": -479.5185241699219, "loss": 0.1262, "rewards/accuracies": 1.0, "rewards/chosen": -5.010312080383301, "rewards/margins": 4.6889142990112305, "rewards/rejected": -9.699226379394531, "step": 367 }, { "epoch": 1.6355555555555554, "grad_norm": 0.13205423951148987, "learning_rate": 4.413354642651369e-06, "logits/chosen": 2.1447973251342773, "logits/rejected": 2.208026885986328, "logps/chosen": -333.09185791015625, "logps/rejected": -571.9179077148438, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -2.114415168762207, "rewards/margins": 6.399701118469238, "rewards/rejected": -8.514116287231445, "step": 368 }, { "epoch": 1.6400000000000001, "grad_norm": 1.1356106996536255, "learning_rate": 4.309554891511036e-06, "logits/chosen": 2.151458740234375, "logits/rejected": 2.0680348873138428, "logps/chosen": -406.594970703125, "logps/rejected": -577.03662109375, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": -1.874485731124878, "rewards/margins": 9.36522102355957, "rewards/rejected": -11.239706039428711, "step": 369 }, { "epoch": 1.6444444444444444, "grad_norm": 0.09853781759738922, "learning_rate": 4.206875124503506e-06, "logits/chosen": 2.1071839332580566, "logits/rejected": 2.133695602416992, "logps/chosen": -299.2562255859375, "logps/rejected": -481.5537109375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.580816745758057, "rewards/margins": 6.295032024383545, "rewards/rejected": -10.875848770141602, "step": 370 }, { "epoch": 1.6488888888888888, "grad_norm": 1.1135728359222412, "learning_rate": 4.105320899735882e-06, "logits/chosen": 1.5641443729400635, "logits/rejected": 1.5768111944198608, "logps/chosen": -173.53851318359375, "logps/rejected": -211.27960205078125, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": -0.20349428057670593, "rewards/margins": 2.9661078453063965, "rewards/rejected": -3.169602155685425, "step": 371 }, { "epoch": 1.6533333333333333, "grad_norm": 0.5490663647651672, "learning_rate": 4.004897714389103e-06, "logits/chosen": 2.0074357986450195, "logits/rejected": 2.0224769115448, "logps/chosen": -344.1773681640625, "logps/rejected": -435.5508728027344, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -3.9361190795898438, "rewards/margins": 5.090202331542969, "rewards/rejected": -9.026321411132812, "step": 372 }, { "epoch": 1.6577777777777778, "grad_norm": 1.7369130849838257, "learning_rate": 3.90561100442036e-06, "logits/chosen": 1.8831748962402344, "logits/rejected": 1.8279378414154053, "logps/chosen": -236.67889404296875, "logps/rejected": -348.7945251464844, "loss": 0.074, "rewards/accuracies": 1.0, "rewards/chosen": -0.25758588314056396, "rewards/margins": 6.053953170776367, "rewards/rejected": -6.311539173126221, "step": 373 }, { "epoch": 1.6622222222222223, "grad_norm": 0.015169711783528328, "learning_rate": 3.8074661442688868e-06, "logits/chosen": 2.0551671981811523, "logits/rejected": 1.9679946899414062, "logps/chosen": -285.3905944824219, "logps/rejected": -503.6309814453125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.3399689197540283, "rewards/margins": 8.514545440673828, "rewards/rejected": -9.854513168334961, "step": 374 }, { "epoch": 1.6666666666666665, "grad_norm": 0.20316235721111298, "learning_rate": 3.710468446565005e-06, "logits/chosen": 1.994492769241333, "logits/rejected": 1.9339189529418945, "logps/chosen": -295.31878662109375, "logps/rejected": -363.3540954589844, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -2.135162353515625, "rewards/margins": 4.933624267578125, "rewards/rejected": -7.06878662109375, "step": 375 }, { "epoch": 1.6711111111111112, "grad_norm": 1.6665427684783936, "learning_rate": 3.6146231618425646e-06, "logits/chosen": 1.9451243877410889, "logits/rejected": 2.0122804641723633, "logps/chosen": -426.1136169433594, "logps/rejected": -602.61279296875, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": -1.325921654701233, "rewards/margins": 8.843436241149902, "rewards/rejected": -10.169357299804688, "step": 376 }, { "epoch": 1.6755555555555555, "grad_norm": 76.74038696289062, "learning_rate": 3.5199354782547156e-06, "logits/chosen": 2.1591286659240723, "logits/rejected": 2.029425859451294, "logps/chosen": -350.96746826171875, "logps/rejected": -409.662841796875, "loss": 2.5142, "rewards/accuracies": 0.5, "rewards/chosen": -5.8959503173828125, "rewards/margins": -0.014461994171142578, "rewards/rejected": -5.88148832321167, "step": 377 }, { "epoch": 1.6800000000000002, "grad_norm": 1.8813796043395996, "learning_rate": 3.4264105212930915e-06, "logits/chosen": 1.5345783233642578, "logits/rejected": 1.5260515213012695, "logps/chosen": -137.23216247558594, "logps/rejected": -172.87872314453125, "loss": 0.3194, "rewards/accuracies": 1.0, "rewards/chosen": -0.7019901275634766, "rewards/margins": 2.383763313293457, "rewards/rejected": -3.0857534408569336, "step": 378 }, { "epoch": 1.6844444444444444, "grad_norm": 2.3558857440948486, "learning_rate": 3.3340533535103467e-06, "logits/chosen": 1.783468246459961, "logits/rejected": 1.840031385421753, "logps/chosen": -209.6715087890625, "logps/rejected": -231.9936065673828, "loss": 0.1628, "rewards/accuracies": 1.0, "rewards/chosen": -0.41487425565719604, "rewards/margins": 1.7598159313201904, "rewards/rejected": -2.1746902465820312, "step": 379 }, { "epoch": 1.6888888888888889, "grad_norm": 1.7316559553146362, "learning_rate": 3.2428689742461188e-06, "logits/chosen": 2.061565399169922, "logits/rejected": 2.0696067810058594, "logps/chosen": -303.8612060546875, "logps/rejected": -330.27777099609375, "loss": 0.0797, "rewards/accuracies": 1.0, "rewards/chosen": -1.3209824562072754, "rewards/margins": 5.411049842834473, "rewards/rejected": -6.732032299041748, "step": 380 }, { "epoch": 1.6933333333333334, "grad_norm": 12.0430269241333, "learning_rate": 3.152862319356428e-06, "logits/chosen": 1.936488389968872, "logits/rejected": 1.8861385583877563, "logps/chosen": -475.26220703125, "logps/rejected": -372.1424560546875, "loss": 0.1641, "rewards/accuracies": 1.0, "rewards/chosen": -6.164106845855713, "rewards/margins": 3.2129530906677246, "rewards/rejected": -9.377059936523438, "step": 381 }, { "epoch": 1.6977777777777778, "grad_norm": 0.1999710500240326, "learning_rate": 3.064038260946478e-06, "logits/chosen": 2.0421996116638184, "logits/rejected": 1.8888078927993774, "logps/chosen": -289.0198059082031, "logps/rejected": -385.6482849121094, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -1.0051002502441406, "rewards/margins": 6.449030876159668, "rewards/rejected": -7.454131126403809, "step": 382 }, { "epoch": 1.7022222222222223, "grad_norm": 0.18681201338768005, "learning_rate": 2.9764016071069434e-06, "logits/chosen": 2.0140395164489746, "logits/rejected": 2.0304765701293945, "logps/chosen": -271.6080627441406, "logps/rejected": -379.2652893066406, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -1.2312802076339722, "rewards/margins": 5.010470867156982, "rewards/rejected": -6.241751194000244, "step": 383 }, { "epoch": 1.7066666666666666, "grad_norm": 0.5151910185813904, "learning_rate": 2.8899571016536786e-06, "logits/chosen": 1.9135384559631348, "logits/rejected": 1.8421276807785034, "logps/chosen": -327.75213623046875, "logps/rejected": -383.0606689453125, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -3.634103298187256, "rewards/margins": 5.9485626220703125, "rewards/rejected": -9.582666397094727, "step": 384 }, { "epoch": 1.7111111111111112, "grad_norm": 0.13173969089984894, "learning_rate": 2.8047094238709633e-06, "logits/chosen": 2.236691474914551, "logits/rejected": 2.233146905899048, "logps/chosen": -469.94879150390625, "logps/rejected": -569.8831176757812, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -7.104136943817139, "rewards/margins": 5.658421516418457, "rewards/rejected": -12.762557983398438, "step": 385 }, { "epoch": 1.7155555555555555, "grad_norm": 0.7114121317863464, "learning_rate": 2.720663188258199e-06, "logits/chosen": 1.9220545291900635, "logits/rejected": 1.9399724006652832, "logps/chosen": -412.7835693359375, "logps/rejected": -462.8909606933594, "loss": 0.0238, "rewards/accuracies": 1.0, "rewards/chosen": -4.260728359222412, "rewards/margins": 4.300273895263672, "rewards/rejected": -8.561002731323242, "step": 386 }, { "epoch": 1.72, "grad_norm": 25.617097854614258, "learning_rate": 2.637822944280116e-06, "logits/chosen": 1.6366169452667236, "logits/rejected": 1.6212671995162964, "logps/chosen": -231.51577758789062, "logps/rejected": -197.76779174804688, "loss": 1.1373, "rewards/accuracies": 0.0, "rewards/chosen": -3.7230637073516846, "rewards/margins": -0.7492774724960327, "rewards/rejected": -2.9737863540649414, "step": 387 }, { "epoch": 1.7244444444444444, "grad_norm": 2.152416706085205, "learning_rate": 2.5561931761205082e-06, "logits/chosen": 1.781626582145691, "logits/rejected": 1.814887523651123, "logps/chosen": -261.75830078125, "logps/rejected": -285.1453857421875, "loss": 0.0976, "rewards/accuracies": 1.0, "rewards/chosen": -1.2200870513916016, "rewards/margins": 5.611637115478516, "rewards/rejected": -6.831724166870117, "step": 388 }, { "epoch": 1.728888888888889, "grad_norm": 28.067974090576172, "learning_rate": 2.475778302439524e-06, "logits/chosen": 1.615212321281433, "logits/rejected": 1.6488571166992188, "logps/chosen": -317.0101318359375, "logps/rejected": -197.44625854492188, "loss": 2.2031, "rewards/accuracies": 0.5, "rewards/chosen": -4.956669330596924, "rewards/margins": 0.2624635696411133, "rewards/rejected": -5.219132423400879, "step": 389 }, { "epoch": 1.7333333333333334, "grad_norm": 19.71435546875, "learning_rate": 2.396582676134462e-06, "logits/chosen": 2.0480542182922363, "logits/rejected": 2.0675265789031982, "logps/chosen": -269.1255187988281, "logps/rejected": -295.6541442871094, "loss": 1.3784, "rewards/accuracies": 0.0, "rewards/chosen": -4.372478485107422, "rewards/margins": -0.9686035513877869, "rewards/rejected": -3.4038748741149902, "step": 390 }, { "epoch": 1.7377777777777776, "grad_norm": 3.7129645347595215, "learning_rate": 2.318610584104142e-06, "logits/chosen": 1.7886816263198853, "logits/rejected": 1.657137393951416, "logps/chosen": -400.5063781738281, "logps/rejected": -427.5238037109375, "loss": 0.0983, "rewards/accuracies": 1.0, "rewards/chosen": -4.910498142242432, "rewards/margins": 5.50076961517334, "rewards/rejected": -10.411267280578613, "step": 391 }, { "epoch": 1.7422222222222223, "grad_norm": 16.56305503845215, "learning_rate": 2.241866247016869e-06, "logits/chosen": 2.101799488067627, "logits/rejected": 2.064134120941162, "logps/chosen": -435.5615234375, "logps/rejected": -487.712158203125, "loss": 0.6052, "rewards/accuracies": 0.5, "rewards/chosen": -6.707480430603027, "rewards/margins": 3.5099639892578125, "rewards/rejected": -10.21744441986084, "step": 392 }, { "epoch": 1.7466666666666666, "grad_norm": 0.02198374643921852, "learning_rate": 2.166353819081968e-06, "logits/chosen": 2.1594762802124023, "logits/rejected": 2.2261545658111572, "logps/chosen": -441.47528076171875, "logps/rejected": -560.4810791015625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.149522304534912, "rewards/margins": 8.681150436401367, "rewards/rejected": -11.830673217773438, "step": 393 }, { "epoch": 1.751111111111111, "grad_norm": 14.180363655090332, "learning_rate": 2.092077387824884e-06, "logits/chosen": 2.0479955673217773, "logits/rejected": 1.947251796722412, "logps/chosen": -368.0101623535156, "logps/rejected": -422.68780517578125, "loss": 0.3844, "rewards/accuracies": 0.5, "rewards/chosen": -5.151028633117676, "rewards/margins": 3.9921188354492188, "rewards/rejected": -9.143147468566895, "step": 394 }, { "epoch": 1.7555555555555555, "grad_norm": 0.006763577461242676, "learning_rate": 2.0190409738659653e-06, "logits/chosen": 2.2438273429870605, "logits/rejected": 2.2101495265960693, "logps/chosen": -518.1273803710938, "logps/rejected": -702.2735595703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.715429782867432, "rewards/margins": 12.057371139526367, "rewards/rejected": -16.77280044555664, "step": 395 }, { "epoch": 1.76, "grad_norm": 0.18149635195732117, "learning_rate": 1.9472485307027945e-06, "logits/chosen": 2.1681084632873535, "logits/rejected": 2.203289270401001, "logps/chosen": -347.28582763671875, "logps/rejected": -500.73455810546875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -2.5240983963012695, "rewards/margins": 6.283698081970215, "rewards/rejected": -8.807796478271484, "step": 396 }, { "epoch": 1.7644444444444445, "grad_norm": 4.91969108581543, "learning_rate": 1.876703944496197e-06, "logits/chosen": 1.9020869731903076, "logits/rejected": 1.7357096672058105, "logps/chosen": -352.68695068359375, "logps/rejected": -305.7750244140625, "loss": 0.1342, "rewards/accuracies": 1.0, "rewards/chosen": -2.5846099853515625, "rewards/margins": 4.8691534996032715, "rewards/rejected": -7.453763961791992, "step": 397 }, { "epoch": 1.7688888888888887, "grad_norm": 57.27471923828125, "learning_rate": 1.8074110338598682e-06, "logits/chosen": 2.0281596183776855, "logits/rejected": 1.8878042697906494, "logps/chosen": -631.1502685546875, "logps/rejected": -437.7010498046875, "loss": 3.8594, "rewards/accuracies": 0.5, "rewards/chosen": -13.262107849121094, "rewards/margins": -1.1633968353271484, "rewards/rejected": -12.098711013793945, "step": 398 }, { "epoch": 1.7733333333333334, "grad_norm": 1.8617291450500488, "learning_rate": 1.7393735496536944e-06, "logits/chosen": 2.0407192707061768, "logits/rejected": 1.8599579334259033, "logps/chosen": -491.43707275390625, "logps/rejected": -467.81060791015625, "loss": 0.0171, "rewards/accuracies": 1.0, "rewards/chosen": -8.183530807495117, "rewards/margins": 7.340937614440918, "rewards/rejected": -15.524469375610352, "step": 399 }, { "epoch": 1.7777777777777777, "grad_norm": 2.8930482864379883, "learning_rate": 1.6725951747806918e-06, "logits/chosen": 1.4244745969772339, "logits/rejected": 1.4205752611160278, "logps/chosen": -128.19241333007812, "logps/rejected": -157.7073974609375, "loss": 0.3631, "rewards/accuracies": 0.5, "rewards/chosen": -0.8688427209854126, "rewards/margins": 1.723132848739624, "rewards/rejected": -2.591975450515747, "step": 400 }, { "epoch": 1.7822222222222224, "grad_norm": 3.2958927154541016, "learning_rate": 1.6070795239876618e-06, "logits/chosen": 2.2915682792663574, "logits/rejected": 2.2581777572631836, "logps/chosen": -368.1529541015625, "logps/rejected": -491.930419921875, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": -3.686471700668335, "rewards/margins": 7.626599311828613, "rewards/rejected": -11.313071250915527, "step": 401 }, { "epoch": 1.7866666666666666, "grad_norm": 3.986274480819702, "learning_rate": 1.5428301436695159e-06, "logits/chosen": 1.5831184387207031, "logits/rejected": 1.6049795150756836, "logps/chosen": -165.51271057128906, "logps/rejected": -185.55548095703125, "loss": 0.2804, "rewards/accuracies": 1.0, "rewards/chosen": -0.1501312404870987, "rewards/margins": 1.7175559997558594, "rewards/rejected": -1.8676872253417969, "step": 402 }, { "epoch": 1.791111111111111, "grad_norm": 29.756507873535156, "learning_rate": 1.479850511677322e-06, "logits/chosen": 2.165071487426758, "logits/rejected": 2.0931169986724854, "logps/chosen": -528.3546142578125, "logps/rejected": -583.2621459960938, "loss": 1.384, "rewards/accuracies": 0.5, "rewards/chosen": -6.661299228668213, "rewards/margins": 4.992590427398682, "rewards/rejected": -11.653889656066895, "step": 403 }, { "epoch": 1.7955555555555556, "grad_norm": 6.813007354736328, "learning_rate": 1.4181440371300342e-06, "logits/chosen": 1.9760260581970215, "logits/rejected": 1.9418466091156006, "logps/chosen": -360.95574951171875, "logps/rejected": -394.4306335449219, "loss": 0.1623, "rewards/accuracies": 1.0, "rewards/chosen": -1.3200485706329346, "rewards/margins": 4.02227783203125, "rewards/rejected": -5.3423261642456055, "step": 404 }, { "epoch": 1.8, "grad_norm": 0.347569078207016, "learning_rate": 1.3577140602299448e-06, "logits/chosen": 2.0109634399414062, "logits/rejected": 1.9843730926513672, "logps/chosen": -423.1788330078125, "logps/rejected": -527.6969604492188, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -2.6651382446289062, "rewards/margins": 6.632790565490723, "rewards/rejected": -9.297929763793945, "step": 405 }, { "epoch": 1.8, "eval_logits/chosen": 2.008713722229004, "eval_logits/rejected": 1.9683387279510498, "eval_logps/chosen": -333.8675231933594, "eval_logps/rejected": -395.43695068359375, "eval_loss": 0.38143062591552734, "eval_rewards/accuracies": 0.8214285969734192, "eval_rewards/chosen": -4.357491493225098, "eval_rewards/margins": 3.6045258045196533, "eval_rewards/rejected": -7.96201753616333, "eval_runtime": 17.3982, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.402, "step": 405 } ], "logging_steps": 1, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 45, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }