diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6395 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00026171159382360636, + "grad_norm": 7.5399394035339355, + "learning_rate": 1.3054830287206266e-09, + "logits/chosen": -3.2296347618103027, + "logits/rejected": -3.202975034713745, + "logps/chosen": -402.0491638183594, + "logps/rejected": -447.69073486328125, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0026171159382360636, + "grad_norm": 7.425467491149902, + "learning_rate": 1.3054830287206264e-08, + "logits/chosen": -3.1455202102661133, + "logits/rejected": -3.127438545227051, + "logps/chosen": -350.64984130859375, + "logps/rejected": -302.1429443359375, + "loss": 0.6934, + "rewards/accuracies": 0.4236111044883728, + "rewards/chosen": -0.0005491668125614524, + "rewards/margins": -0.0004519576614256948, + "rewards/rejected": -9.720920934341848e-05, + "step": 10 + }, + { + "epoch": 0.005234231876472127, + "grad_norm": 8.282913208007812, + "learning_rate": 2.610966057441253e-08, + "logits/chosen": -3.162764310836792, + "logits/rejected": -3.1438052654266357, + "logps/chosen": -390.9164123535156, + "logps/rejected": -291.6170654296875, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.00015077728312462568, + "rewards/margins": -0.00010697855759644881, + "rewards/rejected": -4.379871461424045e-05, + "step": 20 + }, + { + "epoch": 0.007851347814708191, + "grad_norm": 7.730243682861328, + "learning_rate": 3.91644908616188e-08, + "logits/chosen": -3.145042657852173, + "logits/rejected": -3.1386446952819824, + "logps/chosen": -333.2342224121094, + "logps/rejected": -318.4365234375, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -9.905405022436753e-05, + "rewards/margins": 8.009998418856412e-05, + "rewards/rejected": -0.00017915402713697404, + "step": 30 + }, + { + "epoch": 0.010468463752944255, + "grad_norm": 6.743426322937012, + "learning_rate": 5.221932114882506e-08, + "logits/chosen": -3.1190810203552246, + "logits/rejected": -3.1290841102600098, + "logps/chosen": -278.45318603515625, + "logps/rejected": -271.45623779296875, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.00019631900067906827, + "rewards/margins": 0.00044984457781538367, + "rewards/rejected": -0.0002535254752729088, + "step": 40 + }, + { + "epoch": 0.01308557969118032, + "grad_norm": 6.665031433105469, + "learning_rate": 6.527415143603133e-08, + "logits/chosen": -3.2044689655303955, + "logits/rejected": -3.1922316551208496, + "logps/chosen": -344.5279235839844, + "logps/rejected": -289.36700439453125, + "loss": 0.6933, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": -0.0005578735726885498, + "rewards/margins": -0.00022401110618375242, + "rewards/rejected": -0.0003338624082971364, + "step": 50 + }, + { + "epoch": 0.015702695629416383, + "grad_norm": 7.2192559242248535, + "learning_rate": 7.83289817232376e-08, + "logits/chosen": -3.1214582920074463, + "logits/rejected": -3.118607759475708, + "logps/chosen": -327.0560607910156, + "logps/rejected": -280.2232971191406, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00027051212964579463, + "rewards/margins": -2.3294798666029237e-05, + "rewards/rejected": -0.0002472173946443945, + "step": 60 + }, + { + "epoch": 0.018319811567652448, + "grad_norm": 7.3564372062683105, + "learning_rate": 9.138381201044386e-08, + "logits/chosen": -3.1655259132385254, + "logits/rejected": -3.1472818851470947, + "logps/chosen": -345.159912109375, + "logps/rejected": -303.17254638671875, + "loss": 0.6933, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00026873586466535926, + "rewards/margins": -0.0002816948399413377, + "rewards/rejected": 1.2959059858985711e-05, + "step": 70 + }, + { + "epoch": 0.02093692750588851, + "grad_norm": 8.323124885559082, + "learning_rate": 1.0443864229765012e-07, + "logits/chosen": -3.0945706367492676, + "logits/rejected": -3.090824842453003, + "logps/chosen": -339.45819091796875, + "logps/rejected": -308.2747497558594, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.0003364879812579602, + "rewards/margins": 5.1764400268439204e-05, + "rewards/rejected": -0.0003882523742504418, + "step": 80 + }, + { + "epoch": 0.023554043444124574, + "grad_norm": 7.6322407722473145, + "learning_rate": 1.174934725848564e-07, + "logits/chosen": -3.1316659450531006, + "logits/rejected": -3.1392831802368164, + "logps/chosen": -320.486572265625, + "logps/rejected": -294.0498352050781, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0007914910092949867, + "rewards/margins": 0.0005054398206993937, + "rewards/rejected": -0.0012969308299943805, + "step": 90 + }, + { + "epoch": 0.02617115938236064, + "grad_norm": 7.107323169708252, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -3.1230645179748535, + "logits/rejected": -3.142228364944458, + "logps/chosen": -323.48577880859375, + "logps/rejected": -288.4997863769531, + "loss": 0.693, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0014953254722058773, + "rewards/margins": 0.00022950551647227257, + "rewards/rejected": -0.0017248311778530478, + "step": 100 + }, + { + "epoch": 0.02617115938236064, + "eval_logits/chosen": -3.136061429977417, + "eval_logits/rejected": -3.1227903366088867, + "eval_logps/chosen": -336.35565185546875, + "eval_logps/rejected": -297.29937744140625, + "eval_loss": 0.6928624510765076, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.0013669482432305813, + "eval_rewards/margins": 0.0005784809472970665, + "eval_rewards/rejected": -0.0019454291323199868, + "eval_runtime": 305.21, + "eval_samples_per_second": 6.553, + "eval_steps_per_second": 0.819, + "step": 100 + }, + { + "epoch": 0.028788275320596704, + "grad_norm": 7.128358364105225, + "learning_rate": 1.4360313315926893e-07, + "logits/chosen": -3.166736125946045, + "logits/rejected": -3.151339054107666, + "logps/chosen": -353.5014343261719, + "logps/rejected": -290.27734375, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.002510789781808853, + "rewards/margins": -3.944762283936143e-06, + "rewards/rejected": -0.0025068449322134256, + "step": 110 + }, + { + "epoch": 0.031405391258832765, + "grad_norm": 6.817880630493164, + "learning_rate": 1.566579634464752e-07, + "logits/chosen": -3.145479917526245, + "logits/rejected": -3.143561601638794, + "logps/chosen": -369.2731628417969, + "logps/rejected": -329.998291015625, + "loss": 0.6926, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0027725263498723507, + "rewards/margins": 0.0011518350802361965, + "rewards/rejected": -0.003924361430108547, + "step": 120 + }, + { + "epoch": 0.03402250719706883, + "grad_norm": 7.847947597503662, + "learning_rate": 1.6971279373368143e-07, + "logits/chosen": -3.109908103942871, + "logits/rejected": -3.0981078147888184, + "logps/chosen": -329.1286926269531, + "logps/rejected": -307.6993713378906, + "loss": 0.692, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00396283995360136, + "rewards/margins": 0.0023776493035256863, + "rewards/rejected": -0.006340488791465759, + "step": 130 + }, + { + "epoch": 0.036639623135304895, + "grad_norm": 8.320524215698242, + "learning_rate": 1.8276762402088773e-07, + "logits/chosen": -3.164989948272705, + "logits/rejected": -3.1148316860198975, + "logps/chosen": -355.7626037597656, + "logps/rejected": -275.7682800292969, + "loss": 0.6924, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.006009591277688742, + "rewards/margins": 0.0014354930026456714, + "rewards/rejected": -0.007445084396749735, + "step": 140 + }, + { + "epoch": 0.03925673907354096, + "grad_norm": 7.911156177520752, + "learning_rate": 1.95822454308094e-07, + "logits/chosen": -3.152989625930786, + "logits/rejected": -3.1499996185302734, + "logps/chosen": -357.94586181640625, + "logps/rejected": -294.5421142578125, + "loss": 0.6918, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.006876949220895767, + "rewards/margins": 0.00266222539357841, + "rewards/rejected": -0.009539174847304821, + "step": 150 + }, + { + "epoch": 0.04187385501177702, + "grad_norm": 8.252726554870605, + "learning_rate": 2.0887728459530023e-07, + "logits/chosen": -3.139591932296753, + "logits/rejected": -3.137519359588623, + "logps/chosen": -324.2300720214844, + "logps/rejected": -310.9124450683594, + "loss": 0.6921, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.012208414264023304, + "rewards/margins": 0.002138238400220871, + "rewards/rejected": -0.01434665359556675, + "step": 160 + }, + { + "epoch": 0.04449097095001309, + "grad_norm": 8.332549095153809, + "learning_rate": 2.2193211488250652e-07, + "logits/chosen": -3.1174330711364746, + "logits/rejected": -3.1148269176483154, + "logps/chosen": -286.5895690917969, + "logps/rejected": -268.8719177246094, + "loss": 0.6918, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.01521300245076418, + "rewards/margins": 0.002711162669584155, + "rewards/rejected": -0.01792416349053383, + "step": 170 + }, + { + "epoch": 0.04710808688824915, + "grad_norm": 6.830652713775635, + "learning_rate": 2.349869451697128e-07, + "logits/chosen": -3.139569044113159, + "logits/rejected": -3.118330955505371, + "logps/chosen": -325.8437805175781, + "logps/rejected": -292.931884765625, + "loss": 0.6919, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.019671550020575523, + "rewards/margins": 0.0024939056020230055, + "rewards/rejected": -0.022165456786751747, + "step": 180 + }, + { + "epoch": 0.04972520282648522, + "grad_norm": 7.171481132507324, + "learning_rate": 2.4804177545691903e-07, + "logits/chosen": -3.167520046234131, + "logits/rejected": -3.1782307624816895, + "logps/chosen": -348.24981689453125, + "logps/rejected": -295.1228942871094, + "loss": 0.6903, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.01866580918431282, + "rewards/margins": 0.0057288832031190395, + "rewards/rejected": -0.024394694715738297, + "step": 190 + }, + { + "epoch": 0.05234231876472128, + "grad_norm": 6.543122291564941, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": -3.1437888145446777, + "logits/rejected": -3.152954578399658, + "logps/chosen": -319.37432861328125, + "logps/rejected": -268.0211181640625, + "loss": 0.6887, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.026566719636321068, + "rewards/margins": 0.009126237593591213, + "rewards/rejected": -0.03569295257329941, + "step": 200 + }, + { + "epoch": 0.05234231876472128, + "eval_logits/chosen": -3.13464617729187, + "eval_logits/rejected": -3.1214706897735596, + "eval_logps/chosen": -339.234130859375, + "eval_logps/rejected": -300.934814453125, + "eval_loss": 0.6891751885414124, + "eval_rewards/accuracies": 0.6159999966621399, + "eval_rewards/chosen": -0.03015170618891716, + "eval_rewards/margins": 0.008148480206727982, + "eval_rewards/rejected": -0.03830018267035484, + "eval_runtime": 305.3758, + "eval_samples_per_second": 6.549, + "eval_steps_per_second": 0.819, + "step": 200 + }, + { + "epoch": 0.05495943470295734, + "grad_norm": 6.885265827178955, + "learning_rate": 2.7415143603133156e-07, + "logits/chosen": -3.1693227291107178, + "logits/rejected": -3.1635937690734863, + "logps/chosen": -338.8418273925781, + "logps/rejected": -288.22625732421875, + "loss": 0.6876, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0318625271320343, + "rewards/margins": 0.01143469475209713, + "rewards/rejected": -0.04329722374677658, + "step": 210 + }, + { + "epoch": 0.05757655064119341, + "grad_norm": 7.088402271270752, + "learning_rate": 2.8720626631853785e-07, + "logits/chosen": -3.113354206085205, + "logits/rejected": -3.1317508220672607, + "logps/chosen": -328.857421875, + "logps/rejected": -287.6562805175781, + "loss": 0.6872, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.04106331616640091, + "rewards/margins": 0.01231370773166418, + "rewards/rejected": -0.053377024829387665, + "step": 220 + }, + { + "epoch": 0.06019366657942947, + "grad_norm": 7.345415115356445, + "learning_rate": 3.002610966057441e-07, + "logits/chosen": -3.194124937057495, + "logits/rejected": -3.1830177307128906, + "logps/chosen": -392.96978759765625, + "logps/rejected": -345.01678466796875, + "loss": 0.6874, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.048357121646404266, + "rewards/margins": 0.011998703703284264, + "rewards/rejected": -0.06035583093762398, + "step": 230 + }, + { + "epoch": 0.06281078251766553, + "grad_norm": 7.639434814453125, + "learning_rate": 3.133159268929504e-07, + "logits/chosen": -3.146155834197998, + "logits/rejected": -3.1524384021759033, + "logps/chosen": -372.24835205078125, + "logps/rejected": -348.30535888671875, + "loss": 0.6877, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06389515846967697, + "rewards/margins": 0.01166953518986702, + "rewards/rejected": -0.07556469738483429, + "step": 240 + }, + { + "epoch": 0.06542789845590159, + "grad_norm": 8.062237739562988, + "learning_rate": 3.263707571801567e-07, + "logits/chosen": -3.1019375324249268, + "logits/rejected": -3.125652313232422, + "logps/chosen": -342.43292236328125, + "logps/rejected": -297.35906982421875, + "loss": 0.6848, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.06669998914003372, + "rewards/margins": 0.017446473240852356, + "rewards/rejected": -0.08414646238088608, + "step": 250 + }, + { + "epoch": 0.06804501439413765, + "grad_norm": 7.048512935638428, + "learning_rate": 3.3942558746736286e-07, + "logits/chosen": -3.1437766551971436, + "logits/rejected": -3.1343367099761963, + "logps/chosen": -357.90447998046875, + "logps/rejected": -309.511962890625, + "loss": 0.6842, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.08034952729940414, + "rewards/margins": 0.018954290077090263, + "rewards/rejected": -0.09930381923913956, + "step": 260 + }, + { + "epoch": 0.07066213033237373, + "grad_norm": 6.4997453689575195, + "learning_rate": 3.5248041775456916e-07, + "logits/chosen": -3.1495137214660645, + "logits/rejected": -3.1338560581207275, + "logps/chosen": -339.3167419433594, + "logps/rejected": -293.10089111328125, + "loss": 0.6833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0888824537396431, + "rewards/margins": 0.02101912908256054, + "rewards/rejected": -0.10990158468484879, + "step": 270 + }, + { + "epoch": 0.07327924627060979, + "grad_norm": 7.67672872543335, + "learning_rate": 3.6553524804177545e-07, + "logits/chosen": -3.147304058074951, + "logits/rejected": -3.116621494293213, + "logps/chosen": -342.96771240234375, + "logps/rejected": -300.9717102050781, + "loss": 0.6834, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.09658826142549515, + "rewards/margins": 0.021063242107629776, + "rewards/rejected": -0.11765149980783463, + "step": 280 + }, + { + "epoch": 0.07589636220884585, + "grad_norm": 7.983977317810059, + "learning_rate": 3.785900783289817e-07, + "logits/chosen": -3.1366944313049316, + "logits/rejected": -3.142000675201416, + "logps/chosen": -365.61944580078125, + "logps/rejected": -324.70709228515625, + "loss": 0.6787, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0843677967786789, + "rewards/margins": 0.031160688027739525, + "rewards/rejected": -0.11552847921848297, + "step": 290 + }, + { + "epoch": 0.07851347814708191, + "grad_norm": 7.798567771911621, + "learning_rate": 3.91644908616188e-07, + "logits/chosen": -3.087780237197876, + "logits/rejected": -3.045769214630127, + "logps/chosen": -329.89642333984375, + "logps/rejected": -295.91046142578125, + "loss": 0.6789, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08805381506681442, + "rewards/margins": 0.030454417690634727, + "rewards/rejected": -0.11850825697183609, + "step": 300 + }, + { + "epoch": 0.07851347814708191, + "eval_logits/chosen": -3.121612548828125, + "eval_logits/rejected": -3.1093947887420654, + "eval_logps/chosen": -344.1050720214844, + "eval_logps/rejected": -307.97979736328125, + "eval_loss": 0.6793686747550964, + "eval_rewards/accuracies": 0.6359999775886536, + "eval_rewards/chosen": -0.07886076718568802, + "eval_rewards/margins": 0.029889002442359924, + "eval_rewards/rejected": -0.10874976962804794, + "eval_runtime": 305.2365, + "eval_samples_per_second": 6.552, + "eval_steps_per_second": 0.819, + "step": 300 + }, + { + "epoch": 0.08113059408531798, + "grad_norm": 8.230989456176758, + "learning_rate": 4.046997389033943e-07, + "logits/chosen": -3.1827919483184814, + "logits/rejected": -3.1549506187438965, + "logps/chosen": -373.86859130859375, + "logps/rejected": -295.2381591796875, + "loss": 0.6715, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06812898814678192, + "rewards/margins": 0.04669738933444023, + "rewards/rejected": -0.11482638120651245, + "step": 310 + }, + { + "epoch": 0.08374771002355404, + "grad_norm": 7.9239606857299805, + "learning_rate": 4.1775456919060046e-07, + "logits/chosen": -3.1656203269958496, + "logits/rejected": -3.14418625831604, + "logps/chosen": -340.9632873535156, + "logps/rejected": -307.0127868652344, + "loss": 0.6775, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0765252560377121, + "rewards/margins": 0.03432226926088333, + "rewards/rejected": -0.11084753274917603, + "step": 320 + }, + { + "epoch": 0.08636482596179011, + "grad_norm": 7.936288356781006, + "learning_rate": 4.3080939947780675e-07, + "logits/chosen": -3.1270527839660645, + "logits/rejected": -3.1329689025878906, + "logps/chosen": -338.80364990234375, + "logps/rejected": -306.50103759765625, + "loss": 0.678, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08645441383123398, + "rewards/margins": 0.03408702462911606, + "rewards/rejected": -0.12054143846035004, + "step": 330 + }, + { + "epoch": 0.08898194190002617, + "grad_norm": 8.688923835754395, + "learning_rate": 4.4386422976501305e-07, + "logits/chosen": -3.165428638458252, + "logits/rejected": -3.1660349369049072, + "logps/chosen": -370.370361328125, + "logps/rejected": -334.38140869140625, + "loss": 0.6741, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.08854931592941284, + "rewards/margins": 0.04290110990405083, + "rewards/rejected": -0.13145044445991516, + "step": 340 + }, + { + "epoch": 0.09159905783826224, + "grad_norm": 7.534700393676758, + "learning_rate": 4.569190600522193e-07, + "logits/chosen": -3.093208074569702, + "logits/rejected": -3.0936062335968018, + "logps/chosen": -376.98089599609375, + "logps/rejected": -351.4696350097656, + "loss": 0.6701, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10356112569570541, + "rewards/margins": 0.052223630249500275, + "rewards/rejected": -0.1557847559452057, + "step": 350 + }, + { + "epoch": 0.0942161737764983, + "grad_norm": 6.478003978729248, + "learning_rate": 4.699738903394256e-07, + "logits/chosen": -3.131310224533081, + "logits/rejected": -3.118582248687744, + "logps/chosen": -317.2442932128906, + "logps/rejected": -290.4730529785156, + "loss": 0.6722, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.12776212394237518, + "rewards/margins": 0.047861333936452866, + "rewards/rejected": -0.17562346160411835, + "step": 360 + }, + { + "epoch": 0.09683328971473436, + "grad_norm": 9.120585441589355, + "learning_rate": 4.830287206266319e-07, + "logits/chosen": -3.106093406677246, + "logits/rejected": -3.0930678844451904, + "logps/chosen": -363.0904846191406, + "logps/rejected": -299.1496276855469, + "loss": 0.6611, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10671161115169525, + "rewards/margins": 0.07157553732395172, + "rewards/rejected": -0.17828714847564697, + "step": 370 + }, + { + "epoch": 0.09945040565297043, + "grad_norm": 8.681933403015137, + "learning_rate": 4.960835509138381e-07, + "logits/chosen": -3.0945727825164795, + "logits/rejected": -3.0602633953094482, + "logps/chosen": -382.96539306640625, + "logps/rejected": -332.73968505859375, + "loss": 0.6631, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.10888388007879257, + "rewards/margins": 0.06903685629367828, + "rewards/rejected": -0.17792072892189026, + "step": 380 + }, + { + "epoch": 0.1020675215912065, + "grad_norm": 10.902252197265625, + "learning_rate": 4.999948856244767e-07, + "logits/chosen": -3.0760512351989746, + "logits/rejected": -3.0887162685394287, + "logps/chosen": -368.3498840332031, + "logps/rejected": -337.3893737792969, + "loss": 0.6596, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09421645104885101, + "rewards/margins": 0.07887949794530869, + "rewards/rejected": -0.1730959713459015, + "step": 390 + }, + { + "epoch": 0.10468463752944256, + "grad_norm": 8.231829643249512, + "learning_rate": 4.999698361256577e-07, + "logits/chosen": -3.104179859161377, + "logits/rejected": -3.0953407287597656, + "logps/chosen": -345.273681640625, + "logps/rejected": -292.58612060546875, + "loss": 0.6624, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.13569284975528717, + "rewards/margins": 0.0712323784828186, + "rewards/rejected": -0.20692522823810577, + "step": 400 + }, + { + "epoch": 0.10468463752944256, + "eval_logits/chosen": -3.077139377593994, + "eval_logits/rejected": -3.0663528442382812, + "eval_logps/chosen": -354.28900146484375, + "eval_logps/rejected": -322.285400390625, + "eval_loss": 0.6634809970855713, + "eval_rewards/accuracies": 0.6389999985694885, + "eval_rewards/chosen": -0.18070009350776672, + "eval_rewards/margins": 0.07110566645860672, + "eval_rewards/rejected": -0.25180572271347046, + "eval_runtime": 305.333, + "eval_samples_per_second": 6.55, + "eval_steps_per_second": 0.819, + "step": 400 + }, + { + "epoch": 0.10730175346767862, + "grad_norm": 8.956136703491211, + "learning_rate": 4.99923914217458e-07, + "logits/chosen": -3.0722298622131348, + "logits/rejected": -3.0702052116394043, + "logps/chosen": -326.5191955566406, + "logps/rejected": -319.8614807128906, + "loss": 0.6786, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.19534249603748322, + "rewards/margins": 0.04017153009772301, + "rewards/rejected": -0.23551401495933533, + "step": 410 + }, + { + "epoch": 0.10991886940591468, + "grad_norm": 11.84124755859375, + "learning_rate": 4.99857123734344e-07, + "logits/chosen": -3.0375618934631348, + "logits/rejected": -3.0047786235809326, + "logps/chosen": -316.19671630859375, + "logps/rejected": -287.55462646484375, + "loss": 0.6703, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.18900929391384125, + "rewards/margins": 0.05846139043569565, + "rewards/rejected": -0.2474706918001175, + "step": 420 + }, + { + "epoch": 0.11253598534415074, + "grad_norm": 10.265412330627441, + "learning_rate": 4.997694702533016e-07, + "logits/chosen": -3.0566139221191406, + "logits/rejected": -3.0296247005462646, + "logps/chosen": -365.24237060546875, + "logps/rejected": -334.05352783203125, + "loss": 0.6575, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13795217871665955, + "rewards/margins": 0.08431630581617355, + "rewards/rejected": -0.2222684919834137, + "step": 430 + }, + { + "epoch": 0.11515310128238682, + "grad_norm": 8.988418579101562, + "learning_rate": 4.996609610933712e-07, + "logits/chosen": -3.1079680919647217, + "logits/rejected": -3.115506649017334, + "logps/chosen": -354.38226318359375, + "logps/rejected": -315.77880859375, + "loss": 0.6676, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.10598504543304443, + "rewards/margins": 0.06034703180193901, + "rewards/rejected": -0.16633208096027374, + "step": 440 + }, + { + "epoch": 0.11777021722062288, + "grad_norm": 9.168876647949219, + "learning_rate": 4.995316053150366e-07, + "logits/chosen": -3.029470920562744, + "logits/rejected": -3.0375399589538574, + "logps/chosen": -347.7948303222656, + "logps/rejected": -318.85504150390625, + "loss": 0.6619, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.08917012065649033, + "rewards/margins": 0.07738355547189713, + "rewards/rejected": -0.16655369102954865, + "step": 450 + }, + { + "epoch": 0.12038733315885894, + "grad_norm": 12.874235153198242, + "learning_rate": 4.99381413719468e-07, + "logits/chosen": -3.0497491359710693, + "logits/rejected": -3.053278923034668, + "logps/chosen": -356.35498046875, + "logps/rejected": -337.4245300292969, + "loss": 0.6446, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1851172149181366, + "rewards/margins": 0.11571681499481201, + "rewards/rejected": -0.3008340001106262, + "step": 460 + }, + { + "epoch": 0.123004449097095, + "grad_norm": 15.89233112335205, + "learning_rate": 4.992103988476205e-07, + "logits/chosen": -3.0434165000915527, + "logits/rejected": -3.0270047187805176, + "logps/chosen": -341.6517333984375, + "logps/rejected": -321.88836669921875, + "loss": 0.6467, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2506132423877716, + "rewards/margins": 0.11548835039138794, + "rewards/rejected": -0.36610156297683716, + "step": 470 + }, + { + "epoch": 0.12562156503533106, + "grad_norm": 12.271796226501465, + "learning_rate": 4.990185749791864e-07, + "logits/chosen": -3.062100887298584, + "logits/rejected": -3.052743673324585, + "logps/chosen": -350.6018981933594, + "logps/rejected": -336.20159912109375, + "loss": 0.6468, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20994243025779724, + "rewards/margins": 0.11367367208003998, + "rewards/rejected": -0.323616087436676, + "step": 480 + }, + { + "epoch": 0.12823868097356714, + "grad_norm": 16.589054107666016, + "learning_rate": 4.988059581314039e-07, + "logits/chosen": -3.0361151695251465, + "logits/rejected": -3.0559310913085938, + "logps/chosen": -389.7374267578125, + "logps/rejected": -341.6214599609375, + "loss": 0.6457, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.2314407378435135, + "rewards/margins": 0.11587095260620117, + "rewards/rejected": -0.34731167554855347, + "step": 490 + }, + { + "epoch": 0.13085579691180318, + "grad_norm": 11.054908752441406, + "learning_rate": 4.985725660577184e-07, + "logits/chosen": -3.0402045249938965, + "logits/rejected": -3.028681755065918, + "logps/chosen": -382.54754638671875, + "logps/rejected": -331.24432373046875, + "loss": 0.6373, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.29573333263397217, + "rewards/margins": 0.13905784487724304, + "rewards/rejected": -0.43479123711586, + "step": 500 + }, + { + "epoch": 0.13085579691180318, + "eval_logits/chosen": -2.983909845352173, + "eval_logits/rejected": -2.9692585468292236, + "eval_logps/chosen": -366.0958557128906, + "eval_logps/rejected": -338.3079833984375, + "eval_loss": 0.6503274440765381, + "eval_rewards/accuracies": 0.6424999833106995, + "eval_rewards/chosen": -0.2987686097621918, + "eval_rewards/margins": 0.11326280236244202, + "eval_rewards/rejected": -0.4120314419269562, + "eval_runtime": 305.3597, + "eval_samples_per_second": 6.55, + "eval_steps_per_second": 0.819, + "step": 500 + }, + { + "epoch": 0.13347291285003926, + "grad_norm": 12.795198440551758, + "learning_rate": 4.983184182463008e-07, + "logits/chosen": -3.0068187713623047, + "logits/rejected": -3.0001091957092285, + "logps/chosen": -374.67767333984375, + "logps/rejected": -341.01715087890625, + "loss": 0.64, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.24733750522136688, + "rewards/margins": 0.1369626820087433, + "rewards/rejected": -0.384300172328949, + "step": 510 + }, + { + "epoch": 0.1360900287882753, + "grad_norm": 15.100224494934082, + "learning_rate": 4.980435359184203e-07, + "logits/chosen": -3.0251529216766357, + "logits/rejected": -3.0229506492614746, + "logps/chosen": -370.63250732421875, + "logps/rejected": -348.49871826171875, + "loss": 0.6491, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.27271249890327454, + "rewards/margins": 0.11933918297290802, + "rewards/rejected": -0.392051637172699, + "step": 520 + }, + { + "epoch": 0.13870714472651138, + "grad_norm": 13.007247924804688, + "learning_rate": 4.977479420266723e-07, + "logits/chosen": -2.944122791290283, + "logits/rejected": -2.9631760120391846, + "logps/chosen": -376.72235107421875, + "logps/rejected": -384.7277526855469, + "loss": 0.6455, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.37087589502334595, + "rewards/margins": 0.13121145963668823, + "rewards/rejected": -0.5020872950553894, + "step": 530 + }, + { + "epoch": 0.14132426066474746, + "grad_norm": 12.603479385375977, + "learning_rate": 4.974316612530614e-07, + "logits/chosen": -2.947624683380127, + "logits/rejected": -2.930647850036621, + "logps/chosen": -396.7923889160156, + "logps/rejected": -340.41363525390625, + "loss": 0.6186, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.4305369257926941, + "rewards/margins": 0.18575596809387207, + "rewards/rejected": -0.6162929534912109, + "step": 540 + }, + { + "epoch": 0.1439413766029835, + "grad_norm": 15.142595291137695, + "learning_rate": 4.970947200069415e-07, + "logits/chosen": -2.9528539180755615, + "logits/rejected": -2.966151237487793, + "logps/chosen": -370.8870544433594, + "logps/rejected": -354.8753967285156, + "loss": 0.6612, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2849995791912079, + "rewards/margins": 0.11011602729558945, + "rewards/rejected": -0.39511561393737793, + "step": 550 + }, + { + "epoch": 0.14655849254121958, + "grad_norm": 11.820856094360352, + "learning_rate": 4.967371464228095e-07, + "logits/chosen": -3.029913902282715, + "logits/rejected": -3.0248889923095703, + "logps/chosen": -353.63006591796875, + "logps/rejected": -352.70806884765625, + "loss": 0.6449, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2146318405866623, + "rewards/margins": 0.13045233488082886, + "rewards/rejected": -0.34508416056632996, + "step": 560 + }, + { + "epoch": 0.14917560847945563, + "grad_norm": 12.829729080200195, + "learning_rate": 4.963589703579569e-07, + "logits/chosen": -3.086268186569214, + "logits/rejected": -3.0621392726898193, + "logps/chosen": -407.73565673828125, + "logps/rejected": -370.2496337890625, + "loss": 0.6455, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.29791611433029175, + "rewards/margins": 0.135451078414917, + "rewards/rejected": -0.43336719274520874, + "step": 570 + }, + { + "epoch": 0.1517927244176917, + "grad_norm": 14.211217880249023, + "learning_rate": 4.959602233899761e-07, + "logits/chosen": -3.047476053237915, + "logits/rejected": -3.0107288360595703, + "logps/chosen": -416.2422790527344, + "logps/rejected": -366.1787414550781, + "loss": 0.6202, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.34379562735557556, + "rewards/margins": 0.19932778179645538, + "rewards/rejected": -0.5431233644485474, + "step": 580 + }, + { + "epoch": 0.15440984035592778, + "grad_norm": 14.383965492248535, + "learning_rate": 4.955409388141243e-07, + "logits/chosen": -2.9714608192443848, + "logits/rejected": -2.9554789066314697, + "logps/chosen": -369.77392578125, + "logps/rejected": -347.66058349609375, + "loss": 0.6443, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.44513049721717834, + "rewards/margins": 0.15323522686958313, + "rewards/rejected": -0.5983657240867615, + "step": 590 + }, + { + "epoch": 0.15702695629416383, + "grad_norm": 18.99432945251465, + "learning_rate": 4.951011516405429e-07, + "logits/chosen": -2.9896721839904785, + "logits/rejected": -3.010812520980835, + "logps/chosen": -368.553466796875, + "logps/rejected": -354.1104736328125, + "loss": 0.6423, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4014170169830322, + "rewards/margins": 0.15394745767116547, + "rewards/rejected": -0.5553644895553589, + "step": 600 + }, + { + "epoch": 0.15702695629416383, + "eval_logits/chosen": -2.953789710998535, + "eval_logits/rejected": -2.9371681213378906, + "eval_logps/chosen": -375.1290588378906, + "eval_logps/rejected": -350.5517578125, + "eval_loss": 0.6456736326217651, + "eval_rewards/accuracies": 0.637499988079071, + "eval_rewards/chosen": -0.3891007900238037, + "eval_rewards/margins": 0.1453685462474823, + "eval_rewards/rejected": -0.5344693660736084, + "eval_runtime": 305.2345, + "eval_samples_per_second": 6.552, + "eval_steps_per_second": 0.819, + "step": 600 + }, + { + "epoch": 0.1596440722323999, + "grad_norm": 14.88294792175293, + "learning_rate": 4.946408985913344e-07, + "logits/chosen": -2.9678356647491455, + "logits/rejected": -2.9549574851989746, + "logps/chosen": -354.6939392089844, + "logps/rejected": -326.92181396484375, + "loss": 0.6583, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3528710603713989, + "rewards/margins": 0.1104646697640419, + "rewards/rejected": -0.46333569288253784, + "step": 610 + }, + { + "epoch": 0.16226118817063595, + "grad_norm": 16.416210174560547, + "learning_rate": 4.941602180974958e-07, + "logits/chosen": -2.9832162857055664, + "logits/rejected": -2.9431064128875732, + "logps/chosen": -400.99798583984375, + "logps/rejected": -325.7860412597656, + "loss": 0.6386, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3458956480026245, + "rewards/margins": 0.15582728385925293, + "rewards/rejected": -0.5017229318618774, + "step": 620 + }, + { + "epoch": 0.16487830410887203, + "grad_norm": 11.9800386428833, + "learning_rate": 4.936591502957101e-07, + "logits/chosen": -2.984358787536621, + "logits/rejected": -2.9828975200653076, + "logps/chosen": -356.4893493652344, + "logps/rejected": -343.63519287109375, + "loss": 0.6146, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.371735155582428, + "rewards/margins": 0.22488275170326233, + "rewards/rejected": -0.5966178178787231, + "step": 630 + }, + { + "epoch": 0.16749542004710807, + "grad_norm": 13.97966194152832, + "learning_rate": 4.931377370249945e-07, + "logits/chosen": -2.979612350463867, + "logits/rejected": -2.942445993423462, + "logps/chosen": -383.40643310546875, + "logps/rejected": -351.91448974609375, + "loss": 0.6306, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4733237326145172, + "rewards/margins": 0.18433848023414612, + "rewards/rejected": -0.6576622128486633, + "step": 640 + }, + { + "epoch": 0.17011253598534415, + "grad_norm": 15.059297561645508, + "learning_rate": 4.925960218232072e-07, + "logits/chosen": -2.9805855751037598, + "logits/rejected": -2.963609218597412, + "logps/chosen": -371.2182312011719, + "logps/rejected": -376.56842041015625, + "loss": 0.6076, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4861178994178772, + "rewards/margins": 0.249765545129776, + "rewards/rejected": -0.7358834147453308, + "step": 650 + }, + { + "epoch": 0.17272965192358022, + "grad_norm": 17.203136444091797, + "learning_rate": 4.920340499234116e-07, + "logits/chosen": -2.945225477218628, + "logits/rejected": -2.918910503387451, + "logps/chosen": -391.84295654296875, + "logps/rejected": -356.2134704589844, + "loss": 0.6413, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5608336925506592, + "rewards/margins": 0.1767912060022354, + "rewards/rejected": -0.7376248836517334, + "step": 660 + }, + { + "epoch": 0.17534676786181627, + "grad_norm": 23.82184600830078, + "learning_rate": 4.914518682500995e-07, + "logits/chosen": -3.0318546295166016, + "logits/rejected": -3.0210180282592773, + "logps/chosen": -397.6226501464844, + "logps/rejected": -371.94366455078125, + "loss": 0.6136, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5073692202568054, + "rewards/margins": 0.23156848549842834, + "rewards/rejected": -0.7389377355575562, + "step": 670 + }, + { + "epoch": 0.17796388380005235, + "grad_norm": 12.960205078125, + "learning_rate": 4.90849525415273e-07, + "logits/chosen": -2.9876632690429688, + "logits/rejected": -2.9763565063476562, + "logps/chosen": -399.0164489746094, + "logps/rejected": -358.02972412109375, + "loss": 0.6074, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5352808833122253, + "rewards/margins": 0.2567313313484192, + "rewards/rejected": -0.7920122742652893, + "step": 680 + }, + { + "epoch": 0.1805809997382884, + "grad_norm": 16.679786682128906, + "learning_rate": 4.902270717143858e-07, + "logits/chosen": -2.982391119003296, + "logits/rejected": -2.9801688194274902, + "logps/chosen": -371.9320373535156, + "logps/rejected": -395.40777587890625, + "loss": 0.6047, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6789665222167969, + "rewards/margins": 0.27203455567359924, + "rewards/rejected": -0.9510010480880737, + "step": 690 + }, + { + "epoch": 0.18319811567652447, + "grad_norm": 13.776201248168945, + "learning_rate": 4.895845591221426e-07, + "logits/chosen": -2.9419898986816406, + "logits/rejected": -2.9705393314361572, + "logps/chosen": -395.2472839355469, + "logps/rejected": -399.77435302734375, + "loss": 0.6266, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6857240796089172, + "rewards/margins": 0.23093768954277039, + "rewards/rejected": -0.9166617393493652, + "step": 700 + }, + { + "epoch": 0.18319811567652447, + "eval_logits/chosen": -2.9229085445404053, + "eval_logits/rejected": -2.9094736576080322, + "eval_logps/chosen": -406.5211486816406, + "eval_logps/rejected": -387.9122619628906, + "eval_loss": 0.642038881778717, + "eval_rewards/accuracies": 0.6365000009536743, + "eval_rewards/chosen": -0.7030214667320251, + "eval_rewards/margins": 0.205052450299263, + "eval_rewards/rejected": -0.9080740213394165, + "eval_runtime": 305.3295, + "eval_samples_per_second": 6.55, + "eval_steps_per_second": 0.819, + "step": 700 + }, + { + "epoch": 0.18581523161476055, + "grad_norm": 44.67826461791992, + "learning_rate": 4.8892204128816e-07, + "logits/chosen": -2.973836660385132, + "logits/rejected": -2.9684879779815674, + "logps/chosen": -407.85052490234375, + "logps/rejected": -396.7626647949219, + "loss": 0.6492, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6791498064994812, + "rewards/margins": 0.1747795045375824, + "rewards/rejected": -0.853929340839386, + "step": 710 + }, + { + "epoch": 0.1884323475529966, + "grad_norm": 22.158349990844727, + "learning_rate": 4.882395735324863e-07, + "logits/chosen": -2.9512436389923096, + "logits/rejected": -2.902366876602173, + "logps/chosen": -407.8587646484375, + "logps/rejected": -385.2088317871094, + "loss": 0.651, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6286091804504395, + "rewards/margins": 0.17438486218452454, + "rewards/rejected": -0.8029941320419312, + "step": 720 + }, + { + "epoch": 0.19104946349123267, + "grad_norm": 15.196109771728516, + "learning_rate": 4.875372128409829e-07, + "logits/chosen": -2.939497232437134, + "logits/rejected": -2.9162261486053467, + "logps/chosen": -397.8129577636719, + "logps/rejected": -357.2427978515625, + "loss": 0.6525, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5556532144546509, + "rewards/margins": 0.15908560156822205, + "rewards/rejected": -0.7147387266159058, + "step": 730 + }, + { + "epoch": 0.19366657942946872, + "grad_norm": 13.985071182250977, + "learning_rate": 4.868150178605653e-07, + "logits/chosen": -2.9422688484191895, + "logits/rejected": -2.945159435272217, + "logps/chosen": -343.93597412109375, + "logps/rejected": -314.3185119628906, + "loss": 0.6162, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5319259166717529, + "rewards/margins": 0.2180342972278595, + "rewards/rejected": -0.74996018409729, + "step": 740 + }, + { + "epoch": 0.1962836953677048, + "grad_norm": 14.363373756408691, + "learning_rate": 4.860730488943068e-07, + "logits/chosen": -2.917142629623413, + "logits/rejected": -2.9183247089385986, + "logps/chosen": -356.16552734375, + "logps/rejected": -352.13116455078125, + "loss": 0.6178, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4881489872932434, + "rewards/margins": 0.2087596356868744, + "rewards/rejected": -0.6969085931777954, + "step": 750 + }, + { + "epoch": 0.19890081130594087, + "grad_norm": 16.34290885925293, + "learning_rate": 4.853113678964021e-07, + "logits/chosen": -2.939605712890625, + "logits/rejected": -2.9561939239501953, + "logps/chosen": -398.5882263183594, + "logps/rejected": -393.08612060546875, + "loss": 0.6337, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5295611619949341, + "rewards/margins": 0.18238191306591034, + "rewards/rejected": -0.7119430303573608, + "step": 760 + }, + { + "epoch": 0.20151792724417691, + "grad_norm": 13.591556549072266, + "learning_rate": 4.845300384669957e-07, + "logits/chosen": -2.9653282165527344, + "logits/rejected": -2.9657387733459473, + "logps/chosen": -367.00433349609375, + "logps/rejected": -339.44781494140625, + "loss": 0.6483, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4500230848789215, + "rewards/margins": 0.13824717700481415, + "rewards/rejected": -0.5882702469825745, + "step": 770 + }, + { + "epoch": 0.204135043182413, + "grad_norm": 16.679380416870117, + "learning_rate": 4.8372912584687e-07, + "logits/chosen": -2.9903674125671387, + "logits/rejected": -2.9671452045440674, + "logps/chosen": -396.09393310546875, + "logps/rejected": -374.65313720703125, + "loss": 0.6298, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3986968398094177, + "rewards/margins": 0.19737406075000763, + "rewards/rejected": -0.5960708856582642, + "step": 780 + }, + { + "epoch": 0.20675215912064904, + "grad_norm": 15.164281845092773, + "learning_rate": 4.829086969119983e-07, + "logits/chosen": -2.9598774909973145, + "logits/rejected": -2.9876530170440674, + "logps/chosen": -361.7220153808594, + "logps/rejected": -362.77734375, + "loss": 0.6576, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.4737739562988281, + "rewards/margins": 0.13218006491661072, + "rewards/rejected": -0.6059540510177612, + "step": 790 + }, + { + "epoch": 0.2093692750588851, + "grad_norm": 14.60089111328125, + "learning_rate": 4.820688201679605e-07, + "logits/chosen": -3.0159783363342285, + "logits/rejected": -2.9888081550598145, + "logps/chosen": -381.7862243652344, + "logps/rejected": -319.8965759277344, + "loss": 0.5942, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4121805727481842, + "rewards/margins": 0.2772974371910095, + "rewards/rejected": -0.6894780397415161, + "step": 800 + }, + { + "epoch": 0.2093692750588851, + "eval_logits/chosen": -2.939723253250122, + "eval_logits/rejected": -2.925475835800171, + "eval_logps/chosen": -385.91180419921875, + "eval_logps/rejected": -364.74835205078125, + "eval_loss": 0.6367480754852295, + "eval_rewards/accuracies": 0.6474999785423279, + "eval_rewards/chosen": -0.4969281554222107, + "eval_rewards/margins": 0.17950665950775146, + "eval_rewards/rejected": -0.6764348149299622, + "eval_runtime": 305.2085, + "eval_samples_per_second": 6.553, + "eval_steps_per_second": 0.819, + "step": 800 + }, + { + "epoch": 0.21198639099712116, + "grad_norm": 12.582085609436035, + "learning_rate": 4.812095657442231e-07, + "logits/chosen": -2.991004467010498, + "logits/rejected": -3.030839204788208, + "logps/chosen": -396.093017578125, + "logps/rejected": -396.8296203613281, + "loss": 0.6504, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5232123732566833, + "rewards/margins": 0.1550460159778595, + "rewards/rejected": -0.6782584190368652, + "step": 810 + }, + { + "epoch": 0.21460350693535724, + "grad_norm": 16.284395217895508, + "learning_rate": 4.803310053882831e-07, + "logits/chosen": -2.9644618034362793, + "logits/rejected": -2.983457088470459, + "logps/chosen": -342.4942321777344, + "logps/rejected": -368.9544372558594, + "loss": 0.6324, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5437088012695312, + "rewards/margins": 0.18539607524871826, + "rewards/rejected": -0.7291048765182495, + "step": 820 + }, + { + "epoch": 0.2172206228735933, + "grad_norm": 16.87338638305664, + "learning_rate": 4.794332124596775e-07, + "logits/chosen": -2.9662275314331055, + "logits/rejected": -2.9847869873046875, + "logps/chosen": -412.5888671875, + "logps/rejected": -409.6481018066406, + "loss": 0.6377, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.606338381767273, + "rewards/margins": 0.19459688663482666, + "rewards/rejected": -0.8009351491928101, + "step": 830 + }, + { + "epoch": 0.21983773881182936, + "grad_norm": 18.873977661132812, + "learning_rate": 4.785162619238574e-07, + "logits/chosen": -2.9262964725494385, + "logits/rejected": -2.9126768112182617, + "logps/chosen": -387.4312438964844, + "logps/rejected": -363.56671142578125, + "loss": 0.6294, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.6643389463424683, + "rewards/margins": 0.21009120345115662, + "rewards/rejected": -0.8744300603866577, + "step": 840 + }, + { + "epoch": 0.22245485475006543, + "grad_norm": 22.909137725830078, + "learning_rate": 4.775802303459287e-07, + "logits/chosen": -2.9198760986328125, + "logits/rejected": -2.91233491897583, + "logps/chosen": -391.82257080078125, + "logps/rejected": -389.66168212890625, + "loss": 0.6364, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6800801157951355, + "rewards/margins": 0.21083597838878632, + "rewards/rejected": -0.8909161686897278, + "step": 850 + }, + { + "epoch": 0.22507197068830148, + "grad_norm": 17.654157638549805, + "learning_rate": 4.766251958842589e-07, + "logits/chosen": -2.899445056915283, + "logits/rejected": -2.8913986682891846, + "logps/chosen": -407.30523681640625, + "logps/rejected": -389.7769775390625, + "loss": 0.6371, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6278396844863892, + "rewards/margins": 0.1948491483926773, + "rewards/rejected": -0.82268887758255, + "step": 860 + }, + { + "epoch": 0.22768908662653756, + "grad_norm": 23.719482421875, + "learning_rate": 4.756512382839506e-07, + "logits/chosen": -2.8611950874328613, + "logits/rejected": -2.8404242992401123, + "logps/chosen": -392.8919982910156, + "logps/rejected": -398.45172119140625, + "loss": 0.6474, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6410590410232544, + "rewards/margins": 0.19700810313224792, + "rewards/rejected": -0.8380670547485352, + "step": 870 + }, + { + "epoch": 0.23030620256477363, + "grad_norm": 19.95950698852539, + "learning_rate": 4.746584388701831e-07, + "logits/chosen": -2.9077162742614746, + "logits/rejected": -2.902599334716797, + "logps/chosen": -403.0314636230469, + "logps/rejected": -388.3122253417969, + "loss": 0.6378, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6414380073547363, + "rewards/margins": 0.19418500363826752, + "rewards/rejected": -0.8356229662895203, + "step": 880 + }, + { + "epoch": 0.23292331850300968, + "grad_norm": 20.028629302978516, + "learning_rate": 4.736468805414218e-07, + "logits/chosen": -2.889512538909912, + "logits/rejected": -2.907574415206909, + "logps/chosen": -376.83734130859375, + "logps/rejected": -398.76568603515625, + "loss": 0.6141, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5963784456253052, + "rewards/margins": 0.25601688027381897, + "rewards/rejected": -0.852395236492157, + "step": 890 + }, + { + "epoch": 0.23554043444124576, + "grad_norm": 22.77333641052246, + "learning_rate": 4.7261664776249595e-07, + "logits/chosen": -2.8576667308807373, + "logits/rejected": -2.837547779083252, + "logps/chosen": -346.2492980957031, + "logps/rejected": -343.90350341796875, + "loss": 0.6171, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5129637122154236, + "rewards/margins": 0.22999227046966553, + "rewards/rejected": -0.7429560422897339, + "step": 900 + }, + { + "epoch": 0.23554043444124576, + "eval_logits/chosen": -2.8992178440093994, + "eval_logits/rejected": -2.881546974182129, + "eval_logps/chosen": -390.1064758300781, + "eval_logps/rejected": -371.53509521484375, + "eval_loss": 0.6330006122589111, + "eval_rewards/accuracies": 0.6545000076293945, + "eval_rewards/chosen": -0.5388749837875366, + "eval_rewards/margins": 0.20542745292186737, + "eval_rewards/rejected": -0.7443024516105652, + "eval_runtime": 305.2957, + "eval_samples_per_second": 6.551, + "eval_steps_per_second": 0.819, + "step": 900 + }, + { + "epoch": 0.2381575503794818, + "grad_norm": 16.45803451538086, + "learning_rate": 4.7156782655754624e-07, + "logits/chosen": -2.96109938621521, + "logits/rejected": -2.9101853370666504, + "logps/chosen": -412.92095947265625, + "logps/rejected": -353.53106689453125, + "loss": 0.638, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5114455223083496, + "rewards/margins": 0.19183678925037384, + "rewards/rejected": -0.703282356262207, + "step": 910 + }, + { + "epoch": 0.24077466631771788, + "grad_norm": 14.590066909790039, + "learning_rate": 4.705005045028414e-07, + "logits/chosen": -2.8991317749023438, + "logits/rejected": -2.8741579055786133, + "logps/chosen": -404.0238952636719, + "logps/rejected": -386.28668212890625, + "loss": 0.6204, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6287791132926941, + "rewards/margins": 0.23485076427459717, + "rewards/rejected": -0.863629937171936, + "step": 920 + }, + { + "epoch": 0.24339178225595393, + "grad_norm": 21.144271850585938, + "learning_rate": 4.694147707194659e-07, + "logits/chosen": -2.976510524749756, + "logits/rejected": -2.9652724266052246, + "logps/chosen": -416.43731689453125, + "logps/rejected": -405.408935546875, + "loss": 0.6055, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7039724588394165, + "rewards/margins": 0.302602618932724, + "rewards/rejected": -1.0065749883651733, + "step": 930 + }, + { + "epoch": 0.24600889819419, + "grad_norm": 19.868425369262695, + "learning_rate": 4.683107158658781e-07, + "logits/chosen": -2.8925671577453613, + "logits/rejected": -2.888092517852783, + "logps/chosen": -431.69171142578125, + "logps/rejected": -417.070556640625, + "loss": 0.5975, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.6716684103012085, + "rewards/margins": 0.3120590150356293, + "rewards/rejected": -0.9837274551391602, + "step": 940 + }, + { + "epoch": 0.24862601413242608, + "grad_norm": 25.779151916503906, + "learning_rate": 4.6718843213034066e-07, + "logits/chosen": -2.9175336360931396, + "logits/rejected": -2.9109795093536377, + "logps/chosen": -401.5478515625, + "logps/rejected": -391.0757141113281, + "loss": 0.6248, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8010644912719727, + "rewards/margins": 0.26846641302108765, + "rewards/rejected": -1.0695308446884155, + "step": 950 + }, + { + "epoch": 0.2512431300706621, + "grad_norm": 19.95328712463379, + "learning_rate": 4.660480132232224e-07, + "logits/chosen": -2.910229206085205, + "logits/rejected": -2.9196810722351074, + "logps/chosen": -433.5536193847656, + "logps/rejected": -417.9415588378906, + "loss": 0.6403, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8831882476806641, + "rewards/margins": 0.22323890030384064, + "rewards/rejected": -1.1064269542694092, + "step": 960 + }, + { + "epoch": 0.25386024600889817, + "grad_norm": 23.01506996154785, + "learning_rate": 4.64889554369174e-07, + "logits/chosen": -2.9168238639831543, + "logits/rejected": -2.8963983058929443, + "logps/chosen": -433.3750915527344, + "logps/rejected": -402.49578857421875, + "loss": 0.589, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8203709721565247, + "rewards/margins": 0.3379738926887512, + "rewards/rejected": -1.1583448648452759, + "step": 970 + }, + { + "epoch": 0.2564773619471343, + "grad_norm": 21.516292572021484, + "learning_rate": 4.637131522991764e-07, + "logits/chosen": -2.8918282985687256, + "logits/rejected": -2.890716552734375, + "logps/chosen": -435.3067932128906, + "logps/rejected": -429.76068115234375, + "loss": 0.6116, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8054723739624023, + "rewards/margins": 0.27397674322128296, + "rewards/rejected": -1.0794490575790405, + "step": 980 + }, + { + "epoch": 0.2590944778853703, + "grad_norm": 18.14751625061035, + "learning_rate": 4.6251890524246375e-07, + "logits/chosen": -2.9248886108398438, + "logits/rejected": -2.9085328578948975, + "logps/chosen": -389.9818115234375, + "logps/rejected": -382.972900390625, + "loss": 0.5939, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.819879412651062, + "rewards/margins": 0.344366192817688, + "rewards/rejected": -1.16424560546875, + "step": 990 + }, + { + "epoch": 0.26171159382360637, + "grad_norm": 18.19559097290039, + "learning_rate": 4.613069129183218e-07, + "logits/chosen": -2.947519063949585, + "logits/rejected": -2.893505811691284, + "logps/chosen": -472.57659912109375, + "logps/rejected": -439.01263427734375, + "loss": 0.6156, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.858264148235321, + "rewards/margins": 0.26358070969581604, + "rewards/rejected": -1.121845006942749, + "step": 1000 + }, + { + "epoch": 0.26171159382360637, + "eval_logits/chosen": -2.8664913177490234, + "eval_logits/rejected": -2.846865177154541, + "eval_logps/chosen": -428.9975280761719, + "eval_logps/rejected": -414.9855041503906, + "eval_loss": 0.6271011829376221, + "eval_rewards/accuracies": 0.6460000276565552, + "eval_rewards/chosen": -0.9277856349945068, + "eval_rewards/margins": 0.2510209381580353, + "eval_rewards/rejected": -1.1788065433502197, + "eval_runtime": 305.3271, + "eval_samples_per_second": 6.55, + "eval_steps_per_second": 0.819, + "step": 1000 + }, + { + "epoch": 0.2643287097618425, + "grad_norm": 28.633275985717773, + "learning_rate": 4.6007727652776065e-07, + "logits/chosen": -2.8592729568481445, + "logits/rejected": -2.8527398109436035, + "logps/chosen": -399.6195983886719, + "logps/rejected": -404.9161071777344, + "loss": 0.6108, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9239141345024109, + "rewards/margins": 0.2775779664516449, + "rewards/rejected": -1.2014920711517334, + "step": 1010 + }, + { + "epoch": 0.2669458257000785, + "grad_norm": 16.247283935546875, + "learning_rate": 4.588300987450652e-07, + "logits/chosen": -2.925482749938965, + "logits/rejected": -2.9205174446105957, + "logps/chosen": -404.5378723144531, + "logps/rejected": -372.3696594238281, + "loss": 0.6335, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.8224859237670898, + "rewards/margins": 0.25445401668548584, + "rewards/rejected": -1.0769398212432861, + "step": 1020 + }, + { + "epoch": 0.26956294163831457, + "grad_norm": 15.247428894042969, + "learning_rate": 4.5756548370922134e-07, + "logits/chosen": -2.8664026260375977, + "logits/rejected": -2.852753162384033, + "logps/chosen": -376.1838073730469, + "logps/rejected": -374.4062194824219, + "loss": 0.6468, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6621295213699341, + "rewards/margins": 0.20799696445465088, + "rewards/rejected": -0.8701265454292297, + "step": 1030 + }, + { + "epoch": 0.2721800575765506, + "grad_norm": 26.429670333862305, + "learning_rate": 4.5628353701522047e-07, + "logits/chosen": -2.8901050090789795, + "logits/rejected": -2.8922314643859863, + "logps/chosen": -440.0348205566406, + "logps/rejected": -419.0166015625, + "loss": 0.6016, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5985369086265564, + "rewards/margins": 0.2978154718875885, + "rewards/rejected": -0.8963524699211121, + "step": 1040 + }, + { + "epoch": 0.2747971735147867, + "grad_norm": 16.453901290893555, + "learning_rate": 4.549843657052429e-07, + "logits/chosen": -2.913086414337158, + "logits/rejected": -2.9038262367248535, + "logps/chosen": -408.3675842285156, + "logps/rejected": -414.268310546875, + "loss": 0.5836, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6507371664047241, + "rewards/margins": 0.3384549021720886, + "rewards/rejected": -0.9891921281814575, + "step": 1050 + }, + { + "epoch": 0.27741428945302277, + "grad_norm": 20.94676971435547, + "learning_rate": 4.5366807825971907e-07, + "logits/chosen": -2.854055643081665, + "logits/rejected": -2.854862928390503, + "logps/chosen": -393.8934020996094, + "logps/rejected": -386.7836608886719, + "loss": 0.6529, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8595059514045715, + "rewards/margins": 0.21301527321338654, + "rewards/rejected": -1.0725212097167969, + "step": 1060 + }, + { + "epoch": 0.2800314053912588, + "grad_norm": 19.222152709960938, + "learning_rate": 4.5233478458827176e-07, + "logits/chosen": -2.895177125930786, + "logits/rejected": -2.871817111968994, + "logps/chosen": -451.8866271972656, + "logps/rejected": -412.9781188964844, + "loss": 0.579, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8334499597549438, + "rewards/margins": 0.3665994703769684, + "rewards/rejected": -1.2000494003295898, + "step": 1070 + }, + { + "epoch": 0.2826485213294949, + "grad_norm": 18.82132339477539, + "learning_rate": 4.509845960205389e-07, + "logits/chosen": -2.8257076740264893, + "logits/rejected": -2.8204989433288574, + "logps/chosen": -429.1475524902344, + "logps/rejected": -408.2667541503906, + "loss": 0.6282, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.771919846534729, + "rewards/margins": 0.25145334005355835, + "rewards/rejected": -1.0233732461929321, + "step": 1080 + }, + { + "epoch": 0.28526563726773096, + "grad_norm": 22.576290130615234, + "learning_rate": 4.4961762529687736e-07, + "logits/chosen": -2.8405702114105225, + "logits/rejected": -2.8226046562194824, + "logps/chosen": -415.6966857910156, + "logps/rejected": -397.05230712890625, + "loss": 0.6673, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.814344048500061, + "rewards/margins": 0.15356814861297607, + "rewards/rejected": -0.9679121971130371, + "step": 1090 + }, + { + "epoch": 0.287882753205967, + "grad_norm": 26.202625274658203, + "learning_rate": 4.482339865589492e-07, + "logits/chosen": -2.860849142074585, + "logits/rejected": -2.819664239883423, + "logps/chosen": -419.00335693359375, + "logps/rejected": -368.59674072265625, + "loss": 0.6636, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.829715371131897, + "rewards/margins": 0.14634691178798676, + "rewards/rejected": -0.9760621786117554, + "step": 1100 + }, + { + "epoch": 0.287882753205967, + "eval_logits/chosen": -2.83473801612854, + "eval_logits/rejected": -2.8143739700317383, + "eval_logps/chosen": -416.0617980957031, + "eval_logps/rejected": -400.14886474609375, + "eval_loss": 0.6234466433525085, + "eval_rewards/accuracies": 0.6514999866485596, + "eval_rewards/chosen": -0.7984281182289124, + "eval_rewards/margins": 0.23201218247413635, + "eval_rewards/rejected": -1.0304402112960815, + "eval_runtime": 305.573, + "eval_samples_per_second": 6.545, + "eval_steps_per_second": 0.818, + "step": 1100 + }, + { + "epoch": 0.2904998691442031, + "grad_norm": 21.07489776611328, + "learning_rate": 4.4683379534019076e-07, + "logits/chosen": -2.883356809616089, + "logits/rejected": -2.8841772079467773, + "logps/chosen": -424.1853942871094, + "logps/rejected": -418.2691345214844, + "loss": 0.6305, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7898249626159668, + "rewards/margins": 0.19554057717323303, + "rewards/rejected": -0.9853655099868774, + "step": 1110 + }, + { + "epoch": 0.29311698508243916, + "grad_norm": 21.048519134521484, + "learning_rate": 4.4541716855616593e-07, + "logits/chosen": -2.821552038192749, + "logits/rejected": -2.7955610752105713, + "logps/chosen": -392.34136962890625, + "logps/rejected": -401.60382080078125, + "loss": 0.6075, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7865413427352905, + "rewards/margins": 0.277415007352829, + "rewards/rejected": -1.0639564990997314, + "step": 1120 + }, + { + "epoch": 0.2957341010206752, + "grad_norm": 16.305269241333008, + "learning_rate": 4.4398422449480357e-07, + "logits/chosen": -2.7998039722442627, + "logits/rejected": -2.76208758354187, + "logps/chosen": -412.30511474609375, + "logps/rejected": -412.4529724121094, + "loss": 0.6492, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8083792924880981, + "rewards/margins": 0.19571712613105774, + "rewards/rejected": -1.0040963888168335, + "step": 1130 + }, + { + "epoch": 0.29835121695891126, + "grad_norm": 19.82772445678711, + "learning_rate": 4.4253508280652036e-07, + "logits/chosen": -2.8314437866210938, + "logits/rejected": -2.7757253646850586, + "logps/chosen": -435.48638916015625, + "logps/rejected": -384.79669189453125, + "loss": 0.6123, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6878252029418945, + "rewards/margins": 0.2673383355140686, + "rewards/rejected": -0.9551635980606079, + "step": 1140 + }, + { + "epoch": 0.30096833289714736, + "grad_norm": 16.961671829223633, + "learning_rate": 4.410698644942302e-07, + "logits/chosen": -2.8539395332336426, + "logits/rejected": -2.839357614517212, + "logps/chosen": -425.7850036621094, + "logps/rejected": -408.3335266113281, + "loss": 0.597, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7383579611778259, + "rewards/margins": 0.29951414465904236, + "rewards/rejected": -1.037872076034546, + "step": 1150 + }, + { + "epoch": 0.3035854488353834, + "grad_norm": 18.105398178100586, + "learning_rate": 4.3958869190324057e-07, + "logits/chosen": -2.784026622772217, + "logits/rejected": -2.7167916297912598, + "logps/chosen": -432.7950744628906, + "logps/rejected": -418.3269958496094, + "loss": 0.6018, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9281485676765442, + "rewards/margins": 0.3167566657066345, + "rewards/rejected": -1.2449051141738892, + "step": 1160 + }, + { + "epoch": 0.30620256477361946, + "grad_norm": 20.290996551513672, + "learning_rate": 4.380916887110365e-07, + "logits/chosen": -2.844586133956909, + "logits/rejected": -2.80912446975708, + "logps/chosen": -451.59234619140625, + "logps/rejected": -413.44390869140625, + "loss": 0.6339, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2258434295654297, + "rewards/margins": 0.2452480047941208, + "rewards/rejected": -1.4710915088653564, + "step": 1170 + }, + { + "epoch": 0.30881968071185556, + "grad_norm": 27.33935546875, + "learning_rate": 4.3657897991695394e-07, + "logits/chosen": -2.7094969749450684, + "logits/rejected": -2.780738115310669, + "logps/chosen": -440.80987548828125, + "logps/rejected": -465.4100646972656, + "loss": 0.6284, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2238306999206543, + "rewards/margins": 0.2834627032279968, + "rewards/rejected": -1.5072933435440063, + "step": 1180 + }, + { + "epoch": 0.3114367966500916, + "grad_norm": 19.233346939086914, + "learning_rate": 4.350506918317416e-07, + "logits/chosen": -2.8551743030548096, + "logits/rejected": -2.819492816925049, + "logps/chosen": -427.6788635253906, + "logps/rejected": -432.52703857421875, + "loss": 0.6235, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1010067462921143, + "rewards/margins": 0.2940807640552521, + "rewards/rejected": -1.395087480545044, + "step": 1190 + }, + { + "epoch": 0.31405391258832765, + "grad_norm": 32.360862731933594, + "learning_rate": 4.335069520670149e-07, + "logits/chosen": -2.813873291015625, + "logits/rejected": -2.778324604034424, + "logps/chosen": -412.0577087402344, + "logps/rejected": -414.05462646484375, + "loss": 0.6832, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.132204294204712, + "rewards/margins": 0.14098653197288513, + "rewards/rejected": -1.2731907367706299, + "step": 1200 + }, + { + "epoch": 0.31405391258832765, + "eval_logits/chosen": -2.821247100830078, + "eval_logits/rejected": -2.7994351387023926, + "eval_logps/chosen": -439.2535705566406, + "eval_logps/rejected": -428.8004150390625, + "eval_loss": 0.6152091026306152, + "eval_rewards/accuracies": 0.6570000052452087, + "eval_rewards/chosen": -1.0303457975387573, + "eval_rewards/margins": 0.2866101562976837, + "eval_rewards/rejected": -1.3169556856155396, + "eval_runtime": 305.4997, + "eval_samples_per_second": 6.547, + "eval_steps_per_second": 0.818, + "step": 1200 + }, + { + "epoch": 0.3166710285265637, + "grad_norm": 20.405153274536133, + "learning_rate": 4.319478895245999e-07, + "logits/chosen": -2.8161978721618652, + "logits/rejected": -2.7879137992858887, + "logps/chosen": -414.6759338378906, + "logps/rejected": -396.0447998046875, + "loss": 0.6088, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9484050869941711, + "rewards/margins": 0.28157109022140503, + "rewards/rejected": -1.2299760580062866, + "step": 1210 + }, + { + "epoch": 0.3192881444647998, + "grad_norm": 21.530492782592773, + "learning_rate": 4.3037363438577036e-07, + "logits/chosen": -2.8707480430603027, + "logits/rejected": -2.8560750484466553, + "logps/chosen": -423.5750427246094, + "logps/rejected": -442.72576904296875, + "loss": 0.6312, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9034959673881531, + "rewards/margins": 0.23671674728393555, + "rewards/rejected": -1.1402127742767334, + "step": 1220 + }, + { + "epoch": 0.32190526040303585, + "grad_norm": 19.900814056396484, + "learning_rate": 4.2878431810037716e-07, + "logits/chosen": -2.8587565422058105, + "logits/rejected": -2.8229823112487793, + "logps/chosen": -477.77374267578125, + "logps/rejected": -438.41082763671875, + "loss": 0.5864, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9826862215995789, + "rewards/margins": 0.3427571952342987, + "rewards/rejected": -1.3254432678222656, + "step": 1230 + }, + { + "epoch": 0.3245223763412719, + "grad_norm": 16.341970443725586, + "learning_rate": 4.271800733758729e-07, + "logits/chosen": -2.7875592708587646, + "logits/rejected": -2.7579915523529053, + "logps/chosen": -480.4039611816406, + "logps/rejected": -449.5467834472656, + "loss": 0.6178, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1455281972885132, + "rewards/margins": 0.32378047704696655, + "rewards/rejected": -1.4693087339401245, + "step": 1240 + }, + { + "epoch": 0.327139492279508, + "grad_norm": 17.49270248413086, + "learning_rate": 4.255610341662304e-07, + "logits/chosen": -2.830955982208252, + "logits/rejected": -2.783357858657837, + "logps/chosen": -442.563720703125, + "logps/rejected": -430.7234802246094, + "loss": 0.6059, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.118556261062622, + "rewards/margins": 0.31357699632644653, + "rewards/rejected": -1.4321330785751343, + "step": 1250 + }, + { + "epoch": 0.32975660821774405, + "grad_norm": 15.970780372619629, + "learning_rate": 4.2392733566075757e-07, + "logits/chosen": -2.822460174560547, + "logits/rejected": -2.818441867828369, + "logps/chosen": -425.0038146972656, + "logps/rejected": -420.7904357910156, + "loss": 0.632, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9978582262992859, + "rewards/margins": 0.23995347321033478, + "rewards/rejected": -1.237811803817749, + "step": 1260 + }, + { + "epoch": 0.3323737241559801, + "grad_norm": 23.021156311035156, + "learning_rate": 4.2227911427280973e-07, + "logits/chosen": -2.8251795768737793, + "logits/rejected": -2.798708200454712, + "logps/chosen": -408.9261779785156, + "logps/rejected": -386.421875, + "loss": 0.6156, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.916782557964325, + "rewards/margins": 0.2843112349510193, + "rewards/rejected": -1.2010937929153442, + "step": 1270 + }, + { + "epoch": 0.33499084009421615, + "grad_norm": 22.88947105407715, + "learning_rate": 4.206165076283982e-07, + "logits/chosen": -2.8256075382232666, + "logits/rejected": -2.8119583129882812, + "logps/chosen": -417.31341552734375, + "logps/rejected": -419.28680419921875, + "loss": 0.5893, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0365794897079468, + "rewards/margins": 0.34971266984939575, + "rewards/rejected": -1.3862922191619873, + "step": 1280 + }, + { + "epoch": 0.33760795603245225, + "grad_norm": 20.84588623046875, + "learning_rate": 4.1893965455469946e-07, + "logits/chosen": -2.7863240242004395, + "logits/rejected": -2.772761583328247, + "logps/chosen": -448.05792236328125, + "logps/rejected": -433.765869140625, + "loss": 0.6644, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2257723808288574, + "rewards/margins": 0.2406727820634842, + "rewards/rejected": -1.4664452075958252, + "step": 1290 + }, + { + "epoch": 0.3402250719706883, + "grad_norm": 18.72652816772461, + "learning_rate": 4.172486950684626e-07, + "logits/chosen": -2.824350118637085, + "logits/rejected": -2.8358230590820312, + "logps/chosen": -438.16900634765625, + "logps/rejected": -455.0042419433594, + "loss": 0.5967, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.146238088607788, + "rewards/margins": 0.3505471646785736, + "rewards/rejected": -1.4967854022979736, + "step": 1300 + }, + { + "epoch": 0.3402250719706883, + "eval_logits/chosen": -2.7756471633911133, + "eval_logits/rejected": -2.7494444847106934, + "eval_logps/chosen": -459.6400146484375, + "eval_logps/rejected": -450.31976318359375, + "eval_loss": 0.6130924224853516, + "eval_rewards/accuracies": 0.6654999852180481, + "eval_rewards/chosen": -1.2342103719711304, + "eval_rewards/margins": 0.29793861508369446, + "eval_rewards/rejected": -1.5321489572525024, + "eval_runtime": 305.5502, + "eval_samples_per_second": 6.546, + "eval_steps_per_second": 0.818, + "step": 1300 + }, + { + "epoch": 0.34284218790892435, + "grad_norm": 20.80463981628418, + "learning_rate": 4.155437703643181e-07, + "logits/chosen": -2.84661602973938, + "logits/rejected": -2.8071157932281494, + "logps/chosen": -431.5113830566406, + "logps/rejected": -413.513916015625, + "loss": 0.6008, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1648436784744263, + "rewards/margins": 0.32154667377471924, + "rewards/rejected": -1.486390233039856, + "step": 1310 + }, + { + "epoch": 0.34545930384716045, + "grad_norm": 22.277143478393555, + "learning_rate": 4.138250228029881e-07, + "logits/chosen": -2.8156943321228027, + "logits/rejected": -2.802820920944214, + "logps/chosen": -443.06024169921875, + "logps/rejected": -460.61663818359375, + "loss": 0.6321, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1023852825164795, + "rewards/margins": 0.26114708185195923, + "rewards/rejected": -1.363532304763794, + "step": 1320 + }, + { + "epoch": 0.3480764197853965, + "grad_norm": 21.171588897705078, + "learning_rate": 4.1209259589939935e-07, + "logits/chosen": -2.7955193519592285, + "logits/rejected": -2.804624319076538, + "logps/chosen": -397.19091796875, + "logps/rejected": -402.78857421875, + "loss": 0.6281, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0586354732513428, + "rewards/margins": 0.24395787715911865, + "rewards/rejected": -1.302593469619751, + "step": 1330 + }, + { + "epoch": 0.35069353572363254, + "grad_norm": 32.8704719543457, + "learning_rate": 4.103466343106998e-07, + "logits/chosen": -2.846787929534912, + "logits/rejected": -2.852377414703369, + "logps/chosen": -447.3720703125, + "logps/rejected": -427.18853759765625, + "loss": 0.6461, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0319609642028809, + "rewards/margins": 0.21338698267936707, + "rewards/rejected": -1.2453479766845703, + "step": 1340 + }, + { + "epoch": 0.35331065166186865, + "grad_norm": 20.399002075195312, + "learning_rate": 4.085872838241796e-07, + "logits/chosen": -2.772639036178589, + "logits/rejected": -2.7564244270324707, + "logps/chosen": -427.06103515625, + "logps/rejected": -415.52569580078125, + "loss": 0.6264, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9161670804023743, + "rewards/margins": 0.2716852128505707, + "rewards/rejected": -1.1878522634506226, + "step": 1350 + }, + { + "epoch": 0.3559277676001047, + "grad_norm": 18.165796279907227, + "learning_rate": 4.06814691345098e-07, + "logits/chosen": -2.824145793914795, + "logits/rejected": -2.7942967414855957, + "logps/chosen": -417.0282287597656, + "logps/rejected": -399.99652099609375, + "loss": 0.6027, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8873056173324585, + "rewards/margins": 0.3002934157848358, + "rewards/rejected": -1.1875989437103271, + "step": 1360 + }, + { + "epoch": 0.35854488353834074, + "grad_norm": 17.33743667602539, + "learning_rate": 4.0502900488441707e-07, + "logits/chosen": -2.8374624252319336, + "logits/rejected": -2.851886749267578, + "logps/chosen": -432.22711181640625, + "logps/rejected": -437.4540100097656, + "loss": 0.6232, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8847773671150208, + "rewards/margins": 0.2480648010969162, + "rewards/rejected": -1.1328423023223877, + "step": 1370 + }, + { + "epoch": 0.3611619994765768, + "grad_norm": 25.823244094848633, + "learning_rate": 4.032303735464422e-07, + "logits/chosen": -2.937242269515991, + "logits/rejected": -2.8683810234069824, + "logps/chosen": -434.61297607421875, + "logps/rejected": -418.332275390625, + "loss": 0.5839, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.79572993516922, + "rewards/margins": 0.3832002282142639, + "rewards/rejected": -1.1789300441741943, + "step": 1380 + }, + { + "epoch": 0.3637791154148129, + "grad_norm": 20.350576400756836, + "learning_rate": 4.014189475163726e-07, + "logits/chosen": -2.8276124000549316, + "logits/rejected": -2.8166391849517822, + "logps/chosen": -393.64813232421875, + "logps/rejected": -399.48638916015625, + "loss": 0.5903, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6952024698257446, + "rewards/margins": 0.34315189719200134, + "rewards/rejected": -1.0383542776107788, + "step": 1390 + }, + { + "epoch": 0.36639623135304894, + "grad_norm": 26.685108184814453, + "learning_rate": 3.995948780477605e-07, + "logits/chosen": -2.871060371398926, + "logits/rejected": -2.8492379188537598, + "logps/chosen": -421.88494873046875, + "logps/rejected": -410.37371826171875, + "loss": 0.596, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8149229884147644, + "rewards/margins": 0.3252275884151459, + "rewards/rejected": -1.1401506662368774, + "step": 1400 + }, + { + "epoch": 0.36639623135304894, + "eval_logits/chosen": -2.828916311264038, + "eval_logits/rejected": -2.8083691596984863, + "eval_logps/chosen": -422.09027099609375, + "eval_logps/rejected": -414.0766296386719, + "eval_loss": 0.6064282655715942, + "eval_rewards/accuracies": 0.6819999814033508, + "eval_rewards/chosen": -0.8587133288383484, + "eval_rewards/margins": 0.31100472807884216, + "eval_rewards/rejected": -1.1697180271148682, + "eval_runtime": 305.6064, + "eval_samples_per_second": 6.544, + "eval_steps_per_second": 0.818, + "step": 1400 + }, + { + "epoch": 0.369013347291285, + "grad_norm": 23.994709014892578, + "learning_rate": 3.977583174498816e-07, + "logits/chosen": -2.850383758544922, + "logits/rejected": -2.8548877239227295, + "logps/chosen": -428.41357421875, + "logps/rejected": -422.70208740234375, + "loss": 0.5758, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.8807367086410522, + "rewards/margins": 0.3707871437072754, + "rewards/rejected": -1.2515239715576172, + "step": 1410 + }, + { + "epoch": 0.3716304632295211, + "grad_norm": 20.96696662902832, + "learning_rate": 3.9590941907501717e-07, + "logits/chosen": -2.8528692722320557, + "logits/rejected": -2.826349973678589, + "logps/chosen": -452.9150390625, + "logps/rejected": -441.3421325683594, + "loss": 0.611, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8741127252578735, + "rewards/margins": 0.3615128993988037, + "rewards/rejected": -1.2356255054473877, + "step": 1420 + }, + { + "epoch": 0.37424757916775714, + "grad_norm": 24.699710845947266, + "learning_rate": 3.9404833730564974e-07, + "logits/chosen": -2.749164342880249, + "logits/rejected": -2.7452807426452637, + "logps/chosen": -406.6269226074219, + "logps/rejected": -410.21746826171875, + "loss": 0.6114, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8366050720214844, + "rewards/margins": 0.314082533121109, + "rewards/rejected": -1.150687575340271, + "step": 1430 + }, + { + "epoch": 0.3768646951059932, + "grad_norm": 20.18442726135254, + "learning_rate": 3.9217522754157117e-07, + "logits/chosen": -2.824708938598633, + "logits/rejected": -2.822957992553711, + "logps/chosen": -413.8306579589844, + "logps/rejected": -416.0909118652344, + "loss": 0.5755, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9702291488647461, + "rewards/margins": 0.4000950753688812, + "rewards/rejected": -1.3703243732452393, + "step": 1440 + }, + { + "epoch": 0.37948181104422923, + "grad_norm": 24.324777603149414, + "learning_rate": 3.9029024618690785e-07, + "logits/chosen": -2.8401103019714355, + "logits/rejected": -2.802492380142212, + "logps/chosen": -404.65887451171875, + "logps/rejected": -410.8133239746094, + "loss": 0.5906, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9892793893814087, + "rewards/margins": 0.39633244276046753, + "rewards/rejected": -1.3856117725372314, + "step": 1450 + }, + { + "epoch": 0.38209892698246534, + "grad_norm": 25.477262496948242, + "learning_rate": 3.883935506370605e-07, + "logits/chosen": -2.787506341934204, + "logits/rejected": -2.782578229904175, + "logps/chosen": -414.322998046875, + "logps/rejected": -399.5470886230469, + "loss": 0.6312, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9510722160339355, + "rewards/margins": 0.2990874648094177, + "rewards/rejected": -1.2501596212387085, + "step": 1460 + }, + { + "epoch": 0.3847160429207014, + "grad_norm": 15.537137031555176, + "learning_rate": 3.864852992655616e-07, + "logits/chosen": -2.7832908630371094, + "logits/rejected": -2.7727818489074707, + "logps/chosen": -415.6470642089844, + "logps/rejected": -435.2330627441406, + "loss": 0.551, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9724925756454468, + "rewards/margins": 0.455828994512558, + "rewards/rejected": -1.4283217191696167, + "step": 1470 + }, + { + "epoch": 0.38733315885893743, + "grad_norm": 20.25286865234375, + "learning_rate": 3.845656514108515e-07, + "logits/chosen": -2.8250439167022705, + "logits/rejected": -2.7870283126831055, + "logps/chosen": -451.704345703125, + "logps/rejected": -409.6332092285156, + "loss": 0.6335, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.237602949142456, + "rewards/margins": 0.28984588384628296, + "rewards/rejected": -1.5274488925933838, + "step": 1480 + }, + { + "epoch": 0.38995027479717354, + "grad_norm": 21.001708984375, + "learning_rate": 3.8263476736297375e-07, + "logits/chosen": -2.775477886199951, + "logits/rejected": -2.741433620452881, + "logps/chosen": -431.9996032714844, + "logps/rejected": -435.52178955078125, + "loss": 0.5698, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1075719594955444, + "rewards/margins": 0.43814000487327576, + "rewards/rejected": -1.5457121133804321, + "step": 1490 + }, + { + "epoch": 0.3925673907354096, + "grad_norm": 29.425813674926758, + "learning_rate": 3.8069280835019055e-07, + "logits/chosen": -2.7563180923461914, + "logits/rejected": -2.731633424758911, + "logps/chosen": -448.45123291015625, + "logps/rejected": -439.7250061035156, + "loss": 0.592, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.986266016960144, + "rewards/margins": 0.3872339129447937, + "rewards/rejected": -1.3734999895095825, + "step": 1500 + }, + { + "epoch": 0.3925673907354096, + "eval_logits/chosen": -2.7703075408935547, + "eval_logits/rejected": -2.7455074787139893, + "eval_logps/chosen": -433.11322021484375, + "eval_logps/rejected": -428.9928894042969, + "eval_loss": 0.6027323007583618, + "eval_rewards/accuracies": 0.671500027179718, + "eval_rewards/chosen": -0.9689425230026245, + "eval_rewards/margins": 0.34993812441825867, + "eval_rewards/rejected": -1.318880558013916, + "eval_runtime": 305.4978, + "eval_samples_per_second": 6.547, + "eval_steps_per_second": 0.818, + "step": 1500 + }, + { + "epoch": 0.39518450667364563, + "grad_norm": 20.41739845275879, + "learning_rate": 3.7873993652552073e-07, + "logits/chosen": -2.7873950004577637, + "logits/rejected": -2.767686128616333, + "logps/chosen": -394.41082763671875, + "logps/rejected": -397.44976806640625, + "loss": 0.654, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9665681719779968, + "rewards/margins": 0.24985328316688538, + "rewards/rejected": -1.2164217233657837, + "step": 1510 + }, + { + "epoch": 0.39780162261188173, + "grad_norm": 17.706754684448242, + "learning_rate": 3.767763149531995e-07, + "logits/chosen": -2.817774772644043, + "logits/rejected": -2.7983908653259277, + "logps/chosen": -410.2703552246094, + "logps/rejected": -415.4830017089844, + "loss": 0.578, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7426007986068726, + "rewards/margins": 0.37523287534713745, + "rewards/rejected": -1.1178338527679443, + "step": 1520 + }, + { + "epoch": 0.4004187385501178, + "grad_norm": 19.806367874145508, + "learning_rate": 3.7480210759506326e-07, + "logits/chosen": -2.7944726943969727, + "logits/rejected": -2.7969970703125, + "logps/chosen": -424.00128173828125, + "logps/rejected": -411.03619384765625, + "loss": 0.6494, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7284184098243713, + "rewards/margins": 0.221491739153862, + "rewards/rejected": -0.9499101638793945, + "step": 1530 + }, + { + "epoch": 0.40303585448835383, + "grad_norm": 26.830059051513672, + "learning_rate": 3.728174792968582e-07, + "logits/chosen": -2.749136209487915, + "logits/rejected": -2.7285478115081787, + "logps/chosen": -378.54522705078125, + "logps/rejected": -378.9236145019531, + "loss": 0.6119, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7706597447395325, + "rewards/margins": 0.29169803857803345, + "rewards/rejected": -1.062357783317566, + "step": 1540 + }, + { + "epoch": 0.4056529704265899, + "grad_norm": 19.88861656188965, + "learning_rate": 3.70822595774476e-07, + "logits/chosen": -2.802050828933716, + "logits/rejected": -2.8034234046936035, + "logps/chosen": -424.07757568359375, + "logps/rejected": -420.0818786621094, + "loss": 0.5911, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7196288704872131, + "rewards/margins": 0.37008053064346313, + "rewards/rejected": -1.0897094011306763, + "step": 1550 + }, + { + "epoch": 0.408270086364826, + "grad_norm": 21.920900344848633, + "learning_rate": 3.688176236001168e-07, + "logits/chosen": -2.808371067047119, + "logits/rejected": -2.7770209312438965, + "logps/chosen": -437.070556640625, + "logps/rejected": -407.98260498046875, + "loss": 0.599, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.73606276512146, + "rewards/margins": 0.3621772229671478, + "rewards/rejected": -1.0982400178909302, + "step": 1560 + }, + { + "epoch": 0.410887202303062, + "grad_norm": 21.512451171875, + "learning_rate": 3.6680273018838016e-07, + "logits/chosen": -2.7860348224639893, + "logits/rejected": -2.765151262283325, + "logps/chosen": -402.9967346191406, + "logps/rejected": -403.93328857421875, + "loss": 0.5804, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7530256509780884, + "rewards/margins": 0.39612632989883423, + "rewards/rejected": -1.1491520404815674, + "step": 1570 + }, + { + "epoch": 0.4135043182412981, + "grad_norm": 27.112638473510742, + "learning_rate": 3.6477808378228596e-07, + "logits/chosen": -2.7512621879577637, + "logits/rejected": -2.7771029472351074, + "logps/chosen": -403.02099609375, + "logps/rejected": -453.424072265625, + "loss": 0.5933, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7836870551109314, + "rewards/margins": 0.3679850399494171, + "rewards/rejected": -1.151672124862671, + "step": 1580 + }, + { + "epoch": 0.4161214341795342, + "grad_norm": 25.062524795532227, + "learning_rate": 3.6274385343922674e-07, + "logits/chosen": -2.832534074783325, + "logits/rejected": -2.849515438079834, + "logps/chosen": -390.73565673828125, + "logps/rejected": -424.64654541015625, + "loss": 0.5968, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8553959131240845, + "rewards/margins": 0.34458768367767334, + "rewards/rejected": -1.1999835968017578, + "step": 1590 + }, + { + "epoch": 0.4187385501177702, + "grad_norm": 21.186635971069336, + "learning_rate": 3.6070020901685057e-07, + "logits/chosen": -2.724576234817505, + "logits/rejected": -2.726635694503784, + "logps/chosen": -425.3138122558594, + "logps/rejected": -408.3916015625, + "loss": 0.6353, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8639251589775085, + "rewards/margins": 0.26024505496025085, + "rewards/rejected": -1.1241703033447266, + "step": 1600 + }, + { + "epoch": 0.4187385501177702, + "eval_logits/chosen": -2.7245428562164307, + "eval_logits/rejected": -2.6972126960754395, + "eval_logps/chosen": -432.62255859375, + "eval_logps/rejected": -429.33135986328125, + "eval_loss": 0.6051159501075745, + "eval_rewards/accuracies": 0.6744999885559082, + "eval_rewards/chosen": -0.9640358090400696, + "eval_rewards/margins": 0.3582296371459961, + "eval_rewards/rejected": -1.322265386581421, + "eval_runtime": 305.6135, + "eval_samples_per_second": 6.544, + "eval_steps_per_second": 0.818, + "step": 1600 + }, + { + "epoch": 0.4213556660560063, + "grad_norm": 19.41357421875, + "learning_rate": 3.5864732115887863e-07, + "logits/chosen": -2.790837049484253, + "logits/rejected": -2.7912559509277344, + "logps/chosen": -404.7127685546875, + "logps/rejected": -439.0284729003906, + "loss": 0.5745, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9212363958358765, + "rewards/margins": 0.4179501533508301, + "rewards/rejected": -1.339186668395996, + "step": 1610 + }, + { + "epoch": 0.4239727819942423, + "grad_norm": 31.039003372192383, + "learning_rate": 3.565853612808562e-07, + "logits/chosen": -2.813098669052124, + "logits/rejected": -2.756390333175659, + "logps/chosen": -455.2381286621094, + "logps/rejected": -444.98101806640625, + "loss": 0.622, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1717721223831177, + "rewards/margins": 0.3537500500679016, + "rewards/rejected": -1.525522232055664, + "step": 1620 + }, + { + "epoch": 0.4265898979324784, + "grad_norm": 20.93377685546875, + "learning_rate": 3.5451450155583984e-07, + "logits/chosen": -2.663109302520752, + "logits/rejected": -2.7095718383789062, + "logps/chosen": -419.74652099609375, + "logps/rejected": -429.96502685546875, + "loss": 0.6041, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2498713731765747, + "rewards/margins": 0.4084538519382477, + "rewards/rejected": -1.6583251953125, + "step": 1630 + }, + { + "epoch": 0.42920701387071447, + "grad_norm": 21.383033752441406, + "learning_rate": 3.5243491490002055e-07, + "logits/chosen": -2.721489191055298, + "logits/rejected": -2.705233097076416, + "logps/chosen": -456.5779724121094, + "logps/rejected": -452.54620361328125, + "loss": 0.6828, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2452127933502197, + "rewards/margins": 0.2219144105911255, + "rewards/rejected": -1.4671272039413452, + "step": 1640 + }, + { + "epoch": 0.4318241298089505, + "grad_norm": 20.577220916748047, + "learning_rate": 3.503467749582857e-07, + "logits/chosen": -2.7935385704040527, + "logits/rejected": -2.749878168106079, + "logps/chosen": -415.6224060058594, + "logps/rejected": -389.2660827636719, + "loss": 0.6546, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.9616597890853882, + "rewards/margins": 0.2633201479911804, + "rewards/rejected": -1.2249799966812134, + "step": 1650 + }, + { + "epoch": 0.4344412457471866, + "grad_norm": 27.85550308227539, + "learning_rate": 3.482502560897194e-07, + "logits/chosen": -2.722708225250244, + "logits/rejected": -2.7652525901794434, + "logps/chosen": -375.1950378417969, + "logps/rejected": -402.11474609375, + "loss": 0.6213, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.940973162651062, + "rewards/margins": 0.28709056973457336, + "rewards/rejected": -1.2280638217926025, + "step": 1660 + }, + { + "epoch": 0.43705836168542267, + "grad_norm": 19.033451080322266, + "learning_rate": 3.4614553335304403e-07, + "logits/chosen": -2.797665596008301, + "logits/rejected": -2.7312228679656982, + "logps/chosen": -454.24371337890625, + "logps/rejected": -424.7903747558594, + "loss": 0.6157, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0169007778167725, + "rewards/margins": 0.3303568661212921, + "rewards/rejected": -1.3472576141357422, + "step": 1670 + }, + { + "epoch": 0.4396754776236587, + "grad_norm": 19.925254821777344, + "learning_rate": 3.440327824920022e-07, + "logits/chosen": -2.8008456230163574, + "logits/rejected": -2.744900941848755, + "logps/chosen": -449.263427734375, + "logps/rejected": -425.96429443359375, + "loss": 0.5824, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8703651428222656, + "rewards/margins": 0.4040173590183258, + "rewards/rejected": -1.2743823528289795, + "step": 1680 + }, + { + "epoch": 0.44229259356189476, + "grad_norm": 19.3896541595459, + "learning_rate": 3.4191217992068287e-07, + "logits/chosen": -2.8239524364471436, + "logits/rejected": -2.7781131267547607, + "logps/chosen": -445.6703186035156, + "logps/rejected": -414.863525390625, + "loss": 0.6067, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9370183944702148, + "rewards/margins": 0.33230060338974, + "rewards/rejected": -1.26931893825531, + "step": 1690 + }, + { + "epoch": 0.44490970950013087, + "grad_norm": 25.677919387817383, + "learning_rate": 3.3978390270879056e-07, + "logits/chosen": -2.7083306312561035, + "logits/rejected": -2.722886562347412, + "logps/chosen": -384.851318359375, + "logps/rejected": -393.93927001953125, + "loss": 0.6603, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1110846996307373, + "rewards/margins": 0.19145555794239044, + "rewards/rejected": -1.3025401830673218, + "step": 1700 + }, + { + "epoch": 0.44490970950013087, + "eval_logits/chosen": -2.7304868698120117, + "eval_logits/rejected": -2.702120780944824, + "eval_logps/chosen": -435.1521301269531, + "eval_logps/rejected": -429.3145446777344, + "eval_loss": 0.6016219854354858, + "eval_rewards/accuracies": 0.6765000224113464, + "eval_rewards/chosen": -0.9893313050270081, + "eval_rewards/margins": 0.3327656388282776, + "eval_rewards/rejected": -1.3220969438552856, + "eval_runtime": 305.5368, + "eval_samples_per_second": 6.546, + "eval_steps_per_second": 0.818, + "step": 1700 + }, + { + "epoch": 0.4475268254383669, + "grad_norm": 22.6643123626709, + "learning_rate": 3.376481285668599e-07, + "logits/chosen": -2.784320831298828, + "logits/rejected": -2.812058925628662, + "logps/chosen": -382.7949523925781, + "logps/rejected": -412.650390625, + "loss": 0.632, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9195866584777832, + "rewards/margins": 0.27537328004837036, + "rewards/rejected": -1.1949598789215088, + "step": 1710 + }, + { + "epoch": 0.45014394137660296, + "grad_norm": 25.10662269592285, + "learning_rate": 3.355050358314172e-07, + "logits/chosen": -2.8318912982940674, + "logits/rejected": -2.8115198612213135, + "logps/chosen": -418.42559814453125, + "logps/rejected": -420.23126220703125, + "loss": 0.5826, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7352002859115601, + "rewards/margins": 0.3559706211090088, + "rewards/rejected": -1.0911709070205688, + "step": 1720 + }, + { + "epoch": 0.45276105731483907, + "grad_norm": 24.134742736816406, + "learning_rate": 3.33354803450089e-07, + "logits/chosen": -2.738598585128784, + "logits/rejected": -2.707373857498169, + "logps/chosen": -400.3019714355469, + "logps/rejected": -398.3058166503906, + "loss": 0.609, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.751221776008606, + "rewards/margins": 0.33383291959762573, + "rewards/rejected": -1.085054636001587, + "step": 1730 + }, + { + "epoch": 0.4553781732530751, + "grad_norm": 19.136178970336914, + "learning_rate": 3.311976109666605e-07, + "logits/chosen": -2.7323246002197266, + "logits/rejected": -2.700840473175049, + "logps/chosen": -419.31512451171875, + "logps/rejected": -397.88238525390625, + "loss": 0.6054, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7072068452835083, + "rewards/margins": 0.31471288204193115, + "rewards/rejected": -1.0219197273254395, + "step": 1740 + }, + { + "epoch": 0.45799528919131116, + "grad_norm": 20.46944236755371, + "learning_rate": 3.2903363850608317e-07, + "logits/chosen": -2.7930941581726074, + "logits/rejected": -2.73994517326355, + "logps/chosen": -430.36114501953125, + "logps/rejected": -423.3164978027344, + "loss": 0.5988, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0622642040252686, + "rewards/margins": 0.35906052589416504, + "rewards/rejected": -1.421324610710144, + "step": 1750 + }, + { + "epoch": 0.46061240512954726, + "grad_norm": 20.967994689941406, + "learning_rate": 3.2686306675943477e-07, + "logits/chosen": -2.7039730548858643, + "logits/rejected": -2.709929943084717, + "logps/chosen": -429.5506286621094, + "logps/rejected": -427.92413330078125, + "loss": 0.6031, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0673789978027344, + "rewards/margins": 0.34702402353286743, + "rewards/rejected": -1.414402961730957, + "step": 1760 + }, + { + "epoch": 0.4632295210677833, + "grad_norm": 20.740760803222656, + "learning_rate": 3.2468607696883145e-07, + "logits/chosen": -2.708698034286499, + "logits/rejected": -2.7119338512420654, + "logps/chosen": -437.08807373046875, + "logps/rejected": -471.3455505371094, + "loss": 0.5756, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1100339889526367, + "rewards/margins": 0.45587754249572754, + "rewards/rejected": -1.5659115314483643, + "step": 1770 + }, + { + "epoch": 0.46584663700601936, + "grad_norm": 22.729509353637695, + "learning_rate": 3.2250285091229435e-07, + "logits/chosen": -2.7388596534729004, + "logits/rejected": -2.694728374481201, + "logps/chosen": -416.31463623046875, + "logps/rejected": -420.25653076171875, + "loss": 0.6293, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.0822761058807373, + "rewards/margins": 0.31783348321914673, + "rewards/rejected": -1.4001096487045288, + "step": 1780 + }, + { + "epoch": 0.4684637529442554, + "grad_norm": 22.45555305480957, + "learning_rate": 3.2031357088857083e-07, + "logits/chosen": -2.7370693683624268, + "logits/rejected": -2.708481550216675, + "logps/chosen": -462.5494689941406, + "logps/rejected": -482.8470153808594, + "loss": 0.6173, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1477806568145752, + "rewards/margins": 0.3766568601131439, + "rewards/rejected": -1.524437665939331, + "step": 1790 + }, + { + "epoch": 0.4710808688824915, + "grad_norm": 26.537885665893555, + "learning_rate": 3.1811841970191267e-07, + "logits/chosen": -2.5913147926330566, + "logits/rejected": -2.5993223190307617, + "logps/chosen": -404.4351501464844, + "logps/rejected": -467.8827209472656, + "loss": 0.5551, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0686676502227783, + "rewards/margins": 0.5164337158203125, + "rewards/rejected": -1.5851013660430908, + "step": 1800 + }, + { + "epoch": 0.4710808688824915, + "eval_logits/chosen": -2.6491506099700928, + "eval_logits/rejected": -2.615879774093628, + "eval_logps/chosen": -436.5640869140625, + "eval_logps/rejected": -434.75897216796875, + "eval_loss": 0.6023004055023193, + "eval_rewards/accuracies": 0.6790000200271606, + "eval_rewards/chosen": -1.0034514665603638, + "eval_rewards/margins": 0.3730900287628174, + "eval_rewards/rejected": -1.3765413761138916, + "eval_runtime": 305.4204, + "eval_samples_per_second": 6.548, + "eval_steps_per_second": 0.819, + "step": 1800 + }, + { + "epoch": 0.47369798482072756, + "grad_norm": 17.421649932861328, + "learning_rate": 3.1591758064681257e-07, + "logits/chosen": -2.6142051219940186, + "logits/rejected": -2.555877447128296, + "logps/chosen": -421.052490234375, + "logps/rejected": -406.779541015625, + "loss": 0.5928, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9783207774162292, + "rewards/margins": 0.4080726206302643, + "rewards/rejected": -1.3863933086395264, + "step": 1810 + }, + { + "epoch": 0.4763151007589636, + "grad_norm": 24.198755264282227, + "learning_rate": 3.13711237492698e-07, + "logits/chosen": -2.685159921646118, + "logits/rejected": -2.6774396896362305, + "logps/chosen": -460.9088439941406, + "logps/rejected": -460.2481994628906, + "loss": 0.6401, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9882858395576477, + "rewards/margins": 0.3006265163421631, + "rewards/rejected": -1.2889124155044556, + "step": 1820 + }, + { + "epoch": 0.4789322166971997, + "grad_norm": 22.546266555786133, + "learning_rate": 3.1149957446858767e-07, + "logits/chosen": -2.7318637371063232, + "logits/rejected": -2.746227264404297, + "logps/chosen": -389.7530212402344, + "logps/rejected": -395.2696533203125, + "loss": 0.6069, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7182685136795044, + "rewards/margins": 0.309965580701828, + "rewards/rejected": -1.0282341241836548, + "step": 1830 + }, + { + "epoch": 0.48154933263543576, + "grad_norm": 18.50275993347168, + "learning_rate": 3.0928277624770736e-07, + "logits/chosen": -2.7960715293884277, + "logits/rejected": -2.758817195892334, + "logps/chosen": -428.894775390625, + "logps/rejected": -424.63494873046875, + "loss": 0.5771, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6762595176696777, + "rewards/margins": 0.4380512237548828, + "rewards/rejected": -1.1143107414245605, + "step": 1840 + }, + { + "epoch": 0.4841664485736718, + "grad_norm": 16.639318466186523, + "learning_rate": 3.0706102793207073e-07, + "logits/chosen": -2.7766544818878174, + "logits/rejected": -2.7374072074890137, + "logps/chosen": -429.73052978515625, + "logps/rejected": -426.40850830078125, + "loss": 0.5717, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.729512095451355, + "rewards/margins": 0.40905341506004333, + "rewards/rejected": -1.1385654211044312, + "step": 1850 + }, + { + "epoch": 0.48678356451190785, + "grad_norm": 18.389541625976562, + "learning_rate": 3.048345150370226e-07, + "logits/chosen": -2.7712199687957764, + "logits/rejected": -2.7354605197906494, + "logps/chosen": -465.0520935058594, + "logps/rejected": -462.63519287109375, + "loss": 0.5966, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9968741536140442, + "rewards/margins": 0.40299710631370544, + "rewards/rejected": -1.3998713493347168, + "step": 1860 + }, + { + "epoch": 0.48940068045014395, + "grad_norm": 21.121103286743164, + "learning_rate": 3.0260342347574913e-07, + "logits/chosen": -2.700634717941284, + "logits/rejected": -2.6599361896514893, + "logps/chosen": -451.46331787109375, + "logps/rejected": -454.69427490234375, + "loss": 0.5382, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.978916347026825, + "rewards/margins": 0.47690868377685547, + "rewards/rejected": -1.4558249711990356, + "step": 1870 + }, + { + "epoch": 0.49201779638838, + "grad_norm": 24.27654266357422, + "learning_rate": 3.0036793954375357e-07, + "logits/chosen": -2.7394368648529053, + "logits/rejected": -2.6990807056427, + "logps/chosen": -442.2903747558594, + "logps/rejected": -426.3751525878906, + "loss": 0.5808, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9537175893783569, + "rewards/margins": 0.4698936939239502, + "rewards/rejected": -1.4236112833023071, + "step": 1880 + }, + { + "epoch": 0.49463491232661605, + "grad_norm": 24.052539825439453, + "learning_rate": 2.9812824990330085e-07, + "logits/chosen": -2.7231922149658203, + "logits/rejected": -2.696277379989624, + "logps/chosen": -431.68585205078125, + "logps/rejected": -429.3919982910156, + "loss": 0.6315, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8916054964065552, + "rewards/margins": 0.33528465032577515, + "rewards/rejected": -1.2268900871276855, + "step": 1890 + }, + { + "epoch": 0.49725202826485215, + "grad_norm": 19.382165908813477, + "learning_rate": 2.958845415678316e-07, + "logits/chosen": -2.703700542449951, + "logits/rejected": -2.666050672531128, + "logps/chosen": -440.453125, + "logps/rejected": -444.79034423828125, + "loss": 0.5877, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8149045705795288, + "rewards/margins": 0.42137575149536133, + "rewards/rejected": -1.2362802028656006, + "step": 1900 + }, + { + "epoch": 0.49725202826485215, + "eval_logits/chosen": -2.694143056869507, + "eval_logits/rejected": -2.6620967388153076, + "eval_logps/chosen": -417.5872497558594, + "eval_logps/rejected": -415.6308288574219, + "eval_loss": 0.5975241661071777, + "eval_rewards/accuracies": 0.6834999918937683, + "eval_rewards/chosen": -0.8136825561523438, + "eval_rewards/margins": 0.37157776951789856, + "eval_rewards/rejected": -1.1852604150772095, + "eval_runtime": 305.6021, + "eval_samples_per_second": 6.544, + "eval_steps_per_second": 0.818, + "step": 1900 + }, + { + "epoch": 0.4998691442030882, + "grad_norm": 17.323326110839844, + "learning_rate": 2.936370018863459e-07, + "logits/chosen": -2.7364118099212646, + "logits/rejected": -2.7139904499053955, + "logps/chosen": -417.9366760253906, + "logps/rejected": -403.66143798828125, + "loss": 0.5874, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8763308525085449, + "rewards/margins": 0.36916738748550415, + "rewards/rejected": -1.2454981803894043, + "step": 1910 + }, + { + "epoch": 0.5024862601413242, + "grad_norm": 19.605926513671875, + "learning_rate": 2.913858185277605e-07, + "logits/chosen": -2.7196900844573975, + "logits/rejected": -2.6851718425750732, + "logps/chosen": -425.770263671875, + "logps/rejected": -433.83673095703125, + "loss": 0.5813, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8689553141593933, + "rewards/margins": 0.44626492261886597, + "rewards/rejected": -1.3152204751968384, + "step": 1920 + }, + { + "epoch": 0.5051033760795604, + "grad_norm": 23.041301727294922, + "learning_rate": 2.89131179465238e-07, + "logits/chosen": -2.7089123725891113, + "logits/rejected": -2.6577157974243164, + "logps/chosen": -421.91558837890625, + "logps/rejected": -414.3662109375, + "loss": 0.55, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8832708597183228, + "rewards/margins": 0.5182112455368042, + "rewards/rejected": -1.4014819860458374, + "step": 1930 + }, + { + "epoch": 0.5077204920177963, + "grad_norm": 22.370925903320312, + "learning_rate": 2.8687327296049125e-07, + "logits/chosen": -2.6943976879119873, + "logits/rejected": -2.670966625213623, + "logps/chosen": -417.71807861328125, + "logps/rejected": -440.22589111328125, + "loss": 0.5719, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8600457906723022, + "rewards/margins": 0.47562676668167114, + "rewards/rejected": -1.3356726169586182, + "step": 1940 + }, + { + "epoch": 0.5103376079560324, + "grad_norm": 18.212282180786133, + "learning_rate": 2.846122875480637e-07, + "logits/chosen": -2.696530818939209, + "logits/rejected": -2.638589382171631, + "logps/chosen": -437.90167236328125, + "logps/rejected": -430.6405334472656, + "loss": 0.5766, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8469659686088562, + "rewards/margins": 0.425870418548584, + "rewards/rejected": -1.272836446762085, + "step": 1950 + }, + { + "epoch": 0.5129547238942685, + "grad_norm": 21.467227935791016, + "learning_rate": 2.8234841201958647e-07, + "logits/chosen": -2.7403194904327393, + "logits/rejected": -2.6844117641448975, + "logps/chosen": -450.5631408691406, + "logps/rejected": -435.8984375, + "loss": 0.5596, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8618534207344055, + "rewards/margins": 0.47362977266311646, + "rewards/rejected": -1.335483193397522, + "step": 1960 + }, + { + "epoch": 0.5155718398325045, + "grad_norm": 28.714305877685547, + "learning_rate": 2.800818354080148e-07, + "logits/chosen": -2.6684775352478027, + "logits/rejected": -2.6224045753479004, + "logps/chosen": -444.83251953125, + "logps/rejected": -412.0975036621094, + "loss": 0.619, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9538711309432983, + "rewards/margins": 0.33846694231033325, + "rewards/rejected": -1.2923381328582764, + "step": 1970 + }, + { + "epoch": 0.5181889557707406, + "grad_norm": 24.992568969726562, + "learning_rate": 2.778127469718435e-07, + "logits/chosen": -2.6164069175720215, + "logits/rejected": -2.6504337787628174, + "logps/chosen": -390.96527099609375, + "logps/rejected": -431.69854736328125, + "loss": 0.6144, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9115864038467407, + "rewards/margins": 0.32063305377960205, + "rewards/rejected": -1.2322193384170532, + "step": 1980 + }, + { + "epoch": 0.5208060717089767, + "grad_norm": 19.11081886291504, + "learning_rate": 2.755413361793039e-07, + "logits/chosen": -2.6722495555877686, + "logits/rejected": -2.6163339614868164, + "logps/chosen": -403.4658203125, + "logps/rejected": -406.2340393066406, + "loss": 0.5674, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.817135214805603, + "rewards/margins": 0.4171879291534424, + "rewards/rejected": -1.2343231439590454, + "step": 1990 + }, + { + "epoch": 0.5234231876472127, + "grad_norm": 24.406513214111328, + "learning_rate": 2.7326779269254356e-07, + "logits/chosen": -2.7332730293273926, + "logits/rejected": -2.6852307319641113, + "logps/chosen": -446.953857421875, + "logps/rejected": -410.27093505859375, + "loss": 0.5827, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7922626733779907, + "rewards/margins": 0.43351325392723083, + "rewards/rejected": -1.225775957107544, + "step": 2000 + }, + { + "epoch": 0.5234231876472127, + "eval_logits/chosen": -2.6396472454071045, + "eval_logits/rejected": -2.604276418685913, + "eval_logps/chosen": -423.45745849609375, + "eval_logps/rejected": -422.7220764160156, + "eval_loss": 0.5934838652610779, + "eval_rewards/accuracies": 0.6809999942779541, + "eval_rewards/chosen": -0.8723848462104797, + "eval_rewards/margins": 0.383787602186203, + "eval_rewards/rejected": -1.2561724185943604, + "eval_runtime": 305.4532, + "eval_samples_per_second": 6.548, + "eval_steps_per_second": 0.818, + "step": 2000 + }, + { + "epoch": 0.5260403035854488, + "grad_norm": 21.45716094970703, + "learning_rate": 2.709923063517895e-07, + "logits/chosen": -2.673267126083374, + "logits/rejected": -2.667255401611328, + "logps/chosen": -410.90777587890625, + "logps/rejected": -433.248046875, + "loss": 0.5703, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8045026659965515, + "rewards/margins": 0.4072667062282562, + "rewards/rejected": -1.211769461631775, + "step": 2010 + }, + { + "epoch": 0.528657419523685, + "grad_norm": 23.03792953491211, + "learning_rate": 2.68715067159496e-07, + "logits/chosen": -2.7115061283111572, + "logits/rejected": -2.6757900714874268, + "logps/chosen": -402.1265563964844, + "logps/rejected": -395.89752197265625, + "loss": 0.5894, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7784110903739929, + "rewards/margins": 0.35269591212272644, + "rewards/rejected": -1.131106972694397, + "step": 2020 + }, + { + "epoch": 0.5312745354619209, + "grad_norm": 22.643049240112305, + "learning_rate": 2.664362652644806e-07, + "logits/chosen": -2.7354533672332764, + "logits/rejected": -2.698335647583008, + "logps/chosen": -465.36376953125, + "logps/rejected": -446.8394470214844, + "loss": 0.5532, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.8965269923210144, + "rewards/margins": 0.5018103718757629, + "rewards/rejected": -1.3983373641967773, + "step": 2030 + }, + { + "epoch": 0.533891651400157, + "grad_norm": 18.090749740600586, + "learning_rate": 2.6415609094604555e-07, + "logits/chosen": -2.6659553050994873, + "logits/rejected": -2.690216541290283, + "logps/chosen": -451.0194396972656, + "logps/rejected": -454.2710876464844, + "loss": 0.589, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.010286569595337, + "rewards/margins": 0.4080902636051178, + "rewards/rejected": -1.4183766841888428, + "step": 2040 + }, + { + "epoch": 0.5365087673383931, + "grad_norm": 19.967069625854492, + "learning_rate": 2.618747345980904e-07, + "logits/chosen": -2.6821203231811523, + "logits/rejected": -2.6286585330963135, + "logps/chosen": -429.80517578125, + "logps/rejected": -390.1981506347656, + "loss": 0.6111, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1355177164077759, + "rewards/margins": 0.328762948513031, + "rewards/rejected": -1.4642808437347412, + "step": 2050 + }, + { + "epoch": 0.5391258832766291, + "grad_norm": 20.209062576293945, + "learning_rate": 2.595923867132136e-07, + "logits/chosen": -2.7044026851654053, + "logits/rejected": -2.679009437561035, + "logps/chosen": -469.7115173339844, + "logps/rejected": -470.56231689453125, + "loss": 0.5892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1646978855133057, + "rewards/margins": 0.41930079460144043, + "rewards/rejected": -1.5839985609054565, + "step": 2060 + }, + { + "epoch": 0.5417429992148652, + "grad_norm": 22.267589569091797, + "learning_rate": 2.5730923786680667e-07, + "logits/chosen": -2.638622522354126, + "logits/rejected": -2.679886817932129, + "logps/chosen": -434.09967041015625, + "logps/rejected": -474.56787109375, + "loss": 0.6065, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1707215309143066, + "rewards/margins": 0.36304971575737, + "rewards/rejected": -1.533771276473999, + "step": 2070 + }, + { + "epoch": 0.5443601151531012, + "grad_norm": 27.767457962036133, + "learning_rate": 2.5502547870114135e-07, + "logits/chosen": -2.677556037902832, + "logits/rejected": -2.6269454956054688, + "logps/chosen": -443.53631591796875, + "logps/rejected": -426.9546813964844, + "loss": 0.6383, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1648094654083252, + "rewards/margins": 0.3214976489543915, + "rewards/rejected": -1.486307144165039, + "step": 2080 + }, + { + "epoch": 0.5469772310913373, + "grad_norm": 22.076040267944336, + "learning_rate": 2.527412999094506e-07, + "logits/chosen": -2.646812677383423, + "logits/rejected": -2.620919704437256, + "logps/chosen": -479.4862365722656, + "logps/rejected": -492.4109802246094, + "loss": 0.5783, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0661259889602661, + "rewards/margins": 0.4469536244869232, + "rewards/rejected": -1.5130794048309326, + "step": 2090 + }, + { + "epoch": 0.5495943470295734, + "grad_norm": 27.053951263427734, + "learning_rate": 2.5045689222000636e-07, + "logits/chosen": -2.636404275894165, + "logits/rejected": -2.619544506072998, + "logps/chosen": -407.72406005859375, + "logps/rejected": -409.077392578125, + "loss": 0.6017, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.00985586643219, + "rewards/margins": 0.3562536835670471, + "rewards/rejected": -1.3661094903945923, + "step": 2100 + }, + { + "epoch": 0.5495943470295734, + "eval_logits/chosen": -2.643636703491211, + "eval_logits/rejected": -2.6104650497436523, + "eval_logps/chosen": -436.8658447265625, + "eval_logps/rejected": -436.8172302246094, + "eval_loss": 0.5910605192184448, + "eval_rewards/accuracies": 0.690500020980835, + "eval_rewards/chosen": -1.006468415260315, + "eval_rewards/margins": 0.3906554877758026, + "eval_rewards/rejected": -1.3971240520477295, + "eval_runtime": 305.5499, + "eval_samples_per_second": 6.546, + "eval_steps_per_second": 0.818, + "step": 2100 + }, + { + "epoch": 0.5522114629678094, + "grad_norm": 19.81754493713379, + "learning_rate": 2.481724463801933e-07, + "logits/chosen": -2.6853280067443848, + "logits/rejected": -2.6354801654815674, + "logps/chosen": -442.880615234375, + "logps/rejected": -425.76275634765625, + "loss": 0.594, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9968615770339966, + "rewards/margins": 0.4066940248012543, + "rewards/rejected": -1.4035555124282837, + "step": 2110 + }, + { + "epoch": 0.5548285789060455, + "grad_norm": 21.1542911529541, + "learning_rate": 2.4588815314058154e-07, + "logits/chosen": -2.678277015686035, + "logits/rejected": -2.6839067935943604, + "logps/chosen": -404.617431640625, + "logps/rejected": -398.31866455078125, + "loss": 0.5719, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9024378061294556, + "rewards/margins": 0.42626914381980896, + "rewards/rejected": -1.328706979751587, + "step": 2120 + }, + { + "epoch": 0.5574456948442816, + "grad_norm": 23.462890625, + "learning_rate": 2.4360420323899917e-07, + "logits/chosen": -2.6694867610931396, + "logits/rejected": -2.6611833572387695, + "logps/chosen": -429.3199768066406, + "logps/rejected": -420.8585510253906, + "loss": 0.5921, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.780637264251709, + "rewards/margins": 0.40070563554763794, + "rewards/rejected": -1.1813428401947021, + "step": 2130 + }, + { + "epoch": 0.5600628107825176, + "grad_norm": 18.006694793701172, + "learning_rate": 2.4132078738460583e-07, + "logits/chosen": -2.7235093116760254, + "logits/rejected": -2.6758511066436768, + "logps/chosen": -414.63250732421875, + "logps/rejected": -385.173828125, + "loss": 0.6092, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7920923233032227, + "rewards/margins": 0.32466885447502136, + "rewards/rejected": -1.1167610883712769, + "step": 2140 + }, + { + "epoch": 0.5626799267207537, + "grad_norm": 28.11566162109375, + "learning_rate": 2.390380962419682e-07, + "logits/chosen": -2.672776460647583, + "logits/rejected": -2.6661226749420166, + "logps/chosen": -382.5970764160156, + "logps/rejected": -362.99176025390625, + "loss": 0.6076, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7663475275039673, + "rewards/margins": 0.33483588695526123, + "rewards/rejected": -1.101183295249939, + "step": 2150 + }, + { + "epoch": 0.5652970426589898, + "grad_norm": 21.116987228393555, + "learning_rate": 2.3675632041513977e-07, + "logits/chosen": -2.7305362224578857, + "logits/rejected": -2.656404972076416, + "logps/chosen": -436.4358825683594, + "logps/rejected": -386.28509521484375, + "loss": 0.5598, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7153185606002808, + "rewards/margins": 0.4554404318332672, + "rewards/rejected": -1.1707589626312256, + "step": 2160 + }, + { + "epoch": 0.5679141585972258, + "grad_norm": 25.329484939575195, + "learning_rate": 2.344756504317453e-07, + "logits/chosen": -2.698883533477783, + "logits/rejected": -2.624823808670044, + "logps/chosen": -420.55389404296875, + "logps/rejected": -399.3967590332031, + "loss": 0.5962, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9281074404716492, + "rewards/margins": 0.3701552450656891, + "rewards/rejected": -1.2982627153396606, + "step": 2170 + }, + { + "epoch": 0.5705312745354619, + "grad_norm": 27.779399871826172, + "learning_rate": 2.3219627672707237e-07, + "logits/chosen": -2.671786069869995, + "logits/rejected": -2.638526201248169, + "logps/chosen": -414.17974853515625, + "logps/rejected": -390.75604248046875, + "loss": 0.5965, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9639150500297546, + "rewards/margins": 0.3531908392906189, + "rewards/rejected": -1.3171058893203735, + "step": 2180 + }, + { + "epoch": 0.573148390473698, + "grad_norm": 19.64858055114746, + "learning_rate": 2.2991838962816918e-07, + "logits/chosen": -2.6252007484436035, + "logits/rejected": -2.5690817832946777, + "logps/chosen": -419.4808654785156, + "logps/rejected": -446.12841796875, + "loss": 0.596, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9539289474487305, + "rewards/margins": 0.398750901222229, + "rewards/rejected": -1.3526798486709595, + "step": 2190 + }, + { + "epoch": 0.575765506411934, + "grad_norm": 24.863256454467773, + "learning_rate": 2.2764217933795297e-07, + "logits/chosen": -2.6451430320739746, + "logits/rejected": -2.6056628227233887, + "logps/chosen": -420.39849853515625, + "logps/rejected": -424.5686950683594, + "loss": 0.5539, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.854151725769043, + "rewards/margins": 0.47093433141708374, + "rewards/rejected": -1.325085997581482, + "step": 2200 + }, + { + "epoch": 0.575765506411934, + "eval_logits/chosen": -2.6075503826141357, + "eval_logits/rejected": -2.572392463684082, + "eval_logps/chosen": -426.81951904296875, + "eval_logps/rejected": -426.5499267578125, + "eval_loss": 0.5919502377510071, + "eval_rewards/accuracies": 0.6884999871253967, + "eval_rewards/chosen": -0.9060052037239075, + "eval_rewards/margins": 0.38844582438468933, + "eval_rewards/rejected": -1.294451117515564, + "eval_runtime": 305.4694, + "eval_samples_per_second": 6.547, + "eval_steps_per_second": 0.818, + "step": 2200 + }, + { + "epoch": 0.5783826223501701, + "grad_norm": 21.576183319091797, + "learning_rate": 2.253678359193278e-07, + "logits/chosen": -2.707562208175659, + "logits/rejected": -2.6342368125915527, + "logps/chosen": -451.54180908203125, + "logps/rejected": -455.8236389160156, + "loss": 0.5991, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9837636947631836, + "rewards/margins": 0.3827388882637024, + "rewards/rejected": -1.3665026426315308, + "step": 2210 + }, + { + "epoch": 0.5809997382884062, + "grad_norm": 20.301193237304688, + "learning_rate": 2.230955492793149e-07, + "logits/chosen": -2.5531551837921143, + "logits/rejected": -2.5302395820617676, + "logps/chosen": -448.05279541015625, + "logps/rejected": -460.99267578125, + "loss": 0.6251, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0065380334854126, + "rewards/margins": 0.3645462393760681, + "rewards/rejected": -1.371084451675415, + "step": 2220 + }, + { + "epoch": 0.5836168542266422, + "grad_norm": 22.607479095458984, + "learning_rate": 2.2082550915319468e-07, + "logits/chosen": -2.5578253269195557, + "logits/rejected": -2.5488719940185547, + "logps/chosen": -447.6844787597656, + "logps/rejected": -438.8826599121094, + "loss": 0.5936, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9693658947944641, + "rewards/margins": 0.4336482882499695, + "rewards/rejected": -1.403014063835144, + "step": 2230 + }, + { + "epoch": 0.5862339701648783, + "grad_norm": 20.127931594848633, + "learning_rate": 2.1855790508866433e-07, + "logits/chosen": -2.6090965270996094, + "logits/rejected": -2.596123218536377, + "logps/chosen": -463.4794006347656, + "logps/rejected": -465.2300720214844, + "loss": 0.6107, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9245331883430481, + "rewards/margins": 0.33225584030151367, + "rewards/rejected": -1.256788969039917, + "step": 2240 + }, + { + "epoch": 0.5888510861031143, + "grad_norm": 15.41345500946045, + "learning_rate": 2.162929264300107e-07, + "logits/chosen": -2.6332390308380127, + "logits/rejected": -2.6083428859710693, + "logps/chosen": -422.32220458984375, + "logps/rejected": -420.503662109375, + "loss": 0.5636, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.8126707077026367, + "rewards/margins": 0.4325372576713562, + "rewards/rejected": -1.2452080249786377, + "step": 2250 + }, + { + "epoch": 0.5914682020413504, + "grad_norm": 22.760326385498047, + "learning_rate": 2.1403076230230005e-07, + "logits/chosen": -2.6489734649658203, + "logits/rejected": -2.622544288635254, + "logps/chosen": -430.8290100097656, + "logps/rejected": -420.72613525390625, + "loss": 0.626, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.867121696472168, + "rewards/margins": 0.30833083391189575, + "rewards/rejected": -1.1754525899887085, + "step": 2260 + }, + { + "epoch": 0.5940853179795865, + "grad_norm": 26.066482543945312, + "learning_rate": 2.1177160159558596e-07, + "logits/chosen": -2.634918212890625, + "logits/rejected": -2.56962251663208, + "logps/chosen": -447.6597595214844, + "logps/rejected": -419.47430419921875, + "loss": 0.5887, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9099095463752747, + "rewards/margins": 0.41119471192359924, + "rewards/rejected": -1.3211042881011963, + "step": 2270 + }, + { + "epoch": 0.5967024339178225, + "grad_norm": 26.43048858642578, + "learning_rate": 2.0951563294913734e-07, + "logits/chosen": -2.65494704246521, + "logits/rejected": -2.5803942680358887, + "logps/chosen": -423.8819274902344, + "logps/rejected": -414.9833068847656, + "loss": 0.5454, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8593618273735046, + "rewards/margins": 0.4740574359893799, + "rewards/rejected": -1.3334193229675293, + "step": 2280 + }, + { + "epoch": 0.5993195498560586, + "grad_norm": 24.643842697143555, + "learning_rate": 2.072630447356869e-07, + "logits/chosen": -2.598431348800659, + "logits/rejected": -2.5783185958862305, + "logps/chosen": -422.7080078125, + "logps/rejected": -415.7620544433594, + "loss": 0.5828, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9891357421875, + "rewards/margins": 0.3770177960395813, + "rewards/rejected": -1.3661534786224365, + "step": 2290 + }, + { + "epoch": 0.6019366657942947, + "grad_norm": 24.30388832092285, + "learning_rate": 2.0501402504570232e-07, + "logits/chosen": -2.6597847938537598, + "logits/rejected": -2.590627908706665, + "logps/chosen": -457.49249267578125, + "logps/rejected": -459.54620361328125, + "loss": 0.5795, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0667564868927002, + "rewards/margins": 0.49383625388145447, + "rewards/rejected": -1.5605928897857666, + "step": 2300 + }, + { + "epoch": 0.6019366657942947, + "eval_logits/chosen": -2.5756797790527344, + "eval_logits/rejected": -2.5398993492126465, + "eval_logps/chosen": -447.8605041503906, + "eval_logps/rejected": -451.0841064453125, + "eval_loss": 0.5913601517677307, + "eval_rewards/accuracies": 0.6865000128746033, + "eval_rewards/chosen": -1.11641526222229, + "eval_rewards/margins": 0.42337724566459656, + "eval_rewards/rejected": -1.5397926568984985, + "eval_runtime": 305.5859, + "eval_samples_per_second": 6.545, + "eval_steps_per_second": 0.818, + "step": 2300 + }, + { + "epoch": 0.6045537817325307, + "grad_norm": 23.54559898376465, + "learning_rate": 2.027687616716804e-07, + "logits/chosen": -2.54463529586792, + "logits/rejected": -2.5222010612487793, + "logps/chosen": -398.11944580078125, + "logps/rejected": -391.347412109375, + "loss": 0.5937, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1046133041381836, + "rewards/margins": 0.42002907395362854, + "rewards/rejected": -1.5246422290802002, + "step": 2310 + }, + { + "epoch": 0.6071708976707668, + "grad_norm": 27.417768478393555, + "learning_rate": 2.005274420924668e-07, + "logits/chosen": -2.6474318504333496, + "logits/rejected": -2.5949997901916504, + "logps/chosen": -436.841064453125, + "logps/rejected": -420.890380859375, + "loss": 0.6072, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.06412672996521, + "rewards/margins": 0.38113874197006226, + "rewards/rejected": -1.445265293121338, + "step": 2320 + }, + { + "epoch": 0.6097880136090029, + "grad_norm": 24.830379486083984, + "learning_rate": 1.9829025345760121e-07, + "logits/chosen": -2.622990369796753, + "logits/rejected": -2.6124253273010254, + "logps/chosen": -460.695556640625, + "logps/rejected": -477.2328186035156, + "loss": 0.6122, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.05497407913208, + "rewards/margins": 0.3757147490978241, + "rewards/rejected": -1.430688738822937, + "step": 2330 + }, + { + "epoch": 0.6124051295472389, + "grad_norm": 25.735050201416016, + "learning_rate": 1.960573825716911e-07, + "logits/chosen": -2.5889840126037598, + "logits/rejected": -2.547499656677246, + "logps/chosen": -403.8297424316406, + "logps/rejected": -414.8653259277344, + "loss": 0.5988, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0478129386901855, + "rewards/margins": 0.3687422275543213, + "rewards/rejected": -1.4165551662445068, + "step": 2340 + }, + { + "epoch": 0.615022245485475, + "grad_norm": 26.266759872436523, + "learning_rate": 1.9382901587881273e-07, + "logits/chosen": -2.63970685005188, + "logits/rejected": -2.6058359146118164, + "logps/chosen": -429.8356018066406, + "logps/rejected": -418.068115234375, + "loss": 0.5684, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.977154552936554, + "rewards/margins": 0.4486463665962219, + "rewards/rejected": -1.4258009195327759, + "step": 2350 + }, + { + "epoch": 0.6176393614237111, + "grad_norm": 21.620264053344727, + "learning_rate": 1.9160533944694364e-07, + "logits/chosen": -2.6155648231506348, + "logits/rejected": -2.558945894241333, + "logps/chosen": -429.2547912597656, + "logps/rejected": -438.4996032714844, + "loss": 0.5524, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8521722555160522, + "rewards/margins": 0.5101855397224426, + "rewards/rejected": -1.3623578548431396, + "step": 2360 + }, + { + "epoch": 0.6202564773619471, + "grad_norm": 20.288921356201172, + "learning_rate": 1.8938653895242602e-07, + "logits/chosen": -2.601743221282959, + "logits/rejected": -2.5524344444274902, + "logps/chosen": -433.3799743652344, + "logps/rejected": -439.91143798828125, + "loss": 0.5521, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9650028944015503, + "rewards/margins": 0.5604956150054932, + "rewards/rejected": -1.525498628616333, + "step": 2370 + }, + { + "epoch": 0.6228735933001832, + "grad_norm": 28.30954933166504, + "learning_rate": 1.8717279966446264e-07, + "logits/chosen": -2.4995055198669434, + "logits/rejected": -2.4924209117889404, + "logps/chosen": -416.691650390625, + "logps/rejected": -430.76373291015625, + "loss": 0.6314, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0411570072174072, + "rewards/margins": 0.34819602966308594, + "rewards/rejected": -1.3893530368804932, + "step": 2380 + }, + { + "epoch": 0.6254907092384192, + "grad_norm": 25.23048210144043, + "learning_rate": 1.8496430642964694e-07, + "logits/chosen": -2.5988945960998535, + "logits/rejected": -2.5367140769958496, + "logps/chosen": -437.4556579589844, + "logps/rejected": -441.42694091796875, + "loss": 0.5833, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9659013748168945, + "rewards/margins": 0.42699605226516724, + "rewards/rejected": -1.392897367477417, + "step": 2390 + }, + { + "epoch": 0.6281078251766553, + "grad_norm": 34.421661376953125, + "learning_rate": 1.8276124365652855e-07, + "logits/chosen": -2.5923240184783936, + "logits/rejected": -2.53601336479187, + "logps/chosen": -428.5076599121094, + "logps/rejected": -437.510498046875, + "loss": 0.5657, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9861478805541992, + "rewards/margins": 0.4693332314491272, + "rewards/rejected": -1.4554810523986816, + "step": 2400 + }, + { + "epoch": 0.6281078251766553, + "eval_logits/chosen": -2.5486767292022705, + "eval_logits/rejected": -2.5120887756347656, + "eval_logps/chosen": -439.6860656738281, + "eval_logps/rejected": -442.0413513183594, + "eval_loss": 0.590362012386322, + "eval_rewards/accuracies": 0.6859999895095825, + "eval_rewards/chosen": -1.0346707105636597, + "eval_rewards/margins": 0.41469448804855347, + "eval_rewards/rejected": -1.449365258216858, + "eval_runtime": 305.5078, + "eval_samples_per_second": 6.546, + "eval_steps_per_second": 0.818, + "step": 2400 + }, + { + "epoch": 0.6307249411148914, + "grad_norm": 23.83245086669922, + "learning_rate": 1.805637953002149e-07, + "logits/chosen": -2.6228480339050293, + "logits/rejected": -2.5939719676971436, + "logps/chosen": -411.1646423339844, + "logps/rejected": -414.8374938964844, + "loss": 0.5865, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0131409168243408, + "rewards/margins": 0.4185038208961487, + "rewards/rejected": -1.4316446781158447, + "step": 2410 + }, + { + "epoch": 0.6333420570531274, + "grad_norm": 28.81045150756836, + "learning_rate": 1.7837214484701153e-07, + "logits/chosen": -2.615571975708008, + "logits/rejected": -2.6004979610443115, + "logps/chosen": -421.10235595703125, + "logps/rejected": -409.6927185058594, + "loss": 0.5985, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9161368608474731, + "rewards/margins": 0.39567703008651733, + "rewards/rejected": -1.3118139505386353, + "step": 2420 + }, + { + "epoch": 0.6359591729913635, + "grad_norm": 26.689922332763672, + "learning_rate": 1.761864752991004e-07, + "logits/chosen": -2.626842975616455, + "logits/rejected": -2.571432590484619, + "logps/chosen": -425.1255798339844, + "logps/rejected": -433.8043518066406, + "loss": 0.587, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.967668890953064, + "rewards/margins": 0.38932132720947266, + "rewards/rejected": -1.3569902181625366, + "step": 2430 + }, + { + "epoch": 0.6385762889295996, + "grad_norm": 24.693151473999023, + "learning_rate": 1.7400696915925995e-07, + "logits/chosen": -2.6202292442321777, + "logits/rejected": -2.5344481468200684, + "logps/chosen": -441.83404541015625, + "logps/rejected": -409.7845764160156, + "loss": 0.5849, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9791529774665833, + "rewards/margins": 0.497478187084198, + "rewards/rejected": -1.4766310453414917, + "step": 2440 + }, + { + "epoch": 0.6411934048678356, + "grad_norm": 29.49846076965332, + "learning_rate": 1.718338084156254e-07, + "logits/chosen": -2.4964375495910645, + "logits/rejected": -2.4695093631744385, + "logps/chosen": -455.32952880859375, + "logps/rejected": -444.91778564453125, + "loss": 0.547, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9237340688705444, + "rewards/margins": 0.5019347071647644, + "rewards/rejected": -1.425668716430664, + "step": 2450 + }, + { + "epoch": 0.6438105208060717, + "grad_norm": 21.89922332763672, + "learning_rate": 1.696671745264937e-07, + "logits/chosen": -2.616865396499634, + "logits/rejected": -2.570014715194702, + "logps/chosen": -446.42449951171875, + "logps/rejected": -421.83251953125, + "loss": 0.5447, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.9398025274276733, + "rewards/margins": 0.527073085308075, + "rewards/rejected": -1.466875672340393, + "step": 2460 + }, + { + "epoch": 0.6464276367443078, + "grad_norm": 31.102645874023438, + "learning_rate": 1.67507248405171e-07, + "logits/chosen": -2.581944227218628, + "logits/rejected": -2.5663084983825684, + "logps/chosen": -431.9105529785156, + "logps/rejected": -461.98876953125, + "loss": 0.5943, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9992059469223022, + "rewards/margins": 0.4130525588989258, + "rewards/rejected": -1.412258505821228, + "step": 2470 + }, + { + "epoch": 0.6490447526825438, + "grad_norm": 32.388267517089844, + "learning_rate": 1.6535421040486683e-07, + "logits/chosen": -2.458714485168457, + "logits/rejected": -2.4205844402313232, + "logps/chosen": -415.0240173339844, + "logps/rejected": -413.59429931640625, + "loss": 0.5649, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9899314045906067, + "rewards/margins": 0.49838346242904663, + "rewards/rejected": -1.4883147478103638, + "step": 2480 + }, + { + "epoch": 0.6516618686207799, + "grad_norm": 21.825029373168945, + "learning_rate": 1.6320824030363456e-07, + "logits/chosen": -2.5214855670928955, + "logits/rejected": -2.5148208141326904, + "logps/chosen": -409.963134765625, + "logps/rejected": -417.6705017089844, + "loss": 0.5916, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0183316469192505, + "rewards/margins": 0.4151875376701355, + "rewards/rejected": -1.4335191249847412, + "step": 2490 + }, + { + "epoch": 0.654278984559016, + "grad_norm": 27.251855850219727, + "learning_rate": 1.6106951728936024e-07, + "logits/chosen": -2.5894880294799805, + "logits/rejected": -2.5418648719787598, + "logps/chosen": -421.0638732910156, + "logps/rejected": -455.240966796875, + "loss": 0.5306, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.922796905040741, + "rewards/margins": 0.5818823575973511, + "rewards/rejected": -1.5046792030334473, + "step": 2500 + }, + { + "epoch": 0.654278984559016, + "eval_logits/chosen": -2.510161876678467, + "eval_logits/rejected": -2.469223976135254, + "eval_logps/chosen": -440.859130859375, + "eval_logps/rejected": -445.5005187988281, + "eval_loss": 0.5917896628379822, + "eval_rewards/accuracies": 0.6825000047683716, + "eval_rewards/chosen": -1.0464012622833252, + "eval_rewards/margins": 0.43755561113357544, + "eval_rewards/rejected": -1.4839569330215454, + "eval_runtime": 305.4345, + "eval_samples_per_second": 6.548, + "eval_steps_per_second": 0.819, + "step": 2500 + }, + { + "epoch": 0.656896100497252, + "grad_norm": 28.463520050048828, + "learning_rate": 1.5893821994479994e-07, + "logits/chosen": -2.5858142375946045, + "logits/rejected": -2.5548527240753174, + "logps/chosen": -465.49346923828125, + "logps/rejected": -450.70831298828125, + "loss": 0.5918, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0464518070220947, + "rewards/margins": 0.4308743476867676, + "rewards/rejected": -1.4773260354995728, + "step": 2510 + }, + { + "epoch": 0.6595132164354881, + "grad_norm": 29.278364181518555, + "learning_rate": 1.5681452623266867e-07, + "logits/chosen": -2.5009493827819824, + "logits/rejected": -2.4307141304016113, + "logps/chosen": -475.93963623046875, + "logps/rejected": -456.0147399902344, + "loss": 0.5432, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.12712824344635, + "rewards/margins": 0.5893834829330444, + "rewards/rejected": -1.7165117263793945, + "step": 2520 + }, + { + "epoch": 0.6621303323737242, + "grad_norm": 31.27610206604004, + "learning_rate": 1.546986134807801e-07, + "logits/chosen": -2.5525741577148438, + "logits/rejected": -2.480517864227295, + "logps/chosen": -429.0491638183594, + "logps/rejected": -453.71417236328125, + "loss": 0.5516, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1591657400131226, + "rewards/margins": 0.501166582107544, + "rewards/rejected": -1.6603323221206665, + "step": 2530 + }, + { + "epoch": 0.6647474483119602, + "grad_norm": 18.123111724853516, + "learning_rate": 1.5259065836724034e-07, + "logits/chosen": -2.4638514518737793, + "logits/rejected": -2.4330999851226807, + "logps/chosen": -429.5963439941406, + "logps/rejected": -456.29827880859375, + "loss": 0.5854, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1803653240203857, + "rewards/margins": 0.48663124442100525, + "rewards/rejected": -1.6669965982437134, + "step": 2540 + }, + { + "epoch": 0.6673645642501963, + "grad_norm": 36.21045684814453, + "learning_rate": 1.5049083690569454e-07, + "logits/chosen": -2.469520330429077, + "logits/rejected": -2.4402925968170166, + "logps/chosen": -426.5743713378906, + "logps/rejected": -462.9029235839844, + "loss": 0.5813, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2028522491455078, + "rewards/margins": 0.5434964299201965, + "rewards/rejected": -1.7463487386703491, + "step": 2550 + }, + { + "epoch": 0.6699816801884323, + "grad_norm": 30.517663955688477, + "learning_rate": 1.4839932443063056e-07, + "logits/chosen": -2.4739162921905518, + "logits/rejected": -2.4111859798431396, + "logps/chosen": -488.611572265625, + "logps/rejected": -463.06329345703125, + "loss": 0.5618, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2092149257659912, + "rewards/margins": 0.5156753659248352, + "rewards/rejected": -1.7248903512954712, + "step": 2560 + }, + { + "epoch": 0.6725987961266684, + "grad_norm": 35.15083312988281, + "learning_rate": 1.46316295582738e-07, + "logits/chosen": -2.486800193786621, + "logits/rejected": -2.4465322494506836, + "logps/chosen": -431.0811462402344, + "logps/rejected": -436.2640075683594, + "loss": 0.6517, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2838318347930908, + "rewards/margins": 0.31887996196746826, + "rewards/rejected": -1.6027119159698486, + "step": 2570 + }, + { + "epoch": 0.6752159120649045, + "grad_norm": 27.834619522094727, + "learning_rate": 1.4424192429432655e-07, + "logits/chosen": -2.5374560356140137, + "logits/rejected": -2.4828081130981445, + "logps/chosen": -433.96722412109375, + "logps/rejected": -470.0076599121094, + "loss": 0.5697, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0254337787628174, + "rewards/margins": 0.4902397692203522, + "rewards/rejected": -1.5156733989715576, + "step": 2580 + }, + { + "epoch": 0.6778330280031405, + "grad_norm": 35.93947219848633, + "learning_rate": 1.4217638377480158e-07, + "logits/chosen": -2.4948618412017822, + "logits/rejected": -2.4619128704071045, + "logps/chosen": -418.11517333984375, + "logps/rejected": -444.2643127441406, + "loss": 0.5733, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0571857690811157, + "rewards/margins": 0.48725780844688416, + "rewards/rejected": -1.5444434881210327, + "step": 2590 + }, + { + "epoch": 0.6804501439413766, + "grad_norm": 26.80233383178711, + "learning_rate": 1.401198464962021e-07, + "logits/chosen": -2.5296876430511475, + "logits/rejected": -2.4376063346862793, + "logps/chosen": -448.37811279296875, + "logps/rejected": -441.19036865234375, + "loss": 0.5762, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0862284898757935, + "rewards/margins": 0.4645492434501648, + "rewards/rejected": -1.550777792930603, + "step": 2600 + }, + { + "epoch": 0.6804501439413766, + "eval_logits/chosen": -2.473548412322998, + "eval_logits/rejected": -2.429135799407959, + "eval_logps/chosen": -443.086181640625, + "eval_logps/rejected": -448.5192565917969, + "eval_loss": 0.5926596522331238, + "eval_rewards/accuracies": 0.6779999732971191, + "eval_rewards/chosen": -1.068671703338623, + "eval_rewards/margins": 0.4454721510410309, + "eval_rewards/rejected": -1.5141440629959106, + "eval_runtime": 305.3934, + "eval_samples_per_second": 6.549, + "eval_steps_per_second": 0.819, + "step": 2600 + }, + { + "epoch": 0.6830672598796127, + "grad_norm": 26.471881866455078, + "learning_rate": 1.3807248417879894e-07, + "logits/chosen": -2.563028335571289, + "logits/rejected": -2.5249762535095215, + "logps/chosen": -456.01922607421875, + "logps/rejected": -460.52679443359375, + "loss": 0.5634, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0320606231689453, + "rewards/margins": 0.5281775593757629, + "rewards/rejected": -1.5602381229400635, + "step": 2610 + }, + { + "epoch": 0.6856843758178487, + "grad_norm": 41.65851974487305, + "learning_rate": 1.3603446777675665e-07, + "logits/chosen": -2.4479596614837646, + "logits/rejected": -2.421311855316162, + "logps/chosen": -440.4335021972656, + "logps/rejected": -441.00592041015625, + "loss": 0.6009, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.09227454662323, + "rewards/margins": 0.42088931798934937, + "rewards/rejected": -1.5131638050079346, + "step": 2620 + }, + { + "epoch": 0.6883014917560848, + "grad_norm": 27.369688034057617, + "learning_rate": 1.3400596746385814e-07, + "logits/chosen": -2.5403740406036377, + "logits/rejected": -2.475461483001709, + "logps/chosen": -450.69293212890625, + "logps/rejected": -447.13330078125, + "loss": 0.594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0313596725463867, + "rewards/margins": 0.4859469532966614, + "rewards/rejected": -1.5173065662384033, + "step": 2630 + }, + { + "epoch": 0.6909186076943209, + "grad_norm": 35.06747817993164, + "learning_rate": 1.3198715261929586e-07, + "logits/chosen": -2.5465633869171143, + "logits/rejected": -2.5011157989501953, + "logps/chosen": -413.10919189453125, + "logps/rejected": -429.83404541015625, + "loss": 0.5716, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0741420984268188, + "rewards/margins": 0.4673934578895569, + "rewards/rejected": -1.541535496711731, + "step": 2640 + }, + { + "epoch": 0.6935357236325569, + "grad_norm": 31.834104537963867, + "learning_rate": 1.299781918135282e-07, + "logits/chosen": -2.534392833709717, + "logits/rejected": -2.473947525024414, + "logps/chosen": -478.8719787597656, + "logps/rejected": -493.5030822753906, + "loss": 0.5345, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9548946619033813, + "rewards/margins": 0.6267498731613159, + "rewards/rejected": -1.5816442966461182, + "step": 2650 + }, + { + "epoch": 0.696152839570793, + "grad_norm": 35.67092514038086, + "learning_rate": 1.279792527942045e-07, + "logits/chosen": -2.5275771617889404, + "logits/rejected": -2.448570728302002, + "logps/chosen": -452.7754821777344, + "logps/rejected": -476.6175842285156, + "loss": 0.5481, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0713622570037842, + "rewards/margins": 0.5786231756210327, + "rewards/rejected": -1.6499855518341064, + "step": 2660 + }, + { + "epoch": 0.6987699555090291, + "grad_norm": 30.155054092407227, + "learning_rate": 1.259905024721576e-07, + "logits/chosen": -2.477794647216797, + "logits/rejected": -2.4506518840789795, + "logps/chosen": -427.8697814941406, + "logps/rejected": -444.8448791503906, + "loss": 0.5246, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0339616537094116, + "rewards/margins": 0.6002413034439087, + "rewards/rejected": -1.6342031955718994, + "step": 2670 + }, + { + "epoch": 0.7013870714472651, + "grad_norm": 28.638723373413086, + "learning_rate": 1.2401210690746703e-07, + "logits/chosen": -2.480868101119995, + "logits/rejected": -2.4245822429656982, + "logps/chosen": -444.09613037109375, + "logps/rejected": -430.0035095214844, + "loss": 0.5973, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9939875602722168, + "rewards/margins": 0.3904542922973633, + "rewards/rejected": -1.38444185256958, + "step": 2680 + }, + { + "epoch": 0.7040041873855012, + "grad_norm": 33.430381774902344, + "learning_rate": 1.2204423129559305e-07, + "logits/chosen": -2.5521812438964844, + "logits/rejected": -2.558464765548706, + "logps/chosen": -435.12530517578125, + "logps/rejected": -474.42022705078125, + "loss": 0.5872, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.00673508644104, + "rewards/margins": 0.5006878972053528, + "rewards/rejected": -1.5074230432510376, + "step": 2690 + }, + { + "epoch": 0.7066213033237373, + "grad_norm": 30.671358108520508, + "learning_rate": 1.2008703995358299e-07, + "logits/chosen": -2.5638155937194824, + "logits/rejected": -2.5175604820251465, + "logps/chosen": -432.1807556152344, + "logps/rejected": -433.16790771484375, + "loss": 0.6016, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9962761998176575, + "rewards/margins": 0.4422995448112488, + "rewards/rejected": -1.4385757446289062, + "step": 2700 + }, + { + "epoch": 0.7066213033237373, + "eval_logits/chosen": -2.474745273590088, + "eval_logits/rejected": -2.4329495429992676, + "eval_logps/chosen": -443.8889465332031, + "eval_logps/rejected": -447.90631103515625, + "eval_loss": 0.5935620069503784, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -1.0766992568969727, + "eval_rewards/margins": 0.43131566047668457, + "eval_rewards/rejected": -1.5080151557922363, + "eval_runtime": 305.4592, + "eval_samples_per_second": 6.548, + "eval_steps_per_second": 0.818, + "step": 2700 + }, + { + "epoch": 0.7092384192619733, + "grad_norm": 21.97296905517578, + "learning_rate": 1.1814069630635068e-07, + "logits/chosen": -2.4879977703094482, + "logits/rejected": -2.474510431289673, + "logps/chosen": -442.66912841796875, + "logps/rejected": -471.7681579589844, + "loss": 0.5934, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.020108699798584, + "rewards/margins": 0.45263218879699707, + "rewards/rejected": -1.472740888595581, + "step": 2710 + }, + { + "epoch": 0.7118555352002094, + "grad_norm": 23.306360244750977, + "learning_rate": 1.1620536287303051e-07, + "logits/chosen": -2.53651762008667, + "logits/rejected": -2.4935238361358643, + "logps/chosen": -478.90570068359375, + "logps/rejected": -468.0852966308594, + "loss": 0.6237, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0745232105255127, + "rewards/margins": 0.37477684020996094, + "rewards/rejected": -1.4493000507354736, + "step": 2720 + }, + { + "epoch": 0.7144726511384454, + "grad_norm": 20.9937744140625, + "learning_rate": 1.1428120125340716e-07, + "logits/chosen": -2.548900604248047, + "logits/rejected": -2.488852024078369, + "logps/chosen": -430.0403747558594, + "logps/rejected": -420.0286560058594, + "loss": 0.5506, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0618057250976562, + "rewards/margins": 0.5464845895767212, + "rewards/rejected": -1.608290433883667, + "step": 2730 + }, + { + "epoch": 0.7170897670766815, + "grad_norm": 26.31341552734375, + "learning_rate": 1.123683721144223e-07, + "logits/chosen": -2.536130666732788, + "logits/rejected": -2.496840715408325, + "logps/chosen": -471.9339294433594, + "logps/rejected": -469.2123107910156, + "loss": 0.5805, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.082291841506958, + "rewards/margins": 0.47875848412513733, + "rewards/rejected": -1.561050295829773, + "step": 2740 + }, + { + "epoch": 0.7197068830149176, + "grad_norm": 18.80337142944336, + "learning_rate": 1.1046703517675845e-07, + "logits/chosen": -2.521416425704956, + "logits/rejected": -2.5193381309509277, + "logps/chosen": -423.4697265625, + "logps/rejected": -469.30743408203125, + "loss": 0.5889, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.000197172164917, + "rewards/margins": 0.44857126474380493, + "rewards/rejected": -1.4487683773040771, + "step": 2750 + }, + { + "epoch": 0.7223239989531536, + "grad_norm": 24.486021041870117, + "learning_rate": 1.085773492015028e-07, + "logits/chosen": -2.509730815887451, + "logits/rejected": -2.452089786529541, + "logps/chosen": -424.95355224609375, + "logps/rejected": -419.2748107910156, + "loss": 0.5638, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0358409881591797, + "rewards/margins": 0.4873877167701721, + "rewards/rejected": -1.5232288837432861, + "step": 2760 + }, + { + "epoch": 0.7249411148913897, + "grad_norm": 30.51848030090332, + "learning_rate": 1.0669947197689033e-07, + "logits/chosen": -2.5046708583831787, + "logits/rejected": -2.434422731399536, + "logps/chosen": -449.0489196777344, + "logps/rejected": -453.250732421875, + "loss": 0.558, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.060209035873413, + "rewards/margins": 0.5281985402107239, + "rewards/rejected": -1.5884075164794922, + "step": 2770 + }, + { + "epoch": 0.7275582308296258, + "grad_norm": 28.53468132019043, + "learning_rate": 1.048335603051291e-07, + "logits/chosen": -2.508882761001587, + "logits/rejected": -2.462200403213501, + "logps/chosen": -468.8936462402344, + "logps/rejected": -488.55438232421875, + "loss": 0.5138, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.091536521911621, + "rewards/margins": 0.6539732217788696, + "rewards/rejected": -1.7455097436904907, + "step": 2780 + }, + { + "epoch": 0.7301753467678618, + "grad_norm": 27.401844024658203, + "learning_rate": 1.0297976998930663e-07, + "logits/chosen": -2.515787363052368, + "logits/rejected": -2.470237970352173, + "logps/chosen": -447.158935546875, + "logps/rejected": -455.6554260253906, + "loss": 0.5452, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0867551565170288, + "rewards/margins": 0.6032642126083374, + "rewards/rejected": -1.6900192499160767, + "step": 2790 + }, + { + "epoch": 0.7327924627060979, + "grad_norm": 32.95134353637695, + "learning_rate": 1.0113825582038077e-07, + "logits/chosen": -2.5129332542419434, + "logits/rejected": -2.466841220855713, + "logps/chosen": -468.6444396972656, + "logps/rejected": -477.91632080078125, + "loss": 0.6068, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3479645252227783, + "rewards/margins": 0.4015529155731201, + "rewards/rejected": -1.7495174407958984, + "step": 2800 + }, + { + "epoch": 0.7327924627060979, + "eval_logits/chosen": -2.4707725048065186, + "eval_logits/rejected": -2.4294190406799316, + "eval_logps/chosen": -455.2722473144531, + "eval_logps/rejected": -461.43121337890625, + "eval_loss": 0.5897455811500549, + "eval_rewards/accuracies": 0.6819999814033508, + "eval_rewards/chosen": -1.1905323266983032, + "eval_rewards/margins": 0.4527316391468048, + "eval_rewards/rejected": -1.6432641744613647, + "eval_runtime": 305.6038, + "eval_samples_per_second": 6.544, + "eval_steps_per_second": 0.818, + "step": 2800 + }, + { + "epoch": 0.735409578644334, + "grad_norm": 22.0037784576416, + "learning_rate": 9.930917156425475e-08, + "logits/chosen": -2.528698682785034, + "logits/rejected": -2.490994930267334, + "logps/chosen": -452.98828125, + "logps/rejected": -478.26873779296875, + "loss": 0.5954, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2057301998138428, + "rewards/margins": 0.4493107795715332, + "rewards/rejected": -1.655040979385376, + "step": 2810 + }, + { + "epoch": 0.73802669458257, + "grad_norm": 23.0594539642334, + "learning_rate": 9.749266994893754e-08, + "logits/chosen": -2.498945713043213, + "logits/rejected": -2.4089159965515137, + "logps/chosen": -427.6974182128906, + "logps/rejected": -433.5654296875, + "loss": 0.6381, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1993119716644287, + "rewards/margins": 0.3153248429298401, + "rewards/rejected": -1.514636754989624, + "step": 2820 + }, + { + "epoch": 0.7406438105208061, + "grad_norm": 31.801725387573242, + "learning_rate": 9.568890265179128e-08, + "logits/chosen": -2.500518798828125, + "logits/rejected": -2.4740939140319824, + "logps/chosen": -455.19140625, + "logps/rejected": -443.7815856933594, + "loss": 0.615, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1849435567855835, + "rewards/margins": 0.42013731598854065, + "rewards/rejected": -1.6050809621810913, + "step": 2830 + }, + { + "epoch": 0.7432609264590422, + "grad_norm": 24.575345993041992, + "learning_rate": 9.389802028686616e-08, + "logits/chosen": -2.5450186729431152, + "logits/rejected": -2.514194965362549, + "logps/chosen": -447.0918884277344, + "logps/rejected": -432.509521484375, + "loss": 0.6262, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1515171527862549, + "rewards/margins": 0.3097633123397827, + "rewards/rejected": -1.461280345916748, + "step": 2840 + }, + { + "epoch": 0.7458780423972782, + "grad_norm": 37.35166931152344, + "learning_rate": 9.212017239232426e-08, + "logits/chosen": -2.5267508029937744, + "logits/rejected": -2.511732816696167, + "logps/chosen": -450.9908752441406, + "logps/rejected": -463.26214599609375, + "loss": 0.5356, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.0432413816452026, + "rewards/margins": 0.5652891397476196, + "rewards/rejected": -1.6085306406021118, + "step": 2850 + }, + { + "epoch": 0.7484951583355143, + "grad_norm": 30.909400939941406, + "learning_rate": 9.035550741795328e-08, + "logits/chosen": -2.5176095962524414, + "logits/rejected": -2.512455940246582, + "logps/chosen": -430.74078369140625, + "logps/rejected": -466.84979248046875, + "loss": 0.5908, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0033584833145142, + "rewards/margins": 0.47517308592796326, + "rewards/rejected": -1.4785315990447998, + "step": 2860 + }, + { + "epoch": 0.7511122742737504, + "grad_norm": 24.312652587890625, + "learning_rate": 8.860417271277065e-08, + "logits/chosen": -2.5723533630371094, + "logits/rejected": -2.5612332820892334, + "logps/chosen": -448.51092529296875, + "logps/rejected": -463.96258544921875, + "loss": 0.595, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.025071620941162, + "rewards/margins": 0.38559845089912415, + "rewards/rejected": -1.4106700420379639, + "step": 2870 + }, + { + "epoch": 0.7537293902119864, + "grad_norm": 30.318265914916992, + "learning_rate": 8.686631451272029e-08, + "logits/chosen": -2.547738552093506, + "logits/rejected": -2.5238020420074463, + "logps/chosen": -440.672607421875, + "logps/rejected": -445.39593505859375, + "loss": 0.5913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2060959339141846, + "rewards/margins": 0.419172465801239, + "rewards/rejected": -1.625268578529358, + "step": 2880 + }, + { + "epoch": 0.7563465061502225, + "grad_norm": 33.638999938964844, + "learning_rate": 8.514207792846168e-08, + "logits/chosen": -2.562272787094116, + "logits/rejected": -2.533822536468506, + "logps/chosen": -436.4820251464844, + "logps/rejected": -438.751708984375, + "loss": 0.5828, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1529896259307861, + "rewards/margins": 0.4592018127441406, + "rewards/rejected": -1.6121914386749268, + "step": 2890 + }, + { + "epoch": 0.7589636220884585, + "grad_norm": 22.52386474609375, + "learning_rate": 8.343160693325355e-08, + "logits/chosen": -2.5034773349761963, + "logits/rejected": -2.4751226902008057, + "logps/chosen": -441.08843994140625, + "logps/rejected": -471.1349182128906, + "loss": 0.5821, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1367791891098022, + "rewards/margins": 0.4893072247505188, + "rewards/rejected": -1.6260864734649658, + "step": 2900 + }, + { + "epoch": 0.7589636220884585, + "eval_logits/chosen": -2.4862163066864014, + "eval_logits/rejected": -2.4469528198242188, + "eval_logps/chosen": -448.6697082519531, + "eval_logps/rejected": -453.08331298828125, + "eval_loss": 0.5870286822319031, + "eval_rewards/accuracies": 0.684499979019165, + "eval_rewards/chosen": -1.1245074272155762, + "eval_rewards/margins": 0.43527737259864807, + "eval_rewards/rejected": -1.5597847700119019, + "eval_runtime": 305.4546, + "eval_samples_per_second": 6.548, + "eval_steps_per_second": 0.818, + "step": 2900 + }, + { + "epoch": 0.7615807380266946, + "grad_norm": 26.818082809448242, + "learning_rate": 8.173504435093173e-08, + "logits/chosen": -2.4929020404815674, + "logits/rejected": -2.425166606903076, + "logps/chosen": -418.92266845703125, + "logps/rejected": -419.7470703125, + "loss": 0.5526, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0992368459701538, + "rewards/margins": 0.522510826587677, + "rewards/rejected": -1.621747612953186, + "step": 2910 + }, + { + "epoch": 0.7641978539649307, + "grad_norm": 21.685863494873047, + "learning_rate": 8.005253184398359e-08, + "logits/chosen": -2.5369515419006348, + "logits/rejected": -2.4856951236724854, + "logps/chosen": -464.41748046875, + "logps/rejected": -482.64569091796875, + "loss": 0.5998, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0638659000396729, + "rewards/margins": 0.4084245264530182, + "rewards/rejected": -1.4722901582717896, + "step": 2920 + }, + { + "epoch": 0.7668149699031667, + "grad_norm": 26.51576042175293, + "learning_rate": 7.838420990171926e-08, + "logits/chosen": -2.5660033226013184, + "logits/rejected": -2.5116262435913086, + "logps/chosen": -452.36541748046875, + "logps/rejected": -460.9500427246094, + "loss": 0.5528, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.042595386505127, + "rewards/margins": 0.4849773049354553, + "rewards/rejected": -1.527572751045227, + "step": 2930 + }, + { + "epoch": 0.7694320858414028, + "grad_norm": 21.060117721557617, + "learning_rate": 7.673021782854083e-08, + "logits/chosen": -2.4519286155700684, + "logits/rejected": -2.4212234020233154, + "logps/chosen": -442.04351806640625, + "logps/rejected": -423.04547119140625, + "loss": 0.568, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.062628149986267, + "rewards/margins": 0.5063890218734741, + "rewards/rejected": -1.5690171718597412, + "step": 2940 + }, + { + "epoch": 0.7720492017796389, + "grad_norm": 29.852025985717773, + "learning_rate": 7.509069373231039e-08, + "logits/chosen": -2.4772579669952393, + "logits/rejected": -2.4347732067108154, + "logps/chosen": -440.98681640625, + "logps/rejected": -446.54791259765625, + "loss": 0.5898, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1728198528289795, + "rewards/margins": 0.4668344557285309, + "rewards/rejected": -1.6396541595458984, + "step": 2950 + }, + { + "epoch": 0.7746663177178749, + "grad_norm": 26.482280731201172, + "learning_rate": 7.346577451281821e-08, + "logits/chosen": -2.4865143299102783, + "logits/rejected": -2.4856116771698, + "logps/chosen": -455.3196716308594, + "logps/rejected": -467.3148498535156, + "loss": 0.56, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1614563465118408, + "rewards/margins": 0.5401021838188171, + "rewards/rejected": -1.7015584707260132, + "step": 2960 + }, + { + "epoch": 0.777283433656111, + "grad_norm": 30.637746810913086, + "learning_rate": 7.185559585035136e-08, + "logits/chosen": -2.50410532951355, + "logits/rejected": -2.441951274871826, + "logps/chosen": -472.7854919433594, + "logps/rejected": -494.48382568359375, + "loss": 0.5495, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1519578695297241, + "rewards/margins": 0.5563610792160034, + "rewards/rejected": -1.7083189487457275, + "step": 2970 + }, + { + "epoch": 0.7799005495943471, + "grad_norm": 23.6710147857666, + "learning_rate": 7.026029219436502e-08, + "logits/chosen": -2.52189040184021, + "logits/rejected": -2.45927095413208, + "logps/chosen": -436.1434631347656, + "logps/rejected": -452.95794677734375, + "loss": 0.5503, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1254907846450806, + "rewards/margins": 0.49826329946517944, + "rewards/rejected": -1.6237539052963257, + "step": 2980 + }, + { + "epoch": 0.7825176655325831, + "grad_norm": 19.153810501098633, + "learning_rate": 6.867999675225522e-08, + "logits/chosen": -2.5626466274261475, + "logits/rejected": -2.5076661109924316, + "logps/chosen": -412.48858642578125, + "logps/rejected": -431.7395935058594, + "loss": 0.5615, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1552931070327759, + "rewards/margins": 0.5237449407577515, + "rewards/rejected": -1.6790380477905273, + "step": 2990 + }, + { + "epoch": 0.7851347814708192, + "grad_norm": 40.37303161621094, + "learning_rate": 6.711484147823662e-08, + "logits/chosen": -2.4564764499664307, + "logits/rejected": -2.4580986499786377, + "logps/chosen": -411.7818298339844, + "logps/rejected": -465.6504821777344, + "loss": 0.5393, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.116335153579712, + "rewards/margins": 0.5623366832733154, + "rewards/rejected": -1.6786715984344482, + "step": 3000 + }, + { + "epoch": 0.7851347814708192, + "eval_logits/chosen": -2.456458568572998, + "eval_logits/rejected": -2.4160501956939697, + "eval_logps/chosen": -458.4520568847656, + "eval_logps/rejected": -464.20196533203125, + "eval_loss": 0.5873444676399231, + "eval_rewards/accuracies": 0.6869999766349792, + "eval_rewards/chosen": -1.222330927848816, + "eval_rewards/margins": 0.4486404359340668, + "eval_rewards/rejected": -1.670971393585205, + "eval_runtime": 305.68, + "eval_samples_per_second": 6.543, + "eval_steps_per_second": 0.818, + "step": 3000 + }, + { + "epoch": 0.7877518974090553, + "grad_norm": 20.812664031982422, + "learning_rate": 6.556495706232412e-08, + "logits/chosen": -2.4477975368499756, + "logits/rejected": -2.460245132446289, + "logps/chosen": -460.86468505859375, + "logps/rejected": -472.31884765625, + "loss": 0.597, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2172662019729614, + "rewards/margins": 0.4750109612941742, + "rewards/rejected": -1.692277193069458, + "step": 3010 + }, + { + "epoch": 0.7903690133472913, + "grad_norm": 24.549638748168945, + "learning_rate": 6.403047291942057e-08, + "logits/chosen": -2.4225709438323975, + "logits/rejected": -2.3343379497528076, + "logps/chosen": -420.82391357421875, + "logps/rejected": -419.4525451660156, + "loss": 0.5818, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.250446081161499, + "rewards/margins": 0.4668423533439636, + "rewards/rejected": -1.7172883749008179, + "step": 3020 + }, + { + "epoch": 0.7929861292855274, + "grad_norm": 34.49531173706055, + "learning_rate": 6.251151717851021e-08, + "logits/chosen": -2.4909310340881348, + "logits/rejected": -2.473926067352295, + "logps/chosen": -417.85321044921875, + "logps/rejected": -428.09283447265625, + "loss": 0.6039, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1568142175674438, + "rewards/margins": 0.43484121561050415, + "rewards/rejected": -1.5916552543640137, + "step": 3030 + }, + { + "epoch": 0.7956032452237635, + "grad_norm": 27.338661193847656, + "learning_rate": 6.100821667196041e-08, + "logits/chosen": -2.5778112411499023, + "logits/rejected": -2.4326424598693848, + "logps/chosen": -468.60430908203125, + "logps/rejected": -429.7701721191406, + "loss": 0.54, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1660789251327515, + "rewards/margins": 0.5631740093231201, + "rewards/rejected": -1.7292530536651611, + "step": 3040 + }, + { + "epoch": 0.7982203611619995, + "grad_norm": 25.222888946533203, + "learning_rate": 5.952069692493061e-08, + "logits/chosen": -2.4194066524505615, + "logits/rejected": -2.4164278507232666, + "logps/chosen": -414.73687744140625, + "logps/rejected": -454.0586853027344, + "loss": 0.5665, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1890028715133667, + "rewards/margins": 0.49300870299339294, + "rewards/rejected": -1.682011365890503, + "step": 3050 + }, + { + "epoch": 0.8008374771002356, + "grad_norm": 32.99183654785156, + "learning_rate": 5.8049082144891794e-08, + "logits/chosen": -2.4552743434906006, + "logits/rejected": -2.459123134613037, + "logps/chosen": -455.1717224121094, + "logps/rejected": -536.97802734375, + "loss": 0.5644, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1822292804718018, + "rewards/margins": 0.548469066619873, + "rewards/rejected": -1.7306982278823853, + "step": 3060 + }, + { + "epoch": 0.8034545930384716, + "grad_norm": 23.637554168701172, + "learning_rate": 5.659349521125459e-08, + "logits/chosen": -2.556009292602539, + "logits/rejected": -2.5285425186157227, + "logps/chosen": -475.5826110839844, + "logps/rejected": -476.42156982421875, + "loss": 0.6112, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1752772331237793, + "rewards/margins": 0.3971019685268402, + "rewards/rejected": -1.5723793506622314, + "step": 3070 + }, + { + "epoch": 0.8060717089767077, + "grad_norm": 34.81662368774414, + "learning_rate": 5.5154057665109e-08, + "logits/chosen": -2.498422145843506, + "logits/rejected": -2.464500665664673, + "logps/chosen": -449.62548828125, + "logps/rejected": -458.1709899902344, + "loss": 0.547, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.151908278465271, + "rewards/margins": 0.5177582502365112, + "rewards/rejected": -1.6696665287017822, + "step": 3080 + }, + { + "epoch": 0.8086888249149438, + "grad_norm": 24.595640182495117, + "learning_rate": 5.3730889699075853e-08, + "logits/chosen": -2.5110769271850586, + "logits/rejected": -2.4285898208618164, + "logps/chosen": -460.2265625, + "logps/rejected": -442.6695861816406, + "loss": 0.5678, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.075935959815979, + "rewards/margins": 0.49235135316848755, + "rewards/rejected": -1.5682871341705322, + "step": 3090 + }, + { + "epoch": 0.8113059408531798, + "grad_norm": 35.13508224487305, + "learning_rate": 5.2324110147270893e-08, + "logits/chosen": -2.4877264499664307, + "logits/rejected": -2.455303192138672, + "logps/chosen": -461.565185546875, + "logps/rejected": -469.4137268066406, + "loss": 0.577, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0697096586227417, + "rewards/margins": 0.5005604028701782, + "rewards/rejected": -1.57027006149292, + "step": 3100 + }, + { + "epoch": 0.8113059408531798, + "eval_logits/chosen": -2.453810691833496, + "eval_logits/rejected": -2.4136769771575928, + "eval_logps/chosen": -449.80560302734375, + "eval_logps/rejected": -454.67962646484375, + "eval_loss": 0.5885876417160034, + "eval_rewards/accuracies": 0.684499979019165, + "eval_rewards/chosen": -1.1358660459518433, + "eval_rewards/margins": 0.439881831407547, + "eval_rewards/rejected": -1.5757479667663574, + "eval_runtime": 305.6852, + "eval_samples_per_second": 6.543, + "eval_steps_per_second": 0.818, + "step": 3100 + }, + { + "epoch": 0.8139230567914159, + "grad_norm": 24.051559448242188, + "learning_rate": 5.0933836475381795e-08, + "logits/chosen": -2.544729709625244, + "logits/rejected": -2.472533702850342, + "logps/chosen": -466.03924560546875, + "logps/rejected": -479.94140625, + "loss": 0.5701, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0298961400985718, + "rewards/margins": 0.48342984914779663, + "rewards/rejected": -1.5133259296417236, + "step": 3110 + }, + { + "epoch": 0.816540172729652, + "grad_norm": 19.326536178588867, + "learning_rate": 4.956018477086005e-08, + "logits/chosen": -2.479027271270752, + "logits/rejected": -2.427156448364258, + "logps/chosen": -458.11993408203125, + "logps/rejected": -453.17864990234375, + "loss": 0.576, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.121168851852417, + "rewards/margins": 0.4721224904060364, + "rewards/rejected": -1.593291163444519, + "step": 3120 + }, + { + "epoch": 0.819157288667888, + "grad_norm": 21.694664001464844, + "learning_rate": 4.820326973322763e-08, + "logits/chosen": -2.4353275299072266, + "logits/rejected": -2.3973605632781982, + "logps/chosen": -432.6817321777344, + "logps/rejected": -453.9100646972656, + "loss": 0.5839, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.170310378074646, + "rewards/margins": 0.4695982038974762, + "rewards/rejected": -1.6399085521697998, + "step": 3130 + }, + { + "epoch": 0.821774404606124, + "grad_norm": 30.601573944091797, + "learning_rate": 4.686320466449981e-08, + "logits/chosen": -2.469275951385498, + "logits/rejected": -2.3796298503875732, + "logps/chosen": -429.75885009765625, + "logps/rejected": -460.8003845214844, + "loss": 0.5587, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1616802215576172, + "rewards/margins": 0.5735237002372742, + "rewards/rejected": -1.735203742980957, + "step": 3140 + }, + { + "epoch": 0.8243915205443602, + "grad_norm": 24.408126831054688, + "learning_rate": 4.554010145972417e-08, + "logits/chosen": -2.5520882606506348, + "logits/rejected": -2.478285312652588, + "logps/chosen": -454.3311462402344, + "logps/rejected": -466.89288330078125, + "loss": 0.611, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.19753098487854, + "rewards/margins": 0.42739981412887573, + "rewards/rejected": -1.624930739402771, + "step": 3150 + }, + { + "epoch": 0.8270086364825961, + "grad_norm": 32.94855499267578, + "learning_rate": 4.423407059763745e-08, + "logits/chosen": -2.4835262298583984, + "logits/rejected": -2.424791097640991, + "logps/chosen": -474.9059143066406, + "logps/rejected": -492.48681640625, + "loss": 0.5942, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1789145469665527, + "rewards/margins": 0.4700239598751068, + "rewards/rejected": -1.648938536643982, + "step": 3160 + }, + { + "epoch": 0.8296257524208323, + "grad_norm": 23.921096801757812, + "learning_rate": 4.294522113144078e-08, + "logits/chosen": -2.417403221130371, + "logits/rejected": -2.3476955890655518, + "logps/chosen": -451.852783203125, + "logps/rejected": -444.2347717285156, + "loss": 0.5671, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1180942058563232, + "rewards/margins": 0.5155684351921082, + "rewards/rejected": -1.6336625814437866, + "step": 3170 + }, + { + "epoch": 0.8322428683590684, + "grad_norm": 26.380470275878906, + "learning_rate": 4.1673660679693804e-08, + "logits/chosen": -2.4901657104492188, + "logits/rejected": -2.497138500213623, + "logps/chosen": -406.10546875, + "logps/rejected": -466.86248779296875, + "loss": 0.5895, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2110943794250488, + "rewards/margins": 0.4295726716518402, + "rewards/rejected": -1.640667200088501, + "step": 3180 + }, + { + "epoch": 0.8348599842973043, + "grad_norm": 21.208757400512695, + "learning_rate": 4.041949541732825e-08, + "logits/chosen": -2.4689438343048096, + "logits/rejected": -2.470299482345581, + "logps/chosen": -445.4402770996094, + "logps/rejected": -477.43695068359375, + "loss": 0.5566, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1410871744155884, + "rewards/margins": 0.5272367596626282, + "rewards/rejected": -1.6683238744735718, + "step": 3190 + }, + { + "epoch": 0.8374771002355405, + "grad_norm": 25.565385818481445, + "learning_rate": 3.9182830066782605e-08, + "logits/chosen": -2.4609150886535645, + "logits/rejected": -2.456033229827881, + "logps/chosen": -459.481201171875, + "logps/rejected": -505.4794006347656, + "loss": 0.5731, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2549974918365479, + "rewards/margins": 0.49946990609169006, + "rewards/rejected": -1.7544673681259155, + "step": 3200 + }, + { + "epoch": 0.8374771002355405, + "eval_logits/chosen": -2.4400744438171387, + "eval_logits/rejected": -2.3988091945648193, + "eval_logps/chosen": -455.5009460449219, + "eval_logps/rejected": -462.0313415527344, + "eval_loss": 0.586392343044281, + "eval_rewards/accuracies": 0.6899999976158142, + "eval_rewards/chosen": -1.1928194761276245, + "eval_rewards/margins": 0.4564457833766937, + "eval_rewards/rejected": -1.649265170097351, + "eval_runtime": 305.5164, + "eval_samples_per_second": 6.546, + "eval_steps_per_second": 0.818, + "step": 3200 + }, + { + "epoch": 0.8400942161737766, + "grad_norm": 19.77925682067871, + "learning_rate": 3.79637678892577e-08, + "logits/chosen": -2.403917074203491, + "logits/rejected": -2.4254679679870605, + "logps/chosen": -440.84747314453125, + "logps/rejected": -439.8973693847656, + "loss": 0.6224, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.181836724281311, + "rewards/margins": 0.35303249955177307, + "rewards/rejected": -1.5348690748214722, + "step": 3210 + }, + { + "epoch": 0.8427113321120125, + "grad_norm": 30.918712615966797, + "learning_rate": 3.6762410676094645e-08, + "logits/chosen": -2.459597110748291, + "logits/rejected": -2.455178737640381, + "logps/chosen": -486.547119140625, + "logps/rejected": -474.1707458496094, + "loss": 0.6022, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1656169891357422, + "rewards/margins": 0.4497455060482025, + "rewards/rejected": -1.6153624057769775, + "step": 3220 + }, + { + "epoch": 0.8453284480502486, + "grad_norm": 42.9607048034668, + "learning_rate": 3.557885874027497e-08, + "logits/chosen": -2.3949739933013916, + "logits/rejected": -2.4011003971099854, + "logps/chosen": -452.0868225097656, + "logps/rejected": -458.3661193847656, + "loss": 0.6646, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2526524066925049, + "rewards/margins": 0.25641515851020813, + "rewards/rejected": -1.509067416191101, + "step": 3230 + }, + { + "epoch": 0.8479455639884846, + "grad_norm": 26.522703170776367, + "learning_rate": 3.441321090804469e-08, + "logits/chosen": -2.5475668907165527, + "logits/rejected": -2.501690626144409, + "logps/chosen": -452.525634765625, + "logps/rejected": -438.3544006347656, + "loss": 0.564, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1297898292541504, + "rewards/margins": 0.4648459851741791, + "rewards/rejected": -1.5946358442306519, + "step": 3240 + }, + { + "epoch": 0.8505626799267207, + "grad_norm": 23.155546188354492, + "learning_rate": 3.326556451066234e-08, + "logits/chosen": -2.5346245765686035, + "logits/rejected": -2.475264310836792, + "logps/chosen": -488.89306640625, + "logps/rejected": -491.93597412109375, + "loss": 0.5619, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1155110597610474, + "rewards/margins": 0.532517671585083, + "rewards/rejected": -1.6480286121368408, + "step": 3250 + }, + { + "epoch": 0.8531797958649568, + "grad_norm": 28.358789443969727, + "learning_rate": 3.2136015376271946e-08, + "logits/chosen": -2.4637229442596436, + "logits/rejected": -2.412484645843506, + "logps/chosen": -459.3023376464844, + "logps/rejected": -466.33636474609375, + "loss": 0.6073, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.308656930923462, + "rewards/margins": 0.3834686875343323, + "rewards/rejected": -1.692125678062439, + "step": 3260 + }, + { + "epoch": 0.8557969118031928, + "grad_norm": 26.66960334777832, + "learning_rate": 3.102465782190106e-08, + "logits/chosen": -2.4675605297088623, + "logits/rejected": -2.4589285850524902, + "logps/chosen": -436.20440673828125, + "logps/rejected": -456.6858825683594, + "loss": 0.5966, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1874547004699707, + "rewards/margins": 0.4859285354614258, + "rewards/rejected": -1.673383116722107, + "step": 3270 + }, + { + "epoch": 0.8584140277414289, + "grad_norm": 27.6555233001709, + "learning_rate": 2.993158464558565e-08, + "logits/chosen": -2.4608101844787598, + "logits/rejected": -2.4574997425079346, + "logps/chosen": -457.8924255371094, + "logps/rejected": -490.01739501953125, + "loss": 0.5995, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.082580327987671, + "rewards/margins": 0.4137091040611267, + "rewards/rejected": -1.4962894916534424, + "step": 3280 + }, + { + "epoch": 0.861031143679665, + "grad_norm": 18.215288162231445, + "learning_rate": 2.8856887118621358e-08, + "logits/chosen": -2.506308078765869, + "logits/rejected": -2.5308749675750732, + "logps/chosen": -450.243408203125, + "logps/rejected": -485.89825439453125, + "loss": 0.6124, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2139848470687866, + "rewards/margins": 0.4676898121833801, + "rewards/rejected": -1.6816747188568115, + "step": 3290 + }, + { + "epoch": 0.863648259617901, + "grad_norm": 25.5659122467041, + "learning_rate": 2.7800654977942482e-08, + "logits/chosen": -2.461979627609253, + "logits/rejected": -2.42045259475708, + "logps/chosen": -443.5435485839844, + "logps/rejected": -463.61669921875, + "loss": 0.586, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1075669527053833, + "rewards/margins": 0.45719677209854126, + "rewards/rejected": -1.5647637844085693, + "step": 3300 + }, + { + "epoch": 0.863648259617901, + "eval_logits/chosen": -2.4384121894836426, + "eval_logits/rejected": -2.3969199657440186, + "eval_logps/chosen": -453.6158752441406, + "eval_logps/rejected": -459.41778564453125, + "eval_loss": 0.5864999890327454, + "eval_rewards/accuracies": 0.6894999742507935, + "eval_rewards/chosen": -1.1739686727523804, + "eval_rewards/margins": 0.4491608142852783, + "eval_rewards/rejected": -1.6231294870376587, + "eval_runtime": 306.5951, + "eval_samples_per_second": 6.523, + "eval_steps_per_second": 0.815, + "step": 3300 + }, + { + "epoch": 0.8662653755561371, + "grad_norm": 31.966367721557617, + "learning_rate": 2.676297641862879e-08, + "logits/chosen": -2.493067741394043, + "logits/rejected": -2.4418578147888184, + "logps/chosen": -404.053466796875, + "logps/rejected": -391.7318420410156, + "loss": 0.5668, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.12334406375885, + "rewards/margins": 0.49404868483543396, + "rewards/rejected": -1.6173927783966064, + "step": 3310 + }, + { + "epoch": 0.8688824914943732, + "grad_norm": 21.95867156982422, + "learning_rate": 2.5743938086541352e-08, + "logits/chosen": -2.448385715484619, + "logits/rejected": -2.4331841468811035, + "logps/chosen": -451.23193359375, + "logps/rejected": -464.74041748046875, + "loss": 0.5711, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.196354627609253, + "rewards/margins": 0.5040711760520935, + "rewards/rejected": -1.7004257440567017, + "step": 3320 + }, + { + "epoch": 0.8714996074326092, + "grad_norm": 23.792348861694336, + "learning_rate": 2.474362507108757e-08, + "logits/chosen": -2.5521392822265625, + "logits/rejected": -2.4925191402435303, + "logps/chosen": -461.41961669921875, + "logps/rejected": -468.02801513671875, + "loss": 0.5694, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.082940697669983, + "rewards/margins": 0.5743801593780518, + "rewards/rejected": -1.6573207378387451, + "step": 3330 + }, + { + "epoch": 0.8741167233708453, + "grad_norm": 27.30091094970703, + "learning_rate": 2.3762120898116495e-08, + "logits/chosen": -2.477776288986206, + "logits/rejected": -2.4457545280456543, + "logps/chosen": -455.5399475097656, + "logps/rejected": -483.2525329589844, + "loss": 0.5674, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1902644634246826, + "rewards/margins": 0.4705522954463959, + "rewards/rejected": -1.6608167886734009, + "step": 3340 + }, + { + "epoch": 0.8767338393090814, + "grad_norm": 31.050687789916992, + "learning_rate": 2.2799507522944044e-08, + "logits/chosen": -2.408630847930908, + "logits/rejected": -2.3747382164001465, + "logps/chosen": -459.5210876464844, + "logps/rejected": -485.4042053222656, + "loss": 0.5379, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.127478837966919, + "rewards/margins": 0.5589307546615601, + "rewards/rejected": -1.686409592628479, + "step": 3350 + }, + { + "epoch": 0.8793509552473174, + "grad_norm": 28.34389877319336, + "learning_rate": 2.1855865323510054e-08, + "logits/chosen": -2.4636940956115723, + "logits/rejected": -2.370000123977661, + "logps/chosen": -459.6726989746094, + "logps/rejected": -500.09625244140625, + "loss": 0.5327, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0568628311157227, + "rewards/margins": 0.6852847337722778, + "rewards/rejected": -1.7421478033065796, + "step": 3360 + }, + { + "epoch": 0.8819680711855535, + "grad_norm": 23.982769012451172, + "learning_rate": 2.0931273093666573e-08, + "logits/chosen": -2.4250173568725586, + "logits/rejected": -2.377990961074829, + "logps/chosen": -429.1495056152344, + "logps/rejected": -440.8812561035156, + "loss": 0.5617, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.227034330368042, + "rewards/margins": 0.48124265670776367, + "rewards/rejected": -1.7082771062850952, + "step": 3370 + }, + { + "epoch": 0.8845851871237895, + "grad_norm": 27.970073699951172, + "learning_rate": 2.002580803659873e-08, + "logits/chosen": -2.4545071125030518, + "logits/rejected": -2.4068009853363037, + "logps/chosen": -444.27227783203125, + "logps/rejected": -445.6871643066406, + "loss": 0.5925, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1936101913452148, + "rewards/margins": 0.4538310170173645, + "rewards/rejected": -1.6474411487579346, + "step": 3380 + }, + { + "epoch": 0.8872023030620256, + "grad_norm": 23.618146896362305, + "learning_rate": 1.9139545758378256e-08, + "logits/chosen": -2.4549124240875244, + "logits/rejected": -2.363956928253174, + "logps/chosen": -459.8614807128906, + "logps/rejected": -442.1275939941406, + "loss": 0.5525, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1489067077636719, + "rewards/margins": 0.5495232343673706, + "rewards/rejected": -1.698429822921753, + "step": 3390 + }, + { + "epoch": 0.8898194190002617, + "grad_norm": 24.354528427124023, + "learning_rate": 1.8272560261650277e-08, + "logits/chosen": -2.46649432182312, + "logits/rejected": -2.4250054359436035, + "logps/chosen": -503.41552734375, + "logps/rejected": -477.5843200683594, + "loss": 0.5629, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1212806701660156, + "rewards/margins": 0.5350293517112732, + "rewards/rejected": -1.6563100814819336, + "step": 3400 + }, + { + "epoch": 0.8898194190002617, + "eval_logits/chosen": -2.430605888366699, + "eval_logits/rejected": -2.3882248401641846, + "eval_logps/chosen": -451.9486083984375, + "eval_logps/rejected": -457.9693603515625, + "eval_loss": 0.5859763026237488, + "eval_rewards/accuracies": 0.6890000104904175, + "eval_rewards/chosen": -1.1572964191436768, + "eval_rewards/margins": 0.4513489007949829, + "eval_rewards/rejected": -1.6086454391479492, + "eval_runtime": 305.5222, + "eval_samples_per_second": 6.546, + "eval_steps_per_second": 0.818, + "step": 3400 + }, + { + "epoch": 0.8924365349384977, + "grad_norm": 23.631261825561523, + "learning_rate": 1.742492393945427e-08, + "logits/chosen": -2.426309108734131, + "logits/rejected": -2.361506938934326, + "logps/chosen": -479.4481506347656, + "logps/rejected": -459.58447265625, + "loss": 0.5771, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1870609521865845, + "rewards/margins": 0.4371766149997711, + "rewards/rejected": -1.6242374181747437, + "step": 3410 + }, + { + "epoch": 0.8950536508767338, + "grad_norm": 28.153467178344727, + "learning_rate": 1.6596707569179302e-08, + "logits/chosen": -2.549783229827881, + "logits/rejected": -2.4772112369537354, + "logps/chosen": -478.44061279296875, + "logps/rejected": -468.6541442871094, + "loss": 0.5714, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.147237777709961, + "rewards/margins": 0.5185133814811707, + "rewards/rejected": -1.6657512187957764, + "step": 3420 + }, + { + "epoch": 0.8976707668149699, + "grad_norm": 29.848007202148438, + "learning_rate": 1.5787980306653848e-08, + "logits/chosen": -2.4930880069732666, + "logits/rejected": -2.423302412033081, + "logps/chosen": -463.26300048828125, + "logps/rejected": -475.92767333984375, + "loss": 0.5766, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1223206520080566, + "rewards/margins": 0.49025315046310425, + "rewards/rejected": -1.6125738620758057, + "step": 3430 + }, + { + "epoch": 0.9002878827532059, + "grad_norm": 27.02195930480957, + "learning_rate": 1.499880968037165e-08, + "logits/chosen": -2.529888868331909, + "logits/rejected": -2.467481851577759, + "logps/chosen": -425.4710388183594, + "logps/rejected": -418.39752197265625, + "loss": 0.5716, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.080725073814392, + "rewards/margins": 0.45771676301956177, + "rewards/rejected": -1.538441777229309, + "step": 3440 + }, + { + "epoch": 0.902904998691442, + "grad_norm": 37.68179702758789, + "learning_rate": 1.4229261585852803e-08, + "logits/chosen": -2.5055129528045654, + "logits/rejected": -2.483830690383911, + "logps/chosen": -452.56610107421875, + "logps/rejected": -458.8866271972656, + "loss": 0.556, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1098837852478027, + "rewards/margins": 0.5025152564048767, + "rewards/rejected": -1.6123991012573242, + "step": 3450 + }, + { + "epoch": 0.9055221146296781, + "grad_norm": 26.292009353637695, + "learning_rate": 1.3479400280141883e-08, + "logits/chosen": -2.4547009468078613, + "logits/rejected": -2.469587802886963, + "logps/chosen": -422.94110107421875, + "logps/rejected": -467.0780334472656, + "loss": 0.5599, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.089966058731079, + "rewards/margins": 0.5504525303840637, + "rewards/rejected": -1.6404184103012085, + "step": 3460 + }, + { + "epoch": 0.9081392305679141, + "grad_norm": 30.572351455688477, + "learning_rate": 1.2749288376442042e-08, + "logits/chosen": -2.461515426635742, + "logits/rejected": -2.381925344467163, + "logps/chosen": -481.184326171875, + "logps/rejected": -467.42510986328125, + "loss": 0.5632, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1164970397949219, + "rewards/margins": 0.5962284803390503, + "rewards/rejected": -1.712725281715393, + "step": 3470 + }, + { + "epoch": 0.9107563465061502, + "grad_norm": 20.18982696533203, + "learning_rate": 1.2038986838887127e-08, + "logits/chosen": -2.5067992210388184, + "logits/rejected": -2.4624361991882324, + "logps/chosen": -433.2051696777344, + "logps/rejected": -454.38909912109375, + "loss": 0.6137, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2155464887619019, + "rewards/margins": 0.42829465866088867, + "rewards/rejected": -1.6438411474227905, + "step": 3480 + }, + { + "epoch": 0.9133734624443863, + "grad_norm": 23.425052642822266, + "learning_rate": 1.1348554977451131e-08, + "logits/chosen": -2.522127628326416, + "logits/rejected": -2.47310471534729, + "logps/chosen": -474.30712890625, + "logps/rejected": -479.2377014160156, + "loss": 0.5578, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1554954051971436, + "rewards/margins": 0.574936032295227, + "rewards/rejected": -1.730431318283081, + "step": 3490 + }, + { + "epoch": 0.9159905783826223, + "grad_norm": 20.377241134643555, + "learning_rate": 1.06780504429958e-08, + "logits/chosen": -2.4527220726013184, + "logits/rejected": -2.367492198944092, + "logps/chosen": -457.9435119628906, + "logps/rejected": -445.28009033203125, + "loss": 0.6059, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1457488536834717, + "rewards/margins": 0.4370895028114319, + "rewards/rejected": -1.5828382968902588, + "step": 3500 + }, + { + "epoch": 0.9159905783826223, + "eval_logits/chosen": -2.4319987297058105, + "eval_logits/rejected": -2.389674663543701, + "eval_logps/chosen": -452.93878173828125, + "eval_logps/rejected": -459.2307434082031, + "eval_loss": 0.5858200788497925, + "eval_rewards/accuracies": 0.6890000104904175, + "eval_rewards/chosen": -1.1671984195709229, + "eval_rewards/margins": 0.4540611207485199, + "eval_rewards/rejected": -1.6212595701217651, + "eval_runtime": 305.662, + "eval_samples_per_second": 6.543, + "eval_steps_per_second": 0.818, + "step": 3500 + }, + { + "epoch": 0.9186076943208584, + "grad_norm": 20.939462661743164, + "learning_rate": 1.0027529222456754e-08, + "logits/chosen": -2.44631028175354, + "logits/rejected": -2.397378921508789, + "logps/chosen": -429.8016052246094, + "logps/rejected": -454.264404296875, + "loss": 0.519, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0528876781463623, + "rewards/margins": 0.5599857568740845, + "rewards/rejected": -1.6128734350204468, + "step": 3510 + }, + { + "epoch": 0.9212248102590945, + "grad_norm": 20.19708251953125, + "learning_rate": 9.397045634168766e-09, + "logits/chosen": -2.5315864086151123, + "logits/rejected": -2.5124194622039795, + "logps/chosen": -454.331787109375, + "logps/rejected": -493.25848388671875, + "loss": 0.5627, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0776004791259766, + "rewards/margins": 0.5831270217895508, + "rewards/rejected": -1.6607275009155273, + "step": 3520 + }, + { + "epoch": 0.9238419261973305, + "grad_norm": 21.342370986938477, + "learning_rate": 8.78665232332998e-09, + "logits/chosen": -2.4148471355438232, + "logits/rejected": -2.374274730682373, + "logps/chosen": -415.21966552734375, + "logps/rejected": -443.37713623046875, + "loss": 0.5828, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.192144751548767, + "rewards/margins": 0.43268775939941406, + "rewards/rejected": -1.6248325109481812, + "step": 3530 + }, + { + "epoch": 0.9264590421355666, + "grad_norm": 24.934450149536133, + "learning_rate": 8.196400257606206e-09, + "logits/chosen": -2.5181639194488525, + "logits/rejected": -2.4744253158569336, + "logps/chosen": -481.7230529785156, + "logps/rejected": -507.2533264160156, + "loss": 0.5876, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1774014234542847, + "rewards/margins": 0.5159769058227539, + "rewards/rejected": -1.693378210067749, + "step": 3540 + }, + { + "epoch": 0.9290761580738026, + "grad_norm": 22.040672302246094, + "learning_rate": 7.626338722875075e-09, + "logits/chosen": -2.460214853286743, + "logits/rejected": -2.4767918586730957, + "logps/chosen": -436.06005859375, + "logps/rejected": -465.63641357421875, + "loss": 0.5773, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1084725856781006, + "rewards/margins": 0.45712822675704956, + "rewards/rejected": -1.565600872039795, + "step": 3550 + }, + { + "epoch": 0.9316932740120387, + "grad_norm": 24.051870346069336, + "learning_rate": 7.0765153191106875e-09, + "logits/chosen": -2.4851746559143066, + "logits/rejected": -2.4584531784057617, + "logps/chosen": -439.662109375, + "logps/rejected": -432.596435546875, + "loss": 0.5813, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1396434307098389, + "rewards/margins": 0.5287774801254272, + "rewards/rejected": -1.6684210300445557, + "step": 3560 + }, + { + "epoch": 0.9343103899502748, + "grad_norm": 26.50396156311035, + "learning_rate": 6.54697595640899e-09, + "logits/chosen": -2.498539447784424, + "logits/rejected": -2.44036865234375, + "logps/chosen": -478.49664306640625, + "logps/rejected": -492.61224365234375, + "loss": 0.5445, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0882660150527954, + "rewards/margins": 0.5571417212486267, + "rewards/rejected": -1.6454076766967773, + "step": 3570 + }, + { + "epoch": 0.9369275058885108, + "grad_norm": 20.330581665039062, + "learning_rate": 6.037764851154425e-09, + "logits/chosen": -2.4708151817321777, + "logits/rejected": -2.447495698928833, + "logps/chosen": -450.46099853515625, + "logps/rejected": -485.2198791503906, + "loss": 0.5668, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0887110233306885, + "rewards/margins": 0.5052274465560913, + "rewards/rejected": -1.5939384698867798, + "step": 3580 + }, + { + "epoch": 0.9395446218267469, + "grad_norm": 26.653793334960938, + "learning_rate": 5.548924522327747e-09, + "logits/chosen": -2.44421648979187, + "logits/rejected": -2.406181573867798, + "logps/chosen": -457.9331970214844, + "logps/rejected": -471.136474609375, + "loss": 0.5838, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2172365188598633, + "rewards/margins": 0.4692623019218445, + "rewards/rejected": -1.6864988803863525, + "step": 3590 + }, + { + "epoch": 0.942161737764983, + "grad_norm": 26.454694747924805, + "learning_rate": 5.080495787955691e-09, + "logits/chosen": -2.4167091846466064, + "logits/rejected": -2.414551019668579, + "logps/chosen": -391.5062561035156, + "logps/rejected": -434.9417419433594, + "loss": 0.5703, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0791380405426025, + "rewards/margins": 0.4439873695373535, + "rewards/rejected": -1.523125410079956, + "step": 3600 + }, + { + "epoch": 0.942161737764983, + "eval_logits/chosen": -2.431995391845703, + "eval_logits/rejected": -2.389662981033325, + "eval_logps/chosen": -452.2864990234375, + "eval_logps/rejected": -458.4890441894531, + "eval_loss": 0.5860488414764404, + "eval_rewards/accuracies": 0.6869999766349792, + "eval_rewards/chosen": -1.1606749296188354, + "eval_rewards/margins": 0.45316699147224426, + "eval_rewards/rejected": -1.6138420104980469, + "eval_runtime": 311.4284, + "eval_samples_per_second": 6.422, + "eval_steps_per_second": 0.803, + "step": 3600 + }, + { + "epoch": 0.944778853703219, + "grad_norm": 27.942058563232422, + "learning_rate": 4.632517761702814e-09, + "logits/chosen": -2.4297971725463867, + "logits/rejected": -2.3728883266448975, + "logps/chosen": -415.197998046875, + "logps/rejected": -435.744384765625, + "loss": 0.5755, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1534807682037354, + "rewards/margins": 0.4942702651023865, + "rewards/rejected": -1.6477508544921875, + "step": 3610 + }, + { + "epoch": 0.9473959696414551, + "grad_norm": 34.84771728515625, + "learning_rate": 4.205027849605358e-09, + "logits/chosen": -2.46714448928833, + "logits/rejected": -2.431912899017334, + "logps/chosen": -434.2803649902344, + "logps/rejected": -427.74462890625, + "loss": 0.6174, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1966065168380737, + "rewards/margins": 0.39964643120765686, + "rewards/rejected": -1.5962530374526978, + "step": 3620 + }, + { + "epoch": 0.9500130855796912, + "grad_norm": 24.179773330688477, + "learning_rate": 3.798061746947995e-09, + "logits/chosen": -2.5451560020446777, + "logits/rejected": -2.5169193744659424, + "logps/chosen": -442.5852966308594, + "logps/rejected": -437.0686950683594, + "loss": 0.5855, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.171771764755249, + "rewards/margins": 0.4499340057373047, + "rewards/rejected": -1.6217056512832642, + "step": 3630 + }, + { + "epoch": 0.9526302015179272, + "grad_norm": 41.15779113769531, + "learning_rate": 3.411653435283157e-09, + "logits/chosen": -2.4626052379608154, + "logits/rejected": -2.4032933712005615, + "logps/chosen": -461.59130859375, + "logps/rejected": -428.04669189453125, + "loss": 0.5857, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1210428476333618, + "rewards/margins": 0.4643692076206207, + "rewards/rejected": -1.5854119062423706, + "step": 3640 + }, + { + "epoch": 0.9552473174561633, + "grad_norm": 21.085739135742188, + "learning_rate": 3.0458351795936698e-09, + "logits/chosen": -2.5102531909942627, + "logits/rejected": -2.448899745941162, + "logps/chosen": -427.3299865722656, + "logps/rejected": -434.32305908203125, + "loss": 0.5532, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0826635360717773, + "rewards/margins": 0.5397425889968872, + "rewards/rejected": -1.622406005859375, + "step": 3650 + }, + { + "epoch": 0.9578644333943994, + "grad_norm": 26.300575256347656, + "learning_rate": 2.700637525598598e-09, + "logits/chosen": -2.455357074737549, + "logits/rejected": -2.4355976581573486, + "logps/chosen": -460.82080078125, + "logps/rejected": -478.3233337402344, + "loss": 0.6174, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1832480430603027, + "rewards/margins": 0.3413129150867462, + "rewards/rejected": -1.5245609283447266, + "step": 3660 + }, + { + "epoch": 0.9604815493326354, + "grad_norm": 24.9046688079834, + "learning_rate": 2.3760892972027324e-09, + "logits/chosen": -2.538999557495117, + "logits/rejected": -2.4709739685058594, + "logps/chosen": -467.25177001953125, + "logps/rejected": -474.575927734375, + "loss": 0.5588, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.215664267539978, + "rewards/margins": 0.5684916973114014, + "rewards/rejected": -1.7841558456420898, + "step": 3670 + }, + { + "epoch": 0.9630986652708715, + "grad_norm": 30.135499954223633, + "learning_rate": 2.0722175940897645e-09, + "logits/chosen": -2.434136152267456, + "logits/rejected": -2.447706937789917, + "logps/chosen": -442.7950134277344, + "logps/rejected": -483.507568359375, + "loss": 0.5304, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1261581182479858, + "rewards/margins": 0.5885075926780701, + "rewards/rejected": -1.7146657705307007, + "step": 3680 + }, + { + "epoch": 0.9657157812091076, + "grad_norm": 21.930707931518555, + "learning_rate": 1.7890477894593748e-09, + "logits/chosen": -2.477616310119629, + "logits/rejected": -2.418490409851074, + "logps/chosen": -505.480712890625, + "logps/rejected": -485.1387634277344, + "loss": 0.5525, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0282517671585083, + "rewards/margins": 0.5708995461463928, + "rewards/rejected": -1.5991512537002563, + "step": 3690 + }, + { + "epoch": 0.9683328971473436, + "grad_norm": 25.627620697021484, + "learning_rate": 1.5266035279088708e-09, + "logits/chosen": -2.3665719032287598, + "logits/rejected": -2.340869426727295, + "logps/chosen": -492.96484375, + "logps/rejected": -500.45941162109375, + "loss": 0.5533, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2112007141113281, + "rewards/margins": 0.4987810254096985, + "rewards/rejected": -1.7099816799163818, + "step": 3700 + }, + { + "epoch": 0.9683328971473436, + "eval_logits/chosen": -2.4303910732269287, + "eval_logits/rejected": -2.388193368911743, + "eval_logps/chosen": -452.4510498046875, + "eval_logps/rejected": -458.71649169921875, + "eval_loss": 0.5858403444290161, + "eval_rewards/accuracies": 0.6880000233650208, + "eval_rewards/chosen": -1.162320613861084, + "eval_rewards/margins": 0.4537965655326843, + "eval_rewards/rejected": -1.616117000579834, + "eval_runtime": 305.8027, + "eval_samples_per_second": 6.54, + "eval_steps_per_second": 0.818, + "step": 3700 + }, + { + "epoch": 0.9709500130855797, + "grad_norm": 37.73438262939453, + "learning_rate": 1.2849067234584621e-09, + "logits/chosen": -2.3928847312927246, + "logits/rejected": -2.3548622131347656, + "logps/chosen": -423.58935546875, + "logps/rejected": -446.5189514160156, + "loss": 0.6168, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.152722954750061, + "rewards/margins": 0.4318475127220154, + "rewards/rejected": -1.5845705270767212, + "step": 3710 + }, + { + "epoch": 0.9735671290238157, + "grad_norm": 27.73121452331543, + "learning_rate": 1.0639775577218625e-09, + "logits/chosen": -2.4332590103149414, + "logits/rejected": -2.3678956031799316, + "logps/chosen": -437.5235290527344, + "logps/rejected": -427.43280029296875, + "loss": 0.5611, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.154651403427124, + "rewards/margins": 0.5416392683982849, + "rewards/rejected": -1.6962906122207642, + "step": 3720 + }, + { + "epoch": 0.9761842449620518, + "grad_norm": 24.22023582458496, + "learning_rate": 8.638344782207485e-10, + "logits/chosen": -2.434138059616089, + "logits/rejected": -2.3985061645507812, + "logps/chosen": -426.9537048339844, + "logps/rejected": -440.77227783203125, + "loss": 0.5597, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.109956979751587, + "rewards/margins": 0.5067628026008606, + "rewards/rejected": -1.6167194843292236, + "step": 3730 + }, + { + "epoch": 0.9788013609002879, + "grad_norm": 25.113866806030273, + "learning_rate": 6.844941968447149e-10, + "logits/chosen": -2.4938652515411377, + "logits/rejected": -2.44439959526062, + "logps/chosen": -465.15985107421875, + "logps/rejected": -483.47088623046875, + "loss": 0.5536, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1278215646743774, + "rewards/margins": 0.5776745676994324, + "rewards/rejected": -1.705496072769165, + "step": 3740 + }, + { + "epoch": 0.9814184768385239, + "grad_norm": 22.66619873046875, + "learning_rate": 5.25971688455612e-10, + "logits/chosen": -2.512327194213867, + "logits/rejected": -2.433387279510498, + "logps/chosen": -449.586669921875, + "logps/rejected": -469.92919921875, + "loss": 0.5283, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1257078647613525, + "rewards/margins": 0.5826985239982605, + "rewards/rejected": -1.7084062099456787, + "step": 3750 + }, + { + "epoch": 0.98403559277676, + "grad_norm": 21.942731857299805, + "learning_rate": 3.882801896372967e-10, + "logits/chosen": -2.4940876960754395, + "logits/rejected": -2.446882963180542, + "logps/chosen": -456.06353759765625, + "logps/rejected": -445.09375, + "loss": 0.621, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.183485746383667, + "rewards/margins": 0.37569838762283325, + "rewards/rejected": -1.559183955192566, + "step": 3760 + }, + { + "epoch": 0.9866527087149961, + "grad_norm": 31.122547149658203, + "learning_rate": 2.714311975902661e-10, + "logits/chosen": -2.468987226486206, + "logits/rejected": -2.39099383354187, + "logps/chosen": -463.2923889160156, + "logps/rejected": -476.58935546875, + "loss": 0.5116, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0611310005187988, + "rewards/margins": 0.5779252648353577, + "rewards/rejected": -1.6390562057495117, + "step": 3770 + }, + { + "epoch": 0.9892698246532321, + "grad_norm": 27.4237060546875, + "learning_rate": 1.754344691717591e-10, + "logits/chosen": -2.44380521774292, + "logits/rejected": -2.4307875633239746, + "logps/chosen": -434.5845642089844, + "logps/rejected": -482.71759033203125, + "loss": 0.6139, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1346935033798218, + "rewards/margins": 0.3297516703605652, + "rewards/rejected": -1.4644451141357422, + "step": 3780 + }, + { + "epoch": 0.9918869405914682, + "grad_norm": 31.661060333251953, + "learning_rate": 1.0029802008096333e-10, + "logits/chosen": -2.47269868850708, + "logits/rejected": -2.401909112930298, + "logps/chosen": -473.67608642578125, + "logps/rejected": -482.57501220703125, + "loss": 0.5693, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.16953444480896, + "rewards/margins": 0.5178566575050354, + "rewards/rejected": -1.6873910427093506, + "step": 3790 + }, + { + "epoch": 0.9945040565297043, + "grad_norm": 28.650169372558594, + "learning_rate": 4.602812418974533e-11, + "logits/chosen": -2.5178725719451904, + "logits/rejected": -2.4773991107940674, + "logps/chosen": -471.7525939941406, + "logps/rejected": -475.13055419921875, + "loss": 0.5988, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1166661977767944, + "rewards/margins": 0.4814838469028473, + "rewards/rejected": -1.5981502532958984, + "step": 3800 + }, + { + "epoch": 0.9945040565297043, + "eval_logits/chosen": -2.4306020736694336, + "eval_logits/rejected": -2.3882100582122803, + "eval_logps/chosen": -452.2973327636719, + "eval_logps/rejected": -458.4822998046875, + "eval_loss": 0.5861949324607849, + "eval_rewards/accuracies": 0.6884999871253967, + "eval_rewards/chosen": -1.1607835292816162, + "eval_rewards/margins": 0.45299115777015686, + "eval_rewards/rejected": -1.6137746572494507, + "eval_runtime": 305.6937, + "eval_samples_per_second": 6.542, + "eval_steps_per_second": 0.818, + "step": 3800 + }, + { + "epoch": 0.9971211724679403, + "grad_norm": 29.108121871948242, + "learning_rate": 1.2629313018819309e-11, + "logits/chosen": -2.446488857269287, + "logits/rejected": -2.4168055057525635, + "logps/chosen": -427.93896484375, + "logps/rejected": -450.84332275390625, + "loss": 0.5569, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1016128063201904, + "rewards/margins": 0.558451771736145, + "rewards/rejected": -1.6600643396377563, + "step": 3810 + }, + { + "epoch": 0.9997382884061764, + "grad_norm": 23.65038299560547, + "learning_rate": 1.0437535929996855e-13, + "logits/chosen": -2.4572625160217285, + "logits/rejected": -2.4045023918151855, + "logps/chosen": -472.81048583984375, + "logps/rejected": -466.09051513671875, + "loss": 0.5511, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1336238384246826, + "rewards/margins": 0.5550218820571899, + "rewards/rejected": -1.6886459589004517, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.6060711596530634, + "train_runtime": 35916.4658, + "train_samples_per_second": 1.702, + "train_steps_per_second": 0.106 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}