diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4883 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 100, + "global_step": 3112, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.0706638115631692e-10, + "logits/chosen": 1.2566330432891846, + "logits/rejected": 0.7730951309204102, + "logps/chosen": -300.374267578125, + "logps/rejected": -324.00494384765625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 1.070663811563169e-09, + "logits/chosen": 0.9792649745941162, + "logits/rejected": 1.7012548446655273, + "logps/chosen": -464.2229309082031, + "logps/rejected": -332.3782653808594, + "loss": 0.6952, + "rewards/accuracies": 0.4861111044883728, + "rewards/chosen": -0.006301212124526501, + "rewards/margins": -0.0025307913310825825, + "rewards/rejected": -0.0037704205606132746, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 2.141327623126338e-09, + "logits/chosen": 0.5618988871574402, + "logits/rejected": 1.6265491247177124, + "logps/chosen": -438.208984375, + "logps/rejected": -328.3803405761719, + "loss": 0.6981, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.012895757332444191, + "rewards/margins": -0.00526293832808733, + "rewards/rejected": -0.007632819004356861, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 3.2119914346895075e-09, + "logits/chosen": 0.8482489585876465, + "logits/rejected": 1.8450462818145752, + "logps/chosen": -437.23870849609375, + "logps/rejected": -367.84637451171875, + "loss": 0.6967, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.005116526037454605, + "rewards/margins": -0.005225582513958216, + "rewards/rejected": 0.010342106223106384, + "step": 30 + }, + { + "epoch": 0.03, + "learning_rate": 4.282655246252676e-09, + "logits/chosen": 0.9240902662277222, + "logits/rejected": 2.074276924133301, + "logps/chosen": -408.9275207519531, + "logps/rejected": -335.3138122558594, + "loss": 0.6953, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02186817303299904, + "rewards/margins": 0.013794437050819397, + "rewards/rejected": 0.008073735050857067, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 5.353319057815846e-09, + "logits/chosen": 0.9429410696029663, + "logits/rejected": 1.3247915506362915, + "logps/chosen": -487.283203125, + "logps/rejected": -337.8562927246094, + "loss": 0.6867, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.01094397995620966, + "rewards/margins": 0.016293564811348915, + "rewards/rejected": -0.005349582992494106, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 6.423982869379015e-09, + "logits/chosen": 0.9860417246818542, + "logits/rejected": 1.649106740951538, + "logps/chosen": -456.7554626464844, + "logps/rejected": -330.7721252441406, + "loss": 0.6692, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.0688941478729248, + "rewards/margins": 0.04043982923030853, + "rewards/rejected": 0.02845432423055172, + "step": 60 + }, + { + "epoch": 0.04, + "learning_rate": 7.494646680942184e-09, + "logits/chosen": 0.9182626008987427, + "logits/rejected": 1.827265977859497, + "logps/chosen": -396.9792785644531, + "logps/rejected": -330.5106506347656, + "loss": 0.6531, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1199527382850647, + "rewards/margins": 0.07451293617486954, + "rewards/rejected": 0.04543980211019516, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 8.565310492505352e-09, + "logits/chosen": 0.5657048225402832, + "logits/rejected": 2.0772719383239746, + "logps/chosen": -467.70233154296875, + "logps/rejected": -340.50457763671875, + "loss": 0.6347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.18721021711826324, + "rewards/margins": 0.12396695464849472, + "rewards/rejected": 0.06324325501918793, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 9.635974304068522e-09, + "logits/chosen": 0.9224559664726257, + "logits/rejected": 1.6432113647460938, + "logps/chosen": -410.8121032714844, + "logps/rejected": -300.57012939453125, + "loss": 0.608, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.28949785232543945, + "rewards/margins": 0.21210959553718567, + "rewards/rejected": 0.07738825678825378, + "step": 90 + }, + { + "epoch": 0.06, + "learning_rate": 1.0706638115631692e-08, + "logits/chosen": 0.8869959115982056, + "logits/rejected": 1.8451831340789795, + "logps/chosen": -403.8915710449219, + "logps/rejected": -300.415771484375, + "loss": 0.5911, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.3641214966773987, + "rewards/margins": 0.22090363502502441, + "rewards/rejected": 0.14321786165237427, + "step": 100 + }, + { + "epoch": 0.06, + "eval_logits/chosen": 0.30295610427856445, + "eval_logits/rejected": 0.8016409873962402, + "eval_logps/chosen": -395.38714599609375, + "eval_logps/rejected": -304.46209716796875, + "eval_loss": 0.5531623363494873, + "eval_rewards/accuracies": 0.84375, + "eval_rewards/chosen": 0.39543235301971436, + "eval_rewards/margins": 0.28056541085243225, + "eval_rewards/rejected": 0.1148669496178627, + "eval_runtime": 77.7884, + "eval_samples_per_second": 12.855, + "eval_steps_per_second": 0.411, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 1.177730192719486e-08, + "logits/chosen": 1.0650697946548462, + "logits/rejected": 1.743814468383789, + "logps/chosen": -376.599853515625, + "logps/rejected": -334.667724609375, + "loss": 0.5616, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.4621427655220032, + "rewards/margins": 0.31689101457595825, + "rewards/rejected": 0.14525175094604492, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 1.284796573875803e-08, + "logits/chosen": 0.7667558789253235, + "logits/rejected": 1.5485883951187134, + "logps/chosen": -444.92987060546875, + "logps/rejected": -348.63372802734375, + "loss": 0.532, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.5855865478515625, + "rewards/margins": 0.441326379776001, + "rewards/rejected": 0.14426018297672272, + "step": 120 + }, + { + "epoch": 0.08, + "learning_rate": 1.3918629550321198e-08, + "logits/chosen": 0.6718708276748657, + "logits/rejected": 1.5146424770355225, + "logps/chosen": -443.32568359375, + "logps/rejected": -351.67010498046875, + "loss": 0.5069, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.6444682478904724, + "rewards/margins": 0.5162140130996704, + "rewards/rejected": 0.128254234790802, + "step": 130 + }, + { + "epoch": 0.09, + "learning_rate": 1.4989293361884368e-08, + "logits/chosen": 0.7301766872406006, + "logits/rejected": 1.4742127656936646, + "logps/chosen": -460.937255859375, + "logps/rejected": -351.74871826171875, + "loss": 0.4694, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7888091802597046, + "rewards/margins": 0.612645149230957, + "rewards/rejected": 0.17616406083106995, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 1.6059957173447535e-08, + "logits/chosen": 1.1185513734817505, + "logits/rejected": 1.3702727556228638, + "logps/chosen": -388.2076721191406, + "logps/rejected": -313.1037902832031, + "loss": 0.4438, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.8620562553405762, + "rewards/margins": 0.5798792839050293, + "rewards/rejected": 0.2821769118309021, + "step": 150 + }, + { + "epoch": 0.1, + "learning_rate": 1.7130620985010704e-08, + "logits/chosen": 1.1629040241241455, + "logits/rejected": 1.1827285289764404, + "logps/chosen": -448.11517333984375, + "logps/rejected": -330.4544372558594, + "loss": 0.4282, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1283848285675049, + "rewards/margins": 0.9130982160568237, + "rewards/rejected": 0.2152867317199707, + "step": 160 + }, + { + "epoch": 0.11, + "learning_rate": 1.8201284796573874e-08, + "logits/chosen": 0.9314751625061035, + "logits/rejected": 1.7524402141571045, + "logps/chosen": -433.95159912109375, + "logps/rejected": -306.7549743652344, + "loss": 0.3991, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 1.0592777729034424, + "rewards/margins": 0.8145540952682495, + "rewards/rejected": 0.24472376704216003, + "step": 170 + }, + { + "epoch": 0.12, + "learning_rate": 1.9271948608137044e-08, + "logits/chosen": 0.9510858654975891, + "logits/rejected": 1.7319552898406982, + "logps/chosen": -425.33148193359375, + "logps/rejected": -348.006103515625, + "loss": 0.3595, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.3629436492919922, + "rewards/margins": 1.0799864530563354, + "rewards/rejected": 0.2829572558403015, + "step": 180 + }, + { + "epoch": 0.12, + "learning_rate": 2.0342612419700214e-08, + "logits/chosen": 0.8850847482681274, + "logits/rejected": 1.6263946294784546, + "logps/chosen": -368.7896423339844, + "logps/rejected": -318.7091064453125, + "loss": 0.3786, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.226855993270874, + "rewards/margins": 0.9453747868537903, + "rewards/rejected": 0.2814810574054718, + "step": 190 + }, + { + "epoch": 0.13, + "learning_rate": 2.1413276231263384e-08, + "logits/chosen": 0.9113885760307312, + "logits/rejected": 1.9366668462753296, + "logps/chosen": -392.1955871582031, + "logps/rejected": -291.93536376953125, + "loss": 0.3425, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 1.2265583276748657, + "rewards/margins": 1.0303113460540771, + "rewards/rejected": 0.19624683260917664, + "step": 200 + }, + { + "epoch": 0.13, + "eval_logits/chosen": 0.33036381006240845, + "eval_logits/rejected": 0.8503063321113586, + "eval_logps/chosen": -385.884765625, + "eval_logps/rejected": -303.293212890625, + "eval_loss": 0.31037652492523193, + "eval_rewards/accuracies": 0.9453125, + "eval_rewards/chosen": 1.345674753189087, + "eval_rewards/margins": 1.1139166355133057, + "eval_rewards/rejected": 0.23175781965255737, + "eval_runtime": 77.5737, + "eval_samples_per_second": 12.891, + "eval_steps_per_second": 0.413, + "step": 200 + }, + { + "epoch": 0.13, + "learning_rate": 2.248394004282655e-08, + "logits/chosen": 1.403585433959961, + "logits/rejected": 1.621122121810913, + "logps/chosen": -385.9792175292969, + "logps/rejected": -314.9217529296875, + "loss": 0.3509, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.412534475326538, + "rewards/margins": 1.2651290893554688, + "rewards/rejected": 0.1474055051803589, + "step": 210 + }, + { + "epoch": 0.14, + "learning_rate": 2.355460385438972e-08, + "logits/chosen": 0.8744575381278992, + "logits/rejected": 2.048430919647217, + "logps/chosen": -420.71820068359375, + "logps/rejected": -308.5834045410156, + "loss": 0.3222, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4642362594604492, + "rewards/margins": 1.3160054683685303, + "rewards/rejected": 0.14823095500469208, + "step": 220 + }, + { + "epoch": 0.15, + "learning_rate": 2.462526766595289e-08, + "logits/chosen": 1.1990312337875366, + "logits/rejected": 1.7153043746948242, + "logps/chosen": -431.5537109375, + "logps/rejected": -328.9321594238281, + "loss": 0.3009, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6640676259994507, + "rewards/margins": 1.5493268966674805, + "rewards/rejected": 0.11474086344242096, + "step": 230 + }, + { + "epoch": 0.15, + "learning_rate": 2.569593147751606e-08, + "logits/chosen": 1.2709139585494995, + "logits/rejected": 2.0714378356933594, + "logps/chosen": -390.81512451171875, + "logps/rejected": -353.8885803222656, + "loss": 0.2846, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.6071593761444092, + "rewards/margins": 1.5323899984359741, + "rewards/rejected": 0.07476941496133804, + "step": 240 + }, + { + "epoch": 0.16, + "learning_rate": 2.676659528907923e-08, + "logits/chosen": 1.2884495258331299, + "logits/rejected": 1.4190622568130493, + "logps/chosen": -402.27105712890625, + "logps/rejected": -316.29681396484375, + "loss": 0.2703, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.6437135934829712, + "rewards/margins": 1.6835206747055054, + "rewards/rejected": -0.039806898683309555, + "step": 250 + }, + { + "epoch": 0.17, + "learning_rate": 2.7837259100642396e-08, + "logits/chosen": 1.0041536092758179, + "logits/rejected": 1.827345848083496, + "logps/chosen": -352.20782470703125, + "logps/rejected": -281.7880859375, + "loss": 0.2568, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 1.6974023580551147, + "rewards/margins": 1.677443265914917, + "rewards/rejected": 0.019959043711423874, + "step": 260 + }, + { + "epoch": 0.17, + "learning_rate": 2.890792291220557e-08, + "logits/chosen": 1.280470848083496, + "logits/rejected": 2.0490353107452393, + "logps/chosen": -396.66546630859375, + "logps/rejected": -352.89923095703125, + "loss": 0.2423, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.8935625553131104, + "rewards/margins": 1.9584296941757202, + "rewards/rejected": -0.06486758589744568, + "step": 270 + }, + { + "epoch": 0.18, + "learning_rate": 2.9978586723768736e-08, + "logits/chosen": 1.1651164293289185, + "logits/rejected": 1.8686307668685913, + "logps/chosen": -369.2472229003906, + "logps/rejected": -350.49395751953125, + "loss": 0.2369, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.8536313772201538, + "rewards/margins": 2.0920298099517822, + "rewards/rejected": -0.23839814960956573, + "step": 280 + }, + { + "epoch": 0.19, + "learning_rate": 3.1049250535331906e-08, + "logits/chosen": 0.9764202833175659, + "logits/rejected": 1.5498555898666382, + "logps/chosen": -401.94708251953125, + "logps/rejected": -309.38751220703125, + "loss": 0.2172, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.9584842920303345, + "rewards/margins": 2.0484821796417236, + "rewards/rejected": -0.08999788761138916, + "step": 290 + }, + { + "epoch": 0.19, + "learning_rate": 3.211991434689507e-08, + "logits/chosen": 1.1941782236099243, + "logits/rejected": 2.1323885917663574, + "logps/chosen": -424.00299072265625, + "logps/rejected": -329.6502380371094, + "loss": 0.2046, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.1664984226226807, + "rewards/margins": 2.4187095165252686, + "rewards/rejected": -0.2522108256816864, + "step": 300 + }, + { + "epoch": 0.19, + "eval_logits/chosen": 0.5110462307929993, + "eval_logits/rejected": 1.0409064292907715, + "eval_logps/chosen": -381.6942138671875, + "eval_logps/rejected": -308.5774230957031, + "eval_loss": 0.1841452568769455, + "eval_rewards/accuracies": 0.9453125, + "eval_rewards/chosen": 1.764728307723999, + "eval_rewards/margins": 2.061392307281494, + "eval_rewards/rejected": -0.2966638207435608, + "eval_runtime": 77.5919, + "eval_samples_per_second": 12.888, + "eval_steps_per_second": 0.412, + "step": 300 + }, + { + "epoch": 0.2, + "learning_rate": 3.3190578158458246e-08, + "logits/chosen": 0.9687066078186035, + "logits/rejected": 1.7230606079101562, + "logps/chosen": -409.93109130859375, + "logps/rejected": -367.07464599609375, + "loss": 0.183, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 1.9146747589111328, + "rewards/margins": 2.3619022369384766, + "rewards/rejected": -0.4472277760505676, + "step": 310 + }, + { + "epoch": 0.21, + "learning_rate": 3.426124197002141e-08, + "logits/chosen": 0.9972401857376099, + "logits/rejected": 2.0866451263427734, + "logps/chosen": -459.5816345214844, + "logps/rejected": -333.5121154785156, + "loss": 0.1805, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.2129783630371094, + "rewards/margins": 2.6930830478668213, + "rewards/rejected": -0.4801049828529358, + "step": 320 + }, + { + "epoch": 0.21, + "learning_rate": 3.533190578158458e-08, + "logits/chosen": 1.417145013809204, + "logits/rejected": 1.8801225423812866, + "logps/chosen": -405.06280517578125, + "logps/rejected": -345.85809326171875, + "loss": 0.1807, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.1431164741516113, + "rewards/margins": 2.6557106971740723, + "rewards/rejected": -0.5125941038131714, + "step": 330 + }, + { + "epoch": 0.22, + "learning_rate": 3.640256959314775e-08, + "logits/chosen": 1.1074305772781372, + "logits/rejected": 2.019841194152832, + "logps/chosen": -463.04534912109375, + "logps/rejected": -334.6724853515625, + "loss": 0.1691, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.148191452026367, + "rewards/margins": 2.7028677463531494, + "rewards/rejected": -0.5546759366989136, + "step": 340 + }, + { + "epoch": 0.22, + "learning_rate": 3.747323340471092e-08, + "logits/chosen": 1.4934161901474, + "logits/rejected": 2.0615015029907227, + "logps/chosen": -408.3346862792969, + "logps/rejected": -348.22747802734375, + "loss": 0.2014, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 2.1981351375579834, + "rewards/margins": 2.8584563732147217, + "rewards/rejected": -0.6603211164474487, + "step": 350 + }, + { + "epoch": 0.23, + "learning_rate": 3.854389721627409e-08, + "logits/chosen": 1.234470009803772, + "logits/rejected": 1.3749644756317139, + "logps/chosen": -429.24169921875, + "logps/rejected": -359.1957092285156, + "loss": 0.1651, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1755428314208984, + "rewards/margins": 3.0690488815307617, + "rewards/rejected": -0.8935060501098633, + "step": 360 + }, + { + "epoch": 0.24, + "learning_rate": 3.961456102783726e-08, + "logits/chosen": 1.5004570484161377, + "logits/rejected": 2.204867124557495, + "logps/chosen": -432.32135009765625, + "logps/rejected": -330.5525207519531, + "loss": 0.1527, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.2549405097961426, + "rewards/margins": 2.9733498096466064, + "rewards/rejected": -0.7184091806411743, + "step": 370 + }, + { + "epoch": 0.24, + "learning_rate": 4.068522483940043e-08, + "logits/chosen": 1.4254530668258667, + "logits/rejected": 1.9733701944351196, + "logps/chosen": -372.3633117675781, + "logps/rejected": -325.6457824707031, + "loss": 0.1565, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8286545276641846, + "rewards/margins": 2.7208027839660645, + "rewards/rejected": -0.8921481966972351, + "step": 380 + }, + { + "epoch": 0.25, + "learning_rate": 4.175588865096359e-08, + "logits/chosen": 1.5646907091140747, + "logits/rejected": 1.819138526916504, + "logps/chosen": -377.5379943847656, + "logps/rejected": -343.8385925292969, + "loss": 0.1354, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9773420095443726, + "rewards/margins": 2.9212684631347656, + "rewards/rejected": -0.9439260363578796, + "step": 390 + }, + { + "epoch": 0.26, + "learning_rate": 4.282655246252677e-08, + "logits/chosen": 1.4840887784957886, + "logits/rejected": 1.9725840091705322, + "logps/chosen": -434.97784423828125, + "logps/rejected": -331.9248352050781, + "loss": 0.1596, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.0752744674682617, + "rewards/margins": 3.0398762226104736, + "rewards/rejected": -0.9646021127700806, + "step": 400 + }, + { + "epoch": 0.26, + "eval_logits/chosen": 0.6985446214675903, + "eval_logits/rejected": 1.232299566268921, + "eval_logps/chosen": -381.2061767578125, + "eval_logps/rejected": -316.1226806640625, + "eval_loss": 0.13778163492679596, + "eval_rewards/accuracies": 0.953125, + "eval_rewards/chosen": 1.813530683517456, + "eval_rewards/margins": 2.8647243976593018, + "eval_rewards/rejected": -1.0511937141418457, + "eval_runtime": 77.7071, + "eval_samples_per_second": 12.869, + "eval_steps_per_second": 0.412, + "step": 400 + }, + { + "epoch": 0.26, + "learning_rate": 4.389721627408993e-08, + "logits/chosen": 1.466382622718811, + "logits/rejected": 2.043748617172241, + "logps/chosen": -327.84527587890625, + "logps/rejected": -320.1709899902344, + "loss": 0.1566, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 1.6711170673370361, + "rewards/margins": 2.701075315475464, + "rewards/rejected": -1.0299583673477173, + "step": 410 + }, + { + "epoch": 0.27, + "learning_rate": 4.49678800856531e-08, + "logits/chosen": 0.8124781847000122, + "logits/rejected": 2.2421953678131104, + "logps/chosen": -400.47650146484375, + "logps/rejected": -366.773681640625, + "loss": 0.1388, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.0225610733032227, + "rewards/margins": 3.3162319660186768, + "rewards/rejected": -1.293670654296875, + "step": 420 + }, + { + "epoch": 0.28, + "learning_rate": 4.603854389721627e-08, + "logits/chosen": 1.411001443862915, + "logits/rejected": 2.1695659160614014, + "logps/chosen": -381.0799255371094, + "logps/rejected": -321.12640380859375, + "loss": 0.1331, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.094374656677246, + "rewards/margins": 3.4128735065460205, + "rewards/rejected": -1.3184987306594849, + "step": 430 + }, + { + "epoch": 0.28, + "learning_rate": 4.710920770877944e-08, + "logits/chosen": 1.3558982610702515, + "logits/rejected": 2.4410338401794434, + "logps/chosen": -366.334716796875, + "logps/rejected": -347.8705139160156, + "loss": 0.1456, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.165606737136841, + "rewards/margins": 3.6063430309295654, + "rewards/rejected": -1.4407367706298828, + "step": 440 + }, + { + "epoch": 0.29, + "learning_rate": 4.817987152034261e-08, + "logits/chosen": 1.2681734561920166, + "logits/rejected": 2.0421833992004395, + "logps/chosen": -482.25091552734375, + "logps/rejected": -355.57904052734375, + "loss": 0.1114, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3174893856048584, + "rewards/margins": 3.938343048095703, + "rewards/rejected": -1.6208534240722656, + "step": 450 + }, + { + "epoch": 0.3, + "learning_rate": 4.925053533190578e-08, + "logits/chosen": 1.3757874965667725, + "logits/rejected": 2.598388195037842, + "logps/chosen": -371.0218200683594, + "logps/rejected": -349.3445129394531, + "loss": 0.1312, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.7905902862548828, + "rewards/margins": 3.369292736053467, + "rewards/rejected": -1.5787023305892944, + "step": 460 + }, + { + "epoch": 0.3, + "learning_rate": 5.032119914346895e-08, + "logits/chosen": 1.4324853420257568, + "logits/rejected": 2.03037428855896, + "logps/chosen": -403.80120849609375, + "logps/rejected": -359.4237976074219, + "loss": 0.0932, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.2499053478240967, + "rewards/margins": 3.795927047729492, + "rewards/rejected": -1.5460216999053955, + "step": 470 + }, + { + "epoch": 0.31, + "learning_rate": 5.139186295503212e-08, + "logits/chosen": 1.1828899383544922, + "logits/rejected": 2.248396873474121, + "logps/chosen": -442.19970703125, + "logps/rejected": -360.25006103515625, + "loss": 0.123, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.486384868621826, + "rewards/margins": 4.412066459655762, + "rewards/rejected": -1.9256811141967773, + "step": 480 + }, + { + "epoch": 0.31, + "learning_rate": 5.246252676659528e-08, + "logits/chosen": 0.9969785809516907, + "logits/rejected": 2.016841173171997, + "logps/chosen": -364.4849853515625, + "logps/rejected": -346.4527893066406, + "loss": 0.1157, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.7432949542999268, + "rewards/margins": 3.481992721557617, + "rewards/rejected": -1.7386982440948486, + "step": 490 + }, + { + "epoch": 0.32, + "learning_rate": 5.353319057815846e-08, + "logits/chosen": 1.628588080406189, + "logits/rejected": 2.0982065200805664, + "logps/chosen": -381.443115234375, + "logps/rejected": -362.29486083984375, + "loss": 0.1153, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.0162081718444824, + "rewards/margins": 4.036345958709717, + "rewards/rejected": -2.0201380252838135, + "step": 500 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 0.9163604378700256, + "eval_logits/rejected": 1.455054521560669, + "eval_logps/chosen": -380.87884521484375, + "eval_logps/rejected": -324.88177490234375, + "eval_loss": 0.10337930172681808, + "eval_rewards/accuracies": 0.9375, + "eval_rewards/chosen": 1.8462636470794678, + "eval_rewards/margins": 3.773362159729004, + "eval_rewards/rejected": -1.9270987510681152, + "eval_runtime": 77.7201, + "eval_samples_per_second": 12.867, + "eval_steps_per_second": 0.412, + "step": 500 + }, + { + "epoch": 0.33, + "learning_rate": 5.460385438972163e-08, + "logits/chosen": 1.4893255233764648, + "logits/rejected": 1.7933692932128906, + "logps/chosen": -441.28399658203125, + "logps/rejected": -357.0665588378906, + "loss": 0.0929, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.2136569023132324, + "rewards/margins": 4.3935346603393555, + "rewards/rejected": -2.179877758026123, + "step": 510 + }, + { + "epoch": 0.33, + "learning_rate": 5.567451820128479e-08, + "logits/chosen": 1.6450592279434204, + "logits/rejected": 2.41060209274292, + "logps/chosen": -457.15667724609375, + "logps/rejected": -374.75115966796875, + "loss": 0.0995, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.662111759185791, + "rewards/margins": 5.2003607749938965, + "rewards/rejected": -2.5382492542266846, + "step": 520 + }, + { + "epoch": 0.34, + "learning_rate": 5.6745182012847956e-08, + "logits/chosen": 1.7802613973617554, + "logits/rejected": 2.5348358154296875, + "logps/chosen": -378.1894836425781, + "logps/rejected": -335.26129150390625, + "loss": 0.1062, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.415276050567627, + "rewards/margins": 4.646345138549805, + "rewards/rejected": -2.231069564819336, + "step": 530 + }, + { + "epoch": 0.35, + "learning_rate": 5.781584582441114e-08, + "logits/chosen": 1.8911815881729126, + "logits/rejected": 2.2930028438568115, + "logps/chosen": -445.2462463378906, + "logps/rejected": -343.48150634765625, + "loss": 0.102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.925233244895935, + "rewards/margins": 4.143408298492432, + "rewards/rejected": -2.2181754112243652, + "step": 540 + }, + { + "epoch": 0.35, + "learning_rate": 5.88865096359743e-08, + "logits/chosen": 1.3622030019760132, + "logits/rejected": 2.7591183185577393, + "logps/chosen": -417.8857421875, + "logps/rejected": -337.2507629394531, + "loss": 0.1003, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.4883484840393066, + "rewards/margins": 4.856560707092285, + "rewards/rejected": -2.3682124614715576, + "step": 550 + }, + { + "epoch": 0.36, + "learning_rate": 5.995717344753747e-08, + "logits/chosen": 1.885152816772461, + "logits/rejected": 2.445096492767334, + "logps/chosen": -413.9400939941406, + "logps/rejected": -347.16864013671875, + "loss": 0.1078, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.7049505710601807, + "rewards/margins": 4.317243576049805, + "rewards/rejected": -2.612293243408203, + "step": 560 + }, + { + "epoch": 0.37, + "learning_rate": 6.102783725910064e-08, + "logits/chosen": 1.8548791408538818, + "logits/rejected": 2.9488348960876465, + "logps/chosen": -399.66888427734375, + "logps/rejected": -361.6997375488281, + "loss": 0.0833, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.391327381134033, + "rewards/margins": 5.426538467407227, + "rewards/rejected": -3.0352110862731934, + "step": 570 + }, + { + "epoch": 0.37, + "learning_rate": 6.209850107066381e-08, + "logits/chosen": 1.9805002212524414, + "logits/rejected": 2.253380537033081, + "logps/chosen": -363.88214111328125, + "logps/rejected": -356.90264892578125, + "loss": 0.0771, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.9852304458618164, + "rewards/margins": 4.773615837097168, + "rewards/rejected": -2.7883856296539307, + "step": 580 + }, + { + "epoch": 0.38, + "learning_rate": 6.316916488222698e-08, + "logits/chosen": 1.8189414739608765, + "logits/rejected": 2.1621651649475098, + "logps/chosen": -376.37664794921875, + "logps/rejected": -344.7612609863281, + "loss": 0.1041, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.4361340999603271, + "rewards/margins": 4.317945957183838, + "rewards/rejected": -2.8818118572235107, + "step": 590 + }, + { + "epoch": 0.39, + "learning_rate": 6.423982869379014e-08, + "logits/chosen": 1.9913393259048462, + "logits/rejected": 2.2392477989196777, + "logps/chosen": -409.7967834472656, + "logps/rejected": -371.5293884277344, + "loss": 0.0904, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0530600547790527, + "rewards/margins": 5.140936851501465, + "rewards/rejected": -3.087876081466675, + "step": 600 + }, + { + "epoch": 0.39, + "eval_logits/chosen": 1.1242132186889648, + "eval_logits/rejected": 1.6480361223220825, + "eval_logps/chosen": -383.8947448730469, + "eval_logps/rejected": -335.6293029785156, + "eval_loss": 0.08573687076568604, + "eval_rewards/accuracies": 0.9453125, + "eval_rewards/chosen": 1.5446751117706299, + "eval_rewards/margins": 4.5465288162231445, + "eval_rewards/rejected": -3.0018532276153564, + "eval_runtime": 77.586, + "eval_samples_per_second": 12.889, + "eval_steps_per_second": 0.412, + "step": 600 + }, + { + "epoch": 0.39, + "learning_rate": 6.531049250535332e-08, + "logits/chosen": 1.5741275548934937, + "logits/rejected": 2.2887330055236816, + "logps/chosen": -405.8610534667969, + "logps/rejected": -368.2715759277344, + "loss": 0.0841, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9225307703018188, + "rewards/margins": 5.187340259552002, + "rewards/rejected": -3.2648093700408936, + "step": 610 + }, + { + "epoch": 0.4, + "learning_rate": 6.638115631691649e-08, + "logits/chosen": 1.3936151266098022, + "logits/rejected": 2.8601880073547363, + "logps/chosen": -414.2528381347656, + "logps/rejected": -394.7596435546875, + "loss": 0.079, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7199567556381226, + "rewards/margins": 5.62530517578125, + "rewards/rejected": -3.905348300933838, + "step": 620 + }, + { + "epoch": 0.4, + "learning_rate": 6.745182012847965e-08, + "logits/chosen": 1.7785946130752563, + "logits/rejected": 2.8865761756896973, + "logps/chosen": -429.94580078125, + "logps/rejected": -385.0089416503906, + "loss": 0.085, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8962761163711548, + "rewards/margins": 5.370936393737793, + "rewards/rejected": -3.474660873413086, + "step": 630 + }, + { + "epoch": 0.41, + "learning_rate": 6.852248394004282e-08, + "logits/chosen": 1.801674246788025, + "logits/rejected": 2.2879269123077393, + "logps/chosen": -424.087158203125, + "logps/rejected": -360.2523498535156, + "loss": 0.0998, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.9135773181915283, + "rewards/margins": 5.04227352142334, + "rewards/rejected": -3.1286959648132324, + "step": 640 + }, + { + "epoch": 0.42, + "learning_rate": 6.9593147751606e-08, + "logits/chosen": 1.8188073635101318, + "logits/rejected": 2.1497268676757812, + "logps/chosen": -423.1458435058594, + "logps/rejected": -369.9307861328125, + "loss": 0.082, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.321187138557434, + "rewards/margins": 4.85211706161499, + "rewards/rejected": -3.5309300422668457, + "step": 650 + }, + { + "epoch": 0.42, + "learning_rate": 7.066381156316916e-08, + "logits/chosen": 1.7404190301895142, + "logits/rejected": 2.677952289581299, + "logps/chosen": -402.8115234375, + "logps/rejected": -340.89337158203125, + "loss": 0.0725, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 2.031486749649048, + "rewards/margins": 5.562923908233643, + "rewards/rejected": -3.531437397003174, + "step": 660 + }, + { + "epoch": 0.43, + "learning_rate": 7.173447537473233e-08, + "logits/chosen": 1.8731329441070557, + "logits/rejected": 2.752986431121826, + "logps/chosen": -387.5521240234375, + "logps/rejected": -344.8697204589844, + "loss": 0.0797, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3953731060028076, + "rewards/margins": 5.006608963012695, + "rewards/rejected": -3.611236095428467, + "step": 670 + }, + { + "epoch": 0.44, + "learning_rate": 7.28051391862955e-08, + "logits/chosen": 1.765631914138794, + "logits/rejected": 2.9511940479278564, + "logps/chosen": -412.4698791503906, + "logps/rejected": -378.50537109375, + "loss": 0.0922, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7086801528930664, + "rewards/margins": 5.677321434020996, + "rewards/rejected": -3.968641757965088, + "step": 680 + }, + { + "epoch": 0.44, + "learning_rate": 7.387580299785867e-08, + "logits/chosen": 1.7808834314346313, + "logits/rejected": 2.730713367462158, + "logps/chosen": -375.7123718261719, + "logps/rejected": -352.998046875, + "loss": 0.0794, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.92323899269104, + "rewards/margins": 5.590106010437012, + "rewards/rejected": -3.6668670177459717, + "step": 690 + }, + { + "epoch": 0.45, + "learning_rate": 7.494646680942184e-08, + "logits/chosen": 1.7647113800048828, + "logits/rejected": 2.6917636394500732, + "logps/chosen": -393.0909729003906, + "logps/rejected": -375.1470947265625, + "loss": 0.0754, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.228740930557251, + "rewards/margins": 6.255539417266846, + "rewards/rejected": -4.026798248291016, + "step": 700 + }, + { + "epoch": 0.45, + "eval_logits/chosen": 1.1851357221603394, + "eval_logits/rejected": 1.7113410234451294, + "eval_logps/chosen": -382.10595703125, + "eval_logps/rejected": -341.21160888671875, + "eval_loss": 0.07382317632436752, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": 1.7235567569732666, + "eval_rewards/margins": 5.283637523651123, + "eval_rewards/rejected": -3.5600812435150146, + "eval_runtime": 78.3559, + "eval_samples_per_second": 12.762, + "eval_steps_per_second": 0.408, + "step": 700 + }, + { + "epoch": 0.46, + "learning_rate": 7.601713062098501e-08, + "logits/chosen": 1.8762671947479248, + "logits/rejected": 2.7768630981445312, + "logps/chosen": -403.41461181640625, + "logps/rejected": -348.1267395019531, + "loss": 0.0767, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.008270502090454, + "rewards/margins": 5.45863151550293, + "rewards/rejected": -3.4503607749938965, + "step": 710 + }, + { + "epoch": 0.46, + "learning_rate": 7.708779443254818e-08, + "logits/chosen": 1.9685356616973877, + "logits/rejected": 2.4259753227233887, + "logps/chosen": -423.30694580078125, + "logps/rejected": -355.1989440917969, + "loss": 0.0738, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1973319053649902, + "rewards/margins": 5.911899566650391, + "rewards/rejected": -3.7145678997039795, + "step": 720 + }, + { + "epoch": 0.47, + "learning_rate": 7.815845824411135e-08, + "logits/chosen": 1.7616288661956787, + "logits/rejected": 2.778398036956787, + "logps/chosen": -415.79833984375, + "logps/rejected": -365.4802551269531, + "loss": 0.0838, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.6791164875030518, + "rewards/margins": 5.412492275238037, + "rewards/rejected": -3.7333762645721436, + "step": 730 + }, + { + "epoch": 0.48, + "learning_rate": 7.922912205567452e-08, + "logits/chosen": 1.911268949508667, + "logits/rejected": 2.5959761142730713, + "logps/chosen": -399.43463134765625, + "logps/rejected": -380.5097961425781, + "loss": 0.0822, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.497511625289917, + "rewards/margins": 5.510095596313477, + "rewards/rejected": -4.0125837326049805, + "step": 740 + }, + { + "epoch": 0.48, + "learning_rate": 8.029978586723767e-08, + "logits/chosen": 1.8361726999282837, + "logits/rejected": 2.3821051120758057, + "logps/chosen": -372.65618896484375, + "logps/rejected": -381.7906799316406, + "loss": 0.0662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.844112753868103, + "rewards/margins": 6.229226589202881, + "rewards/rejected": -4.385113716125488, + "step": 750 + }, + { + "epoch": 0.49, + "learning_rate": 8.137044967880086e-08, + "logits/chosen": 2.179152011871338, + "logits/rejected": 2.5297486782073975, + "logps/chosen": -396.2829284667969, + "logps/rejected": -352.2908935546875, + "loss": 0.0683, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.0879392623901367, + "rewards/margins": 6.044869422912598, + "rewards/rejected": -3.956930160522461, + "step": 760 + }, + { + "epoch": 0.49, + "learning_rate": 8.244111349036403e-08, + "logits/chosen": 1.5910015106201172, + "logits/rejected": 2.9595401287078857, + "logps/chosen": -386.81573486328125, + "logps/rejected": -350.80303955078125, + "loss": 0.0697, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7900558710098267, + "rewards/margins": 5.987098693847656, + "rewards/rejected": -4.197042942047119, + "step": 770 + }, + { + "epoch": 0.5, + "learning_rate": 8.351177730192718e-08, + "logits/chosen": 1.8572914600372314, + "logits/rejected": 2.7143654823303223, + "logps/chosen": -416.63238525390625, + "logps/rejected": -403.2386169433594, + "loss": 0.0668, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9369878768920898, + "rewards/margins": 6.391732692718506, + "rewards/rejected": -4.4547438621521, + "step": 780 + }, + { + "epoch": 0.51, + "learning_rate": 8.458244111349035e-08, + "logits/chosen": 1.7877603769302368, + "logits/rejected": 3.0778090953826904, + "logps/chosen": -436.93121337890625, + "logps/rejected": -375.30499267578125, + "loss": 0.0639, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7189620733261108, + "rewards/margins": 6.110939979553223, + "rewards/rejected": -4.391977787017822, + "step": 790 + }, + { + "epoch": 0.51, + "learning_rate": 8.565310492505354e-08, + "logits/chosen": 1.8236795663833618, + "logits/rejected": 3.053520441055298, + "logps/chosen": -440.10711669921875, + "logps/rejected": -371.2597351074219, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6093488931655884, + "rewards/margins": 5.858896732330322, + "rewards/rejected": -4.249547004699707, + "step": 800 + }, + { + "epoch": 0.51, + "eval_logits/chosen": 1.2858448028564453, + "eval_logits/rejected": 1.8164113759994507, + "eval_logps/chosen": -386.32391357421875, + "eval_logps/rejected": -349.60479736328125, + "eval_loss": 0.0711909607052803, + "eval_rewards/accuracies": 0.9765625, + "eval_rewards/chosen": 1.3017570972442627, + "eval_rewards/margins": 5.701159477233887, + "eval_rewards/rejected": -4.399402618408203, + "eval_runtime": 78.5065, + "eval_samples_per_second": 12.738, + "eval_steps_per_second": 0.408, + "step": 800 + }, + { + "epoch": 0.52, + "learning_rate": 8.672376873661669e-08, + "logits/chosen": 2.046654462814331, + "logits/rejected": 2.875682830810547, + "logps/chosen": -367.9713134765625, + "logps/rejected": -358.0971984863281, + "loss": 0.0733, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.6088149547576904, + "rewards/margins": 6.157548427581787, + "rewards/rejected": -4.548734188079834, + "step": 810 + }, + { + "epoch": 0.53, + "learning_rate": 8.779443254817986e-08, + "logits/chosen": 2.2108118534088135, + "logits/rejected": 2.811412811279297, + "logps/chosen": -388.9371032714844, + "logps/rejected": -359.3924255371094, + "loss": 0.0554, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.500943899154663, + "rewards/margins": 6.306944370269775, + "rewards/rejected": -4.806000709533691, + "step": 820 + }, + { + "epoch": 0.53, + "learning_rate": 8.886509635974304e-08, + "logits/chosen": 1.7628936767578125, + "logits/rejected": 2.9087016582489014, + "logps/chosen": -436.14886474609375, + "logps/rejected": -375.5445251464844, + "loss": 0.0587, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.7326514720916748, + "rewards/margins": 6.393059730529785, + "rewards/rejected": -4.660407543182373, + "step": 830 + }, + { + "epoch": 0.54, + "learning_rate": 8.99357601713062e-08, + "logits/chosen": 1.880814552307129, + "logits/rejected": 2.7161240577697754, + "logps/chosen": -395.0550231933594, + "logps/rejected": -363.26849365234375, + "loss": 0.0616, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.268677830696106, + "rewards/margins": 5.726746559143066, + "rewards/rejected": -4.45806884765625, + "step": 840 + }, + { + "epoch": 0.55, + "learning_rate": 9.100642398286937e-08, + "logits/chosen": 1.7003024816513062, + "logits/rejected": 2.424133777618408, + "logps/chosen": -389.845703125, + "logps/rejected": -358.5777282714844, + "loss": 0.0776, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.434398889541626, + "rewards/margins": 5.904941082000732, + "rewards/rejected": -4.470543384552002, + "step": 850 + }, + { + "epoch": 0.55, + "learning_rate": 9.207708779443254e-08, + "logits/chosen": 1.8526960611343384, + "logits/rejected": 2.986250638961792, + "logps/chosen": -414.87744140625, + "logps/rejected": -394.83477783203125, + "loss": 0.0555, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.5953035354614258, + "rewards/margins": 6.830922603607178, + "rewards/rejected": -5.23561954498291, + "step": 860 + }, + { + "epoch": 0.56, + "learning_rate": 9.314775160599571e-08, + "logits/chosen": 2.066636800765991, + "logits/rejected": 2.8474645614624023, + "logps/chosen": -422.1144104003906, + "logps/rejected": -373.4290466308594, + "loss": 0.059, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4623820781707764, + "rewards/margins": 6.585521697998047, + "rewards/rejected": -5.123138904571533, + "step": 870 + }, + { + "epoch": 0.57, + "learning_rate": 9.421841541755888e-08, + "logits/chosen": 2.3123459815979004, + "logits/rejected": 2.8036293983459473, + "logps/chosen": -407.44525146484375, + "logps/rejected": -358.09771728515625, + "loss": 0.0736, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 1.7242000102996826, + "rewards/margins": 6.361567974090576, + "rewards/rejected": -4.637368202209473, + "step": 880 + }, + { + "epoch": 0.57, + "learning_rate": 9.528907922912205e-08, + "logits/chosen": 1.5694644451141357, + "logits/rejected": 2.9323623180389404, + "logps/chosen": -440.52264404296875, + "logps/rejected": -416.00543212890625, + "loss": 0.0513, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.229987621307373, + "rewards/margins": 7.332463264465332, + "rewards/rejected": -5.102475166320801, + "step": 890 + }, + { + "epoch": 0.58, + "learning_rate": 9.635974304068522e-08, + "logits/chosen": 1.9838998317718506, + "logits/rejected": 3.293696165084839, + "logps/chosen": -377.60546875, + "logps/rejected": -400.6976013183594, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.361082911491394, + "rewards/margins": 7.032995700836182, + "rewards/rejected": -5.6719136238098145, + "step": 900 + }, + { + "epoch": 0.58, + "eval_logits/chosen": 1.431064486503601, + "eval_logits/rejected": 1.926888108253479, + "eval_logps/chosen": -389.20965576171875, + "eval_logps/rejected": -357.00994873046875, + "eval_loss": 0.06582893431186676, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": 1.0131824016571045, + "eval_rewards/margins": 6.153097629547119, + "eval_rewards/rejected": -5.1399149894714355, + "eval_runtime": 78.1749, + "eval_samples_per_second": 12.792, + "eval_steps_per_second": 0.409, + "step": 900 + }, + { + "epoch": 0.58, + "learning_rate": 9.743040685224839e-08, + "logits/chosen": 2.3193790912628174, + "logits/rejected": 2.9782004356384277, + "logps/chosen": -355.58978271484375, + "logps/rejected": -374.8011779785156, + "loss": 0.0504, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7792609930038452, + "rewards/margins": 6.809684753417969, + "rewards/rejected": -5.030424118041992, + "step": 910 + }, + { + "epoch": 0.59, + "learning_rate": 9.850107066381156e-08, + "logits/chosen": 1.8106858730316162, + "logits/rejected": 2.700303316116333, + "logps/chosen": -436.91912841796875, + "logps/rejected": -376.2388610839844, + "loss": 0.0524, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.020312786102295, + "rewards/margins": 7.166808128356934, + "rewards/rejected": -5.146495342254639, + "step": 920 + }, + { + "epoch": 0.6, + "learning_rate": 9.957173447537473e-08, + "logits/chosen": 1.956599235534668, + "logits/rejected": 2.98834490776062, + "logps/chosen": -432.9703063964844, + "logps/rejected": -390.3727722167969, + "loss": 0.0802, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9984395503997803, + "rewards/margins": 7.112124443054199, + "rewards/rejected": -5.11368465423584, + "step": 930 + }, + { + "epoch": 0.6, + "learning_rate": 9.992858843132586e-08, + "logits/chosen": 2.160102605819702, + "logits/rejected": 2.821474313735962, + "logps/chosen": -424.42962646484375, + "logps/rejected": -402.60235595703125, + "loss": 0.059, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0432326793670654, + "rewards/margins": 7.653304100036621, + "rewards/rejected": -5.610072135925293, + "step": 940 + }, + { + "epoch": 0.61, + "learning_rate": 9.980956915020233e-08, + "logits/chosen": 1.941819190979004, + "logits/rejected": 3.0604381561279297, + "logps/chosen": -397.3748474121094, + "logps/rejected": -423.2496643066406, + "loss": 0.0557, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0661425590515137, + "rewards/margins": 7.666254997253418, + "rewards/rejected": -5.600112438201904, + "step": 950 + }, + { + "epoch": 0.62, + "learning_rate": 9.969054986907879e-08, + "logits/chosen": 1.859452247619629, + "logits/rejected": 3.1880416870117188, + "logps/chosen": -423.7708435058594, + "logps/rejected": -383.41259765625, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7157636880874634, + "rewards/margins": 7.163266181945801, + "rewards/rejected": -5.447502136230469, + "step": 960 + }, + { + "epoch": 0.62, + "learning_rate": 9.957153058795524e-08, + "logits/chosen": 1.755894660949707, + "logits/rejected": 2.667271852493286, + "logps/chosen": -405.3694763183594, + "logps/rejected": -380.8141784667969, + "loss": 0.0777, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.459249496459961, + "rewards/margins": 6.949099540710449, + "rewards/rejected": -5.489850044250488, + "step": 970 + }, + { + "epoch": 0.63, + "learning_rate": 9.94525113068317e-08, + "logits/chosen": 1.9811556339263916, + "logits/rejected": 3.005946397781372, + "logps/chosen": -364.09521484375, + "logps/rejected": -364.8800048828125, + "loss": 0.0533, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8379303216934204, + "rewards/margins": 6.930338382720947, + "rewards/rejected": -5.092407703399658, + "step": 980 + }, + { + "epoch": 0.64, + "learning_rate": 9.933349202570817e-08, + "logits/chosen": 1.7156349420547485, + "logits/rejected": 2.972471237182617, + "logps/chosen": -424.38519287109375, + "logps/rejected": -384.4205627441406, + "loss": 0.0545, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.169966459274292, + "rewards/margins": 7.766200065612793, + "rewards/rejected": -5.596234321594238, + "step": 990 + }, + { + "epoch": 0.64, + "learning_rate": 9.921447274458463e-08, + "logits/chosen": 1.65463387966156, + "logits/rejected": 3.0111804008483887, + "logps/chosen": -466.137451171875, + "logps/rejected": -387.94317626953125, + "loss": 0.0597, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.5395125150680542, + "rewards/margins": 7.153542518615723, + "rewards/rejected": -5.614029884338379, + "step": 1000 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.426473617553711, + "eval_logits/rejected": 1.9225867986679077, + "eval_logps/chosen": -391.24505615234375, + "eval_logps/rejected": -361.6107482910156, + "eval_loss": 0.06333575397729874, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": 0.8096399307250977, + "eval_rewards/margins": 6.409637928009033, + "eval_rewards/rejected": -5.5999979972839355, + "eval_runtime": 78.1719, + "eval_samples_per_second": 12.792, + "eval_steps_per_second": 0.409, + "step": 1000 + }, + { + "epoch": 0.65, + "learning_rate": 9.909545346346108e-08, + "logits/chosen": 2.0751636028289795, + "logits/rejected": 2.8259801864624023, + "logps/chosen": -391.50396728515625, + "logps/rejected": -390.41680908203125, + "loss": 0.0479, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0614975690841675, + "rewards/margins": 6.492199897766113, + "rewards/rejected": -5.430701732635498, + "step": 1010 + }, + { + "epoch": 0.66, + "learning_rate": 9.897643418233753e-08, + "logits/chosen": 1.8207337856292725, + "logits/rejected": 2.810199737548828, + "logps/chosen": -412.99658203125, + "logps/rejected": -401.36126708984375, + "loss": 0.0404, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5562200546264648, + "rewards/margins": 8.119148254394531, + "rewards/rejected": -6.562928199768066, + "step": 1020 + }, + { + "epoch": 0.66, + "learning_rate": 9.885741490121398e-08, + "logits/chosen": 2.1346724033355713, + "logits/rejected": 2.99312424659729, + "logps/chosen": -427.60601806640625, + "logps/rejected": -379.160400390625, + "loss": 0.0493, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.149390697479248, + "rewards/margins": 7.403092384338379, + "rewards/rejected": -5.253701210021973, + "step": 1030 + }, + { + "epoch": 0.67, + "learning_rate": 9.873839562009045e-08, + "logits/chosen": 1.81674063205719, + "logits/rejected": 3.14642071723938, + "logps/chosen": -410.53436279296875, + "logps/rejected": -422.78790283203125, + "loss": 0.0625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.664518117904663, + "rewards/margins": 7.533532619476318, + "rewards/rejected": -5.869014739990234, + "step": 1040 + }, + { + "epoch": 0.67, + "learning_rate": 9.861937633896691e-08, + "logits/chosen": 2.1859567165374756, + "logits/rejected": 2.774512767791748, + "logps/chosen": -371.1358642578125, + "logps/rejected": -376.1512451171875, + "loss": 0.0397, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8732522130012512, + "rewards/margins": 6.792318820953369, + "rewards/rejected": -5.919065952301025, + "step": 1050 + }, + { + "epoch": 0.68, + "learning_rate": 9.850035705784336e-08, + "logits/chosen": 2.508104085922241, + "logits/rejected": 2.7455363273620605, + "logps/chosen": -438.8089294433594, + "logps/rejected": -426.82464599609375, + "loss": 0.0433, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.799186110496521, + "rewards/margins": 8.152082443237305, + "rewards/rejected": -6.352896690368652, + "step": 1060 + }, + { + "epoch": 0.69, + "learning_rate": 9.838133777671982e-08, + "logits/chosen": 2.162297010421753, + "logits/rejected": 2.9005820751190186, + "logps/chosen": -393.8660583496094, + "logps/rejected": -402.96649169921875, + "loss": 0.0523, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1076323986053467, + "rewards/margins": 7.283668518066406, + "rewards/rejected": -6.1760358810424805, + "step": 1070 + }, + { + "epoch": 0.69, + "learning_rate": 9.826231849559629e-08, + "logits/chosen": 2.274879217147827, + "logits/rejected": 2.933121681213379, + "logps/chosen": -436.8518981933594, + "logps/rejected": -405.71246337890625, + "loss": 0.0622, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.486023187637329, + "rewards/margins": 7.599495887756348, + "rewards/rejected": -6.1134724617004395, + "step": 1080 + }, + { + "epoch": 0.7, + "learning_rate": 9.814329921447275e-08, + "logits/chosen": 2.3372159004211426, + "logits/rejected": 2.4765231609344482, + "logps/chosen": -394.52398681640625, + "logps/rejected": -383.6842956542969, + "loss": 0.047, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5595790147781372, + "rewards/margins": 7.81160831451416, + "rewards/rejected": -6.2520294189453125, + "step": 1090 + }, + { + "epoch": 0.71, + "learning_rate": 9.80242799333492e-08, + "logits/chosen": 2.076772689819336, + "logits/rejected": 3.134021282196045, + "logps/chosen": -424.93914794921875, + "logps/rejected": -395.6697998046875, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.405705451965332, + "rewards/margins": 7.619529724121094, + "rewards/rejected": -6.213824272155762, + "step": 1100 + }, + { + "epoch": 0.71, + "eval_logits/chosen": 1.5631608963012695, + "eval_logits/rejected": 2.0626633167266846, + "eval_logps/chosen": -392.5325012207031, + "eval_logps/rejected": -367.3223876953125, + "eval_loss": 0.060555677860975266, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": 0.6809001564979553, + "eval_rewards/margins": 6.852060317993164, + "eval_rewards/rejected": -6.171159744262695, + "eval_runtime": 78.2923, + "eval_samples_per_second": 12.773, + "eval_steps_per_second": 0.409, + "step": 1100 + }, + { + "epoch": 0.71, + "learning_rate": 9.790526065222565e-08, + "logits/chosen": 2.313490629196167, + "logits/rejected": 2.873136281967163, + "logps/chosen": -429.7516174316406, + "logps/rejected": -405.4490051269531, + "loss": 0.0419, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5168659687042236, + "rewards/margins": 7.972109794616699, + "rewards/rejected": -6.455244541168213, + "step": 1110 + }, + { + "epoch": 0.72, + "learning_rate": 9.778624137110211e-08, + "logits/chosen": 2.1459672451019287, + "logits/rejected": 2.9044458866119385, + "logps/chosen": -420.87744140625, + "logps/rejected": -377.68896484375, + "loss": 0.0336, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6729495525360107, + "rewards/margins": 7.566412925720215, + "rewards/rejected": -5.893463134765625, + "step": 1120 + }, + { + "epoch": 0.73, + "learning_rate": 9.766722208997857e-08, + "logits/chosen": 2.3557143211364746, + "logits/rejected": 2.725691318511963, + "logps/chosen": -405.68109130859375, + "logps/rejected": -419.1329040527344, + "loss": 0.0552, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7717654705047607, + "rewards/margins": 8.059925079345703, + "rewards/rejected": -6.288159370422363, + "step": 1130 + }, + { + "epoch": 0.73, + "learning_rate": 9.754820280885503e-08, + "logits/chosen": 2.216296672821045, + "logits/rejected": 2.695742130279541, + "logps/chosen": -415.5310974121094, + "logps/rejected": -400.45263671875, + "loss": 0.0422, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.6496362686157227, + "rewards/margins": 7.936199188232422, + "rewards/rejected": -6.286562919616699, + "step": 1140 + }, + { + "epoch": 0.74, + "learning_rate": 9.742918352773148e-08, + "logits/chosen": 2.093048572540283, + "logits/rejected": 2.7978243827819824, + "logps/chosen": -450.3804626464844, + "logps/rejected": -412.62908935546875, + "loss": 0.0573, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.3106613159179688, + "rewards/margins": 8.593067169189453, + "rewards/rejected": -6.282405853271484, + "step": 1150 + }, + { + "epoch": 0.75, + "learning_rate": 9.731016424660795e-08, + "logits/chosen": 2.472761869430542, + "logits/rejected": 3.0465588569641113, + "logps/chosen": -426.41412353515625, + "logps/rejected": -407.08612060546875, + "loss": 0.0443, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5518306493759155, + "rewards/margins": 8.157126426696777, + "rewards/rejected": -6.6052961349487305, + "step": 1160 + }, + { + "epoch": 0.75, + "learning_rate": 9.719114496548441e-08, + "logits/chosen": 1.941318154335022, + "logits/rejected": 3.335367202758789, + "logps/chosen": -418.33428955078125, + "logps/rejected": -403.39337158203125, + "loss": 0.0404, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0809710025787354, + "rewards/margins": 8.985780715942383, + "rewards/rejected": -6.90480899810791, + "step": 1170 + }, + { + "epoch": 0.76, + "learning_rate": 9.707212568436087e-08, + "logits/chosen": 2.241579532623291, + "logits/rejected": 2.8625073432922363, + "logps/chosen": -408.77178955078125, + "logps/rejected": -389.53778076171875, + "loss": 0.0384, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2259656190872192, + "rewards/margins": 8.003189086914062, + "rewards/rejected": -6.777223110198975, + "step": 1180 + }, + { + "epoch": 0.76, + "learning_rate": 9.695310640323732e-08, + "logits/chosen": 2.914790630340576, + "logits/rejected": 2.8142731189727783, + "logps/chosen": -387.0242614746094, + "logps/rejected": -399.86749267578125, + "loss": 0.0371, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.323935627937317, + "rewards/margins": 8.005804061889648, + "rewards/rejected": -6.681868553161621, + "step": 1190 + }, + { + "epoch": 0.77, + "learning_rate": 9.683408712211378e-08, + "logits/chosen": 2.1048951148986816, + "logits/rejected": 2.841226816177368, + "logps/chosen": -451.27459716796875, + "logps/rejected": -403.28240966796875, + "loss": 0.0669, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.7710305452346802, + "rewards/margins": 7.861186981201172, + "rewards/rejected": -7.090156555175781, + "step": 1200 + }, + { + "epoch": 0.77, + "eval_logits/chosen": 1.5480470657348633, + "eval_logits/rejected": 2.058847188949585, + "eval_logps/chosen": -392.0873718261719, + "eval_logps/rejected": -371.0377197265625, + "eval_loss": 0.06311403959989548, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": 0.7254116535186768, + "eval_rewards/margins": 7.268110275268555, + "eval_rewards/rejected": -6.542698860168457, + "eval_runtime": 78.0374, + "eval_samples_per_second": 12.814, + "eval_steps_per_second": 0.41, + "step": 1200 + }, + { + "epoch": 0.78, + "learning_rate": 9.671506784099024e-08, + "logits/chosen": 2.5204672813415527, + "logits/rejected": 3.2195708751678467, + "logps/chosen": -350.8345031738281, + "logps/rejected": -381.27569580078125, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.312830924987793, + "rewards/margins": 8.153268814086914, + "rewards/rejected": -6.840437889099121, + "step": 1210 + }, + { + "epoch": 0.78, + "learning_rate": 9.659604855986669e-08, + "logits/chosen": 2.0493383407592773, + "logits/rejected": 2.925226926803589, + "logps/chosen": -462.79638671875, + "logps/rejected": -402.0813903808594, + "loss": 0.0583, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8163875341415405, + "rewards/margins": 8.08704948425293, + "rewards/rejected": -6.270661354064941, + "step": 1220 + }, + { + "epoch": 0.79, + "learning_rate": 9.647702927874315e-08, + "logits/chosen": 2.3467869758605957, + "logits/rejected": 2.963789463043213, + "logps/chosen": -448.3075256347656, + "logps/rejected": -411.7268981933594, + "loss": 0.0308, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1376395225524902, + "rewards/margins": 8.935708999633789, + "rewards/rejected": -6.798068046569824, + "step": 1230 + }, + { + "epoch": 0.8, + "learning_rate": 9.63580099976196e-08, + "logits/chosen": 2.0899271965026855, + "logits/rejected": 3.0291128158569336, + "logps/chosen": -395.5885009765625, + "logps/rejected": -395.0786437988281, + "loss": 0.0437, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3425300121307373, + "rewards/margins": 7.873586177825928, + "rewards/rejected": -6.5310564041137695, + "step": 1240 + }, + { + "epoch": 0.8, + "learning_rate": 9.623899071649607e-08, + "logits/chosen": 2.6475844383239746, + "logits/rejected": 3.3692593574523926, + "logps/chosen": -355.3011169433594, + "logps/rejected": -404.4001770019531, + "loss": 0.036, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.1569859981536865, + "rewards/margins": 8.337442398071289, + "rewards/rejected": -6.18045711517334, + "step": 1250 + }, + { + "epoch": 0.81, + "learning_rate": 9.611997143537253e-08, + "logits/chosen": 2.364384412765503, + "logits/rejected": 2.7631328105926514, + "logps/chosen": -441.07391357421875, + "logps/rejected": -394.32122802734375, + "loss": 0.0434, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.301563024520874, + "rewards/margins": 8.577180862426758, + "rewards/rejected": -6.275616645812988, + "step": 1260 + }, + { + "epoch": 0.82, + "learning_rate": 9.600095215424899e-08, + "logits/chosen": 2.9918723106384277, + "logits/rejected": 3.4367504119873047, + "logps/chosen": -358.42230224609375, + "logps/rejected": -392.7529296875, + "loss": 0.0499, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5515286922454834, + "rewards/margins": 8.478203773498535, + "rewards/rejected": -6.926675319671631, + "step": 1270 + }, + { + "epoch": 0.82, + "learning_rate": 9.588193287312544e-08, + "logits/chosen": 2.2631287574768066, + "logits/rejected": 3.006317138671875, + "logps/chosen": -417.509033203125, + "logps/rejected": -430.56353759765625, + "loss": 0.0396, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.066652536392212, + "rewards/margins": 8.158655166625977, + "rewards/rejected": -7.092002868652344, + "step": 1280 + }, + { + "epoch": 0.83, + "learning_rate": 9.57629135920019e-08, + "logits/chosen": 2.1602940559387207, + "logits/rejected": 3.0440659523010254, + "logps/chosen": -381.5716247558594, + "logps/rejected": -442.3666076660156, + "loss": 0.0371, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.255479335784912, + "rewards/margins": 10.695677757263184, + "rewards/rejected": -9.440199851989746, + "step": 1290 + }, + { + "epoch": 0.84, + "learning_rate": 9.564389431087836e-08, + "logits/chosen": 2.3889143466949463, + "logits/rejected": 3.3642821311950684, + "logps/chosen": -386.8828430175781, + "logps/rejected": -414.80157470703125, + "loss": 0.0349, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5850082635879517, + "rewards/margins": 7.8508620262146, + "rewards/rejected": -7.265854835510254, + "step": 1300 + }, + { + "epoch": 0.84, + "eval_logits/chosen": 1.5706590414047241, + "eval_logits/rejected": 2.0521459579467773, + "eval_logps/chosen": -390.2462158203125, + "eval_logps/rejected": -372.8173522949219, + "eval_loss": 0.05391751974821091, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": 0.9095280170440674, + "eval_rewards/margins": 7.630187034606934, + "eval_rewards/rejected": -6.720658302307129, + "eval_runtime": 77.9662, + "eval_samples_per_second": 12.826, + "eval_steps_per_second": 0.41, + "step": 1300 + }, + { + "epoch": 0.84, + "learning_rate": 9.552487502975481e-08, + "logits/chosen": 2.109424114227295, + "logits/rejected": 3.372615098953247, + "logps/chosen": -425.7452087402344, + "logps/rejected": -420.9171447753906, + "loss": 0.0561, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8334258794784546, + "rewards/margins": 8.975770950317383, + "rewards/rejected": -7.142345428466797, + "step": 1310 + }, + { + "epoch": 0.85, + "learning_rate": 9.540585574863127e-08, + "logits/chosen": 1.9445127248764038, + "logits/rejected": 2.690776824951172, + "logps/chosen": -418.9251403808594, + "logps/rejected": -406.52813720703125, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2240526676177979, + "rewards/margins": 8.488465309143066, + "rewards/rejected": -7.264412879943848, + "step": 1320 + }, + { + "epoch": 0.85, + "learning_rate": 9.528683646750774e-08, + "logits/chosen": 2.327258586883545, + "logits/rejected": 3.642822265625, + "logps/chosen": -422.3324279785156, + "logps/rejected": -427.5508728027344, + "loss": 0.0439, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9490644335746765, + "rewards/margins": 8.496426582336426, + "rewards/rejected": -7.547361850738525, + "step": 1330 + }, + { + "epoch": 0.86, + "learning_rate": 9.51678171863842e-08, + "logits/chosen": 2.0918898582458496, + "logits/rejected": 3.0718982219696045, + "logps/chosen": -451.8070373535156, + "logps/rejected": -396.4839782714844, + "loss": 0.0282, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8088127374649048, + "rewards/margins": 8.987968444824219, + "rewards/rejected": -7.179154872894287, + "step": 1340 + }, + { + "epoch": 0.87, + "learning_rate": 9.504879790526065e-08, + "logits/chosen": 2.251584529876709, + "logits/rejected": 2.643411636352539, + "logps/chosen": -390.68804931640625, + "logps/rejected": -401.1186218261719, + "loss": 0.0389, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3335015773773193, + "rewards/margins": 8.997198104858398, + "rewards/rejected": -7.663697242736816, + "step": 1350 + }, + { + "epoch": 0.87, + "learning_rate": 9.49297786241371e-08, + "logits/chosen": 2.6715595722198486, + "logits/rejected": 2.812282085418701, + "logps/chosen": -398.098388671875, + "logps/rejected": -429.861083984375, + "loss": 0.0371, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4172677993774414, + "rewards/margins": 9.3431396484375, + "rewards/rejected": -7.925871849060059, + "step": 1360 + }, + { + "epoch": 0.88, + "learning_rate": 9.481075934301356e-08, + "logits/chosen": 2.6521875858306885, + "logits/rejected": 3.3430676460266113, + "logps/chosen": -384.2088928222656, + "logps/rejected": -394.98565673828125, + "loss": 0.0281, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5810503959655762, + "rewards/margins": 8.41893482208252, + "rewards/rejected": -6.837882995605469, + "step": 1370 + }, + { + "epoch": 0.89, + "learning_rate": 9.469174006189002e-08, + "logits/chosen": 1.8557714223861694, + "logits/rejected": 2.911968946456909, + "logps/chosen": -493.2205505371094, + "logps/rejected": -419.2433166503906, + "loss": 0.038, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.404146432876587, + "rewards/margins": 9.705537796020508, + "rewards/rejected": -7.3013916015625, + "step": 1380 + }, + { + "epoch": 0.89, + "learning_rate": 9.457272078076648e-08, + "logits/chosen": 2.248764991760254, + "logits/rejected": 2.723816394805908, + "logps/chosen": -448.24749755859375, + "logps/rejected": -399.8122253417969, + "loss": 0.0282, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.05086612701416, + "rewards/margins": 8.73229694366455, + "rewards/rejected": -6.681430816650391, + "step": 1390 + }, + { + "epoch": 0.9, + "learning_rate": 9.445370149964293e-08, + "logits/chosen": 2.173478603363037, + "logits/rejected": 3.2225327491760254, + "logps/chosen": -396.5003356933594, + "logps/rejected": -385.8442687988281, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8472175598144531, + "rewards/margins": 8.179679870605469, + "rewards/rejected": -6.332461357116699, + "step": 1400 + }, + { + "epoch": 0.9, + "eval_logits/chosen": 1.612717866897583, + "eval_logits/rejected": 2.088186264038086, + "eval_logps/chosen": -388.5294189453125, + "eval_logps/rejected": -372.8053894042969, + "eval_loss": 0.05290338769555092, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": 1.0812093019485474, + "eval_rewards/margins": 7.80067253112793, + "eval_rewards/rejected": -6.719463348388672, + "eval_runtime": 77.9512, + "eval_samples_per_second": 12.829, + "eval_steps_per_second": 0.411, + "step": 1400 + }, + { + "epoch": 0.91, + "learning_rate": 9.43346822185194e-08, + "logits/chosen": 1.9332910776138306, + "logits/rejected": 3.120542526245117, + "logps/chosen": -405.4175109863281, + "logps/rejected": -406.57781982421875, + "loss": 0.0388, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5408118963241577, + "rewards/margins": 8.476791381835938, + "rewards/rejected": -6.935980319976807, + "step": 1410 + }, + { + "epoch": 0.91, + "learning_rate": 9.421566293739586e-08, + "logits/chosen": 2.6357414722442627, + "logits/rejected": 2.7694199085235596, + "logps/chosen": -403.54632568359375, + "logps/rejected": -362.77557373046875, + "loss": 0.05, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2265160083770752, + "rewards/margins": 7.877467155456543, + "rewards/rejected": -6.650951385498047, + "step": 1420 + }, + { + "epoch": 0.92, + "learning_rate": 9.409664365627231e-08, + "logits/chosen": 2.213944673538208, + "logits/rejected": 2.804633617401123, + "logps/chosen": -411.54608154296875, + "logps/rejected": -416.67498779296875, + "loss": 0.0319, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.015861988067627, + "rewards/margins": 9.614884376525879, + "rewards/rejected": -7.599021911621094, + "step": 1430 + }, + { + "epoch": 0.93, + "learning_rate": 9.397762437514877e-08, + "logits/chosen": 2.344874620437622, + "logits/rejected": 2.9800264835357666, + "logps/chosen": -387.05206298828125, + "logps/rejected": -409.482421875, + "loss": 0.0328, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7274560928344727, + "rewards/margins": 8.34837818145752, + "rewards/rejected": -7.620922088623047, + "step": 1440 + }, + { + "epoch": 0.93, + "learning_rate": 9.385860509402523e-08, + "logits/chosen": 2.6229679584503174, + "logits/rejected": 2.767507553100586, + "logps/chosen": -400.82647705078125, + "logps/rejected": -437.8872985839844, + "loss": 0.0437, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4043718576431274, + "rewards/margins": 9.666748046875, + "rewards/rejected": -8.26237678527832, + "step": 1450 + }, + { + "epoch": 0.94, + "learning_rate": 9.373958581290168e-08, + "logits/chosen": 1.9819673299789429, + "logits/rejected": 3.2101433277130127, + "logps/chosen": -395.5453186035156, + "logps/rejected": -420.27020263671875, + "loss": 0.0334, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1504669189453125, + "rewards/margins": 9.256044387817383, + "rewards/rejected": -8.105578422546387, + "step": 1460 + }, + { + "epoch": 0.94, + "learning_rate": 9.362056653177814e-08, + "logits/chosen": 2.1626055240631104, + "logits/rejected": 2.6654152870178223, + "logps/chosen": -443.01470947265625, + "logps/rejected": -387.11956787109375, + "loss": 0.0519, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9894819259643555, + "rewards/margins": 8.866507530212402, + "rewards/rejected": -7.877026557922363, + "step": 1470 + }, + { + "epoch": 0.95, + "learning_rate": 9.35015472506546e-08, + "logits/chosen": 2.113184928894043, + "logits/rejected": 2.9894702434539795, + "logps/chosen": -426.97454833984375, + "logps/rejected": -392.66937255859375, + "loss": 0.0455, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.242422342300415, + "rewards/margins": 8.285688400268555, + "rewards/rejected": -7.043266296386719, + "step": 1480 + }, + { + "epoch": 0.96, + "learning_rate": 9.338252796953105e-08, + "logits/chosen": 2.179399013519287, + "logits/rejected": 2.653637409210205, + "logps/chosen": -417.0636291503906, + "logps/rejected": -418.0546875, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5582334995269775, + "rewards/margins": 9.643311500549316, + "rewards/rejected": -8.085078239440918, + "step": 1490 + }, + { + "epoch": 0.96, + "learning_rate": 9.326350868840752e-08, + "logits/chosen": 2.342013120651245, + "logits/rejected": 3.294114589691162, + "logps/chosen": -410.2916564941406, + "logps/rejected": -398.54022216796875, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9324003458023071, + "rewards/margins": 8.479290008544922, + "rewards/rejected": -7.5468902587890625, + "step": 1500 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.6103559732437134, + "eval_logits/rejected": 2.1083178520202637, + "eval_logps/chosen": -395.0100402832031, + "eval_logps/rejected": -380.8038330078125, + "eval_loss": 0.05386331304907799, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": 0.4331449270248413, + "eval_rewards/margins": 7.952449798583984, + "eval_rewards/rejected": -7.519304275512695, + "eval_runtime": 77.9615, + "eval_samples_per_second": 12.827, + "eval_steps_per_second": 0.41, + "step": 1500 + }, + { + "epoch": 0.97, + "learning_rate": 9.314448940728398e-08, + "logits/chosen": 1.7690677642822266, + "logits/rejected": 3.269226551055908, + "logps/chosen": -419.20867919921875, + "logps/rejected": -420.2843322753906, + "loss": 0.0332, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7724257707595825, + "rewards/margins": 8.932862281799316, + "rewards/rejected": -8.160436630249023, + "step": 1510 + }, + { + "epoch": 0.98, + "learning_rate": 9.302547012616043e-08, + "logits/chosen": 2.311052083969116, + "logits/rejected": 2.7695891857147217, + "logps/chosen": -420.0890197753906, + "logps/rejected": -437.40863037109375, + "loss": 0.0506, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9803401827812195, + "rewards/margins": 9.851530075073242, + "rewards/rejected": -8.871191024780273, + "step": 1520 + }, + { + "epoch": 0.98, + "learning_rate": 9.290645084503689e-08, + "logits/chosen": 2.257835626602173, + "logits/rejected": 2.8854148387908936, + "logps/chosen": -388.00152587890625, + "logps/rejected": -398.3310546875, + "loss": 0.0312, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7348248362541199, + "rewards/margins": 8.951128959655762, + "rewards/rejected": -8.216302871704102, + "step": 1530 + }, + { + "epoch": 0.99, + "learning_rate": 9.278743156391336e-08, + "logits/chosen": 1.8644654750823975, + "logits/rejected": 3.318366527557373, + "logps/chosen": -398.03961181640625, + "logps/rejected": -404.6298522949219, + "loss": 0.0329, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6025460958480835, + "rewards/margins": 8.443681716918945, + "rewards/rejected": -7.8411359786987305, + "step": 1540 + }, + { + "epoch": 1.0, + "learning_rate": 9.26684122827898e-08, + "logits/chosen": 2.1022090911865234, + "logits/rejected": 3.08918833732605, + "logps/chosen": -380.1417236328125, + "logps/rejected": -418.6334533691406, + "loss": 0.0319, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.10860241949558258, + "rewards/margins": 8.798811912536621, + "rewards/rejected": -8.9074125289917, + "step": 1550 + }, + { + "epoch": 1.0, + "learning_rate": 9.254939300166626e-08, + "logits/chosen": 1.9660978317260742, + "logits/rejected": 3.0992188453674316, + "logps/chosen": -432.70379638671875, + "logps/rejected": -398.1070861816406, + "loss": 0.0321, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7886090874671936, + "rewards/margins": 9.03126335144043, + "rewards/rejected": -8.242653846740723, + "step": 1560 + }, + { + "epoch": 1.01, + "learning_rate": 9.243037372054272e-08, + "logits/chosen": 1.8959643840789795, + "logits/rejected": 3.006300687789917, + "logps/chosen": -485.46435546875, + "logps/rejected": -439.8067321777344, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.288769006729126, + "rewards/margins": 10.527647018432617, + "rewards/rejected": -9.238879203796387, + "step": 1570 + }, + { + "epoch": 1.02, + "learning_rate": 9.231135443941919e-08, + "logits/chosen": 2.1207528114318848, + "logits/rejected": 3.0929980278015137, + "logps/chosen": -427.466796875, + "logps/rejected": -432.8934631347656, + "loss": 0.0109, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8015705943107605, + "rewards/margins": 10.440237998962402, + "rewards/rejected": -9.638667106628418, + "step": 1580 + }, + { + "epoch": 1.02, + "learning_rate": 9.219233515829564e-08, + "logits/chosen": 2.253310441970825, + "logits/rejected": 3.08237886428833, + "logps/chosen": -423.25775146484375, + "logps/rejected": -393.71966552734375, + "loss": 0.0153, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1011648178100586, + "rewards/margins": 8.963667869567871, + "rewards/rejected": -7.8625030517578125, + "step": 1590 + }, + { + "epoch": 1.03, + "learning_rate": 9.20733158771721e-08, + "logits/chosen": 1.8514906167984009, + "logits/rejected": 2.837451934814453, + "logps/chosen": -476.034912109375, + "logps/rejected": -431.28369140625, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7295993566513062, + "rewards/margins": 10.104246139526367, + "rewards/rejected": -8.37464714050293, + "step": 1600 + }, + { + "epoch": 1.03, + "eval_logits/chosen": 1.5608395338058472, + "eval_logits/rejected": 2.154280662536621, + "eval_logps/chosen": -400.8282470703125, + "eval_logps/rejected": -387.81903076171875, + "eval_loss": 0.05462770164012909, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": -0.1486767828464508, + "eval_rewards/margins": 8.072151184082031, + "eval_rewards/rejected": -8.22082805633545, + "eval_runtime": 78.122, + "eval_samples_per_second": 12.8, + "eval_steps_per_second": 0.41, + "step": 1600 + }, + { + "epoch": 1.03, + "learning_rate": 9.195429659604855e-08, + "logits/chosen": 2.027790069580078, + "logits/rejected": 3.0657553672790527, + "logps/chosen": -432.85369873046875, + "logps/rejected": -412.8692321777344, + "loss": 0.0163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8429557681083679, + "rewards/margins": 9.376811027526855, + "rewards/rejected": -8.533855438232422, + "step": 1610 + }, + { + "epoch": 1.04, + "learning_rate": 9.183527731492501e-08, + "logits/chosen": 2.6315040588378906, + "logits/rejected": 3.4493613243103027, + "logps/chosen": -425.31756591796875, + "logps/rejected": -431.07989501953125, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8139169812202454, + "rewards/margins": 9.736674308776855, + "rewards/rejected": -8.922757148742676, + "step": 1620 + }, + { + "epoch": 1.05, + "learning_rate": 9.171625803380148e-08, + "logits/chosen": 2.329026699066162, + "logits/rejected": 3.3126883506774902, + "logps/chosen": -426.11114501953125, + "logps/rejected": -430.76934814453125, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6953893899917603, + "rewards/margins": 10.516650199890137, + "rewards/rejected": -8.821261405944824, + "step": 1630 + }, + { + "epoch": 1.05, + "learning_rate": 9.159723875267794e-08, + "logits/chosen": 2.058751106262207, + "logits/rejected": 2.7939858436584473, + "logps/chosen": -388.2889099121094, + "logps/rejected": -386.5802307128906, + "loss": 0.0199, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9611308574676514, + "rewards/margins": 9.938085556030273, + "rewards/rejected": -7.976954460144043, + "step": 1640 + }, + { + "epoch": 1.06, + "learning_rate": 9.147821947155438e-08, + "logits/chosen": 2.1123156547546387, + "logits/rejected": 3.6038296222686768, + "logps/chosen": -393.5766296386719, + "logps/rejected": -402.1626892089844, + "loss": 0.0107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8590052723884583, + "rewards/margins": 9.311236381530762, + "rewards/rejected": -8.452230453491211, + "step": 1650 + }, + { + "epoch": 1.07, + "learning_rate": 9.135920019043084e-08, + "logits/chosen": 2.1106104850769043, + "logits/rejected": 3.1583571434020996, + "logps/chosen": -374.95001220703125, + "logps/rejected": -392.5960998535156, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.768711268901825, + "rewards/margins": 8.690801620483398, + "rewards/rejected": -7.922091007232666, + "step": 1660 + }, + { + "epoch": 1.07, + "learning_rate": 9.12401809093073e-08, + "logits/chosen": 2.2228665351867676, + "logits/rejected": 3.330571413040161, + "logps/chosen": -443.146484375, + "logps/rejected": -426.9916076660156, + "loss": 0.0153, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.233867883682251, + "rewards/margins": 10.038155555725098, + "rewards/rejected": -8.804287910461426, + "step": 1670 + }, + { + "epoch": 1.08, + "learning_rate": 9.112116162818376e-08, + "logits/chosen": 2.2720413208007812, + "logits/rejected": 3.6448235511779785, + "logps/chosen": -428.63763427734375, + "logps/rejected": -414.93768310546875, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4396426677703857, + "rewards/margins": 9.752705574035645, + "rewards/rejected": -8.31306266784668, + "step": 1680 + }, + { + "epoch": 1.09, + "learning_rate": 9.100214234706022e-08, + "logits/chosen": 2.2004613876342773, + "logits/rejected": 3.101625919342041, + "logps/chosen": -419.28143310546875, + "logps/rejected": -431.95684814453125, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1783888339996338, + "rewards/margins": 10.366841316223145, + "rewards/rejected": -9.188451766967773, + "step": 1690 + }, + { + "epoch": 1.09, + "learning_rate": 9.088312306593667e-08, + "logits/chosen": 2.8064794540405273, + "logits/rejected": 3.2689521312713623, + "logps/chosen": -354.8829650878906, + "logps/rejected": -406.4047546386719, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7891177535057068, + "rewards/margins": 9.862220764160156, + "rewards/rejected": -9.073102951049805, + "step": 1700 + }, + { + "epoch": 1.09, + "eval_logits/chosen": 1.625260591506958, + "eval_logits/rejected": 2.238248109817505, + "eval_logps/chosen": -400.59375, + "eval_logps/rejected": -390.6427917480469, + "eval_loss": 0.057591091841459274, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": -0.12522682547569275, + "eval_rewards/margins": 8.377971649169922, + "eval_rewards/rejected": -8.503198623657227, + "eval_runtime": 78.0334, + "eval_samples_per_second": 12.815, + "eval_steps_per_second": 0.41, + "step": 1700 + }, + { + "epoch": 1.1, + "learning_rate": 9.076410378481314e-08, + "logits/chosen": 1.9525015354156494, + "logits/rejected": 3.1185505390167236, + "logps/chosen": -422.75225830078125, + "logps/rejected": -421.87445068359375, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9433252811431885, + "rewards/margins": 10.827123641967773, + "rewards/rejected": -8.883798599243164, + "step": 1710 + }, + { + "epoch": 1.11, + "learning_rate": 9.06450845036896e-08, + "logits/chosen": 2.2783799171447754, + "logits/rejected": 3.1842360496520996, + "logps/chosen": -418.703369140625, + "logps/rejected": -418.055419921875, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2181668281555176, + "rewards/margins": 9.945693016052246, + "rewards/rejected": -8.727526664733887, + "step": 1720 + }, + { + "epoch": 1.11, + "learning_rate": 9.052606522256606e-08, + "logits/chosen": 2.0897390842437744, + "logits/rejected": 3.332200288772583, + "logps/chosen": -436.8134765625, + "logps/rejected": -428.2256774902344, + "loss": 0.0087, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5979622006416321, + "rewards/margins": 9.419112205505371, + "rewards/rejected": -8.821150779724121, + "step": 1730 + }, + { + "epoch": 1.12, + "learning_rate": 9.04070459414425e-08, + "logits/chosen": 1.7566314935684204, + "logits/rejected": 3.1516499519348145, + "logps/chosen": -444.9219665527344, + "logps/rejected": -409.5821228027344, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8818896412849426, + "rewards/margins": 10.398978233337402, + "rewards/rejected": -9.517088890075684, + "step": 1740 + }, + { + "epoch": 1.12, + "learning_rate": 9.028802666031897e-08, + "logits/chosen": 2.2249584197998047, + "logits/rejected": 3.220370054244995, + "logps/chosen": -373.8511962890625, + "logps/rejected": -429.1431579589844, + "loss": 0.0122, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.09718599170446396, + "rewards/margins": 9.384894371032715, + "rewards/rejected": -9.28770923614502, + "step": 1750 + }, + { + "epoch": 1.13, + "learning_rate": 9.016900737919543e-08, + "logits/chosen": 2.468928337097168, + "logits/rejected": 3.5063624382019043, + "logps/chosen": -375.80279541015625, + "logps/rejected": -422.55450439453125, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36610403656959534, + "rewards/margins": 9.850809097290039, + "rewards/rejected": -9.484704971313477, + "step": 1760 + }, + { + "epoch": 1.14, + "learning_rate": 9.004998809807188e-08, + "logits/chosen": 2.5144405364990234, + "logits/rejected": 3.0172762870788574, + "logps/chosen": -398.01959228515625, + "logps/rejected": -412.0162048339844, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.71649569272995, + "rewards/margins": 10.736013412475586, + "rewards/rejected": -10.01951789855957, + "step": 1770 + }, + { + "epoch": 1.14, + "learning_rate": 8.993096881694834e-08, + "logits/chosen": 2.411824941635132, + "logits/rejected": 3.4212913513183594, + "logps/chosen": -441.16741943359375, + "logps/rejected": -410.988525390625, + "loss": 0.0107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.526915967464447, + "rewards/margins": 9.955665588378906, + "rewards/rejected": -9.428749084472656, + "step": 1780 + }, + { + "epoch": 1.15, + "learning_rate": 8.981194953582481e-08, + "logits/chosen": 2.0398449897766113, + "logits/rejected": 2.990245819091797, + "logps/chosen": -424.50830078125, + "logps/rejected": -440.29034423828125, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1279420852661133, + "rewards/margins": 10.929909706115723, + "rewards/rejected": -9.801966667175293, + "step": 1790 + }, + { + "epoch": 1.16, + "learning_rate": 8.969293025470126e-08, + "logits/chosen": 2.0134599208831787, + "logits/rejected": 3.4384913444519043, + "logps/chosen": -431.4124450683594, + "logps/rejected": -422.57275390625, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.387521505355835, + "rewards/margins": 10.490163803100586, + "rewards/rejected": -9.102643013000488, + "step": 1800 + }, + { + "epoch": 1.16, + "eval_logits/chosen": 1.578616738319397, + "eval_logits/rejected": 2.1524062156677246, + "eval_logps/chosen": -396.13555908203125, + "eval_logps/rejected": -386.849365234375, + "eval_loss": 0.054019615054130554, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": 0.32059139013290405, + "eval_rewards/margins": 8.444451332092285, + "eval_rewards/rejected": -8.123859405517578, + "eval_runtime": 78.1156, + "eval_samples_per_second": 12.802, + "eval_steps_per_second": 0.41, + "step": 1800 + }, + { + "epoch": 1.16, + "learning_rate": 8.957391097357772e-08, + "logits/chosen": 1.8505395650863647, + "logits/rejected": 2.4895451068878174, + "logps/chosen": -411.90081787109375, + "logps/rejected": -446.5166931152344, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4874722957611084, + "rewards/margins": 10.799659729003906, + "rewards/rejected": -9.312187194824219, + "step": 1810 + }, + { + "epoch": 1.17, + "learning_rate": 8.945489169245418e-08, + "logits/chosen": 2.4672083854675293, + "logits/rejected": 3.3016533851623535, + "logps/chosen": -407.24810791015625, + "logps/rejected": -421.69207763671875, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6584242582321167, + "rewards/margins": 9.77189826965332, + "rewards/rejected": -9.113473892211914, + "step": 1820 + }, + { + "epoch": 1.18, + "learning_rate": 8.933587241133062e-08, + "logits/chosen": 2.9424712657928467, + "logits/rejected": 3.1963260173797607, + "logps/chosen": -393.42730712890625, + "logps/rejected": -405.26776123046875, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.141961693763733, + "rewards/margins": 9.95053768157959, + "rewards/rejected": -8.808575630187988, + "step": 1830 + }, + { + "epoch": 1.18, + "learning_rate": 8.921685313020709e-08, + "logits/chosen": 2.1705727577209473, + "logits/rejected": 3.132054090499878, + "logps/chosen": -458.5648498535156, + "logps/rejected": -425.5223693847656, + "loss": 0.0144, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0647785663604736, + "rewards/margins": 10.456083297729492, + "rewards/rejected": -8.391304969787598, + "step": 1840 + }, + { + "epoch": 1.19, + "learning_rate": 8.909783384908355e-08, + "logits/chosen": 2.2793197631835938, + "logits/rejected": 3.1851863861083984, + "logps/chosen": -428.22161865234375, + "logps/rejected": -393.50201416015625, + "loss": 0.0142, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3557255268096924, + "rewards/margins": 9.879143714904785, + "rewards/rejected": -8.523417472839355, + "step": 1850 + }, + { + "epoch": 1.2, + "learning_rate": 8.897881456796e-08, + "logits/chosen": 2.092120409011841, + "logits/rejected": 3.1506097316741943, + "logps/chosen": -423.780517578125, + "logps/rejected": -435.0636291503906, + "loss": 0.0164, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.041473150253296, + "rewards/margins": 10.483617782592773, + "rewards/rejected": -9.442144393920898, + "step": 1860 + }, + { + "epoch": 1.2, + "learning_rate": 8.885979528683646e-08, + "logits/chosen": 2.077341079711914, + "logits/rejected": 2.840765953063965, + "logps/chosen": -467.50335693359375, + "logps/rejected": -450.4947814941406, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0977306365966797, + "rewards/margins": 10.936107635498047, + "rewards/rejected": -9.838376998901367, + "step": 1870 + }, + { + "epoch": 1.21, + "learning_rate": 8.874077600571293e-08, + "logits/chosen": 2.388977527618408, + "logits/rejected": 3.17130970954895, + "logps/chosen": -415.8170471191406, + "logps/rejected": -431.8157653808594, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1165374517440796, + "rewards/margins": 11.083638191223145, + "rewards/rejected": -9.967100143432617, + "step": 1880 + }, + { + "epoch": 1.21, + "learning_rate": 8.862175672458938e-08, + "logits/chosen": 2.3494229316711426, + "logits/rejected": 3.3899810314178467, + "logps/chosen": -398.8847351074219, + "logps/rejected": -420.1104431152344, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9555309414863586, + "rewards/margins": 9.63708209991455, + "rewards/rejected": -8.681550979614258, + "step": 1890 + }, + { + "epoch": 1.22, + "learning_rate": 8.850273744346584e-08, + "logits/chosen": 2.377115488052368, + "logits/rejected": 3.1841914653778076, + "logps/chosen": -384.47821044921875, + "logps/rejected": -382.5137634277344, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.495150089263916, + "rewards/margins": 9.905587196350098, + "rewards/rejected": -8.410436630249023, + "step": 1900 + }, + { + "epoch": 1.22, + "eval_logits/chosen": 1.5970555543899536, + "eval_logits/rejected": 2.134828805923462, + "eval_logps/chosen": -395.612060546875, + "eval_logps/rejected": -388.520751953125, + "eval_loss": 0.05577890947461128, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": 0.3729451596736908, + "eval_rewards/margins": 8.663945198059082, + "eval_rewards/rejected": -8.290999412536621, + "eval_runtime": 78.1703, + "eval_samples_per_second": 12.793, + "eval_steps_per_second": 0.409, + "step": 1900 + }, + { + "epoch": 1.23, + "learning_rate": 8.83837181623423e-08, + "logits/chosen": 2.162808895111084, + "logits/rejected": 2.938401937484741, + "logps/chosen": -416.3312072753906, + "logps/rejected": -431.4833984375, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4468110799789429, + "rewards/margins": 11.13166332244873, + "rewards/rejected": -9.684852600097656, + "step": 1910 + }, + { + "epoch": 1.23, + "learning_rate": 8.826469888121875e-08, + "logits/chosen": 1.8704240322113037, + "logits/rejected": 3.454761028289795, + "logps/chosen": -394.39312744140625, + "logps/rejected": -435.0038146972656, + "loss": 0.0103, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1534335613250732, + "rewards/margins": 10.062942504882812, + "rewards/rejected": -8.909509658813477, + "step": 1920 + }, + { + "epoch": 1.24, + "learning_rate": 8.814567960009521e-08, + "logits/chosen": 2.3629403114318848, + "logits/rejected": 3.146951675415039, + "logps/chosen": -385.4207458496094, + "logps/rejected": -421.41552734375, + "loss": 0.0174, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5849992036819458, + "rewards/margins": 10.956562995910645, + "rewards/rejected": -9.371562957763672, + "step": 1930 + }, + { + "epoch": 1.25, + "learning_rate": 8.802666031897167e-08, + "logits/chosen": 1.9549896717071533, + "logits/rejected": 3.260117769241333, + "logps/chosen": -438.8719177246094, + "logps/rejected": -423.6161193847656, + "loss": 0.0114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8324927091598511, + "rewards/margins": 9.532448768615723, + "rewards/rejected": -8.699956893920898, + "step": 1940 + }, + { + "epoch": 1.25, + "learning_rate": 8.790764103784812e-08, + "logits/chosen": 2.376079797744751, + "logits/rejected": 2.626861572265625, + "logps/chosen": -420.97320556640625, + "logps/rejected": -430.754638671875, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4511768817901611, + "rewards/margins": 10.815112113952637, + "rewards/rejected": -9.363935470581055, + "step": 1950 + }, + { + "epoch": 1.26, + "learning_rate": 8.778862175672459e-08, + "logits/chosen": 2.016657829284668, + "logits/rejected": 3.148374557495117, + "logps/chosen": -399.3986511230469, + "logps/rejected": -409.0765075683594, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7621505260467529, + "rewards/margins": 9.929964065551758, + "rewards/rejected": -9.167813301086426, + "step": 1960 + }, + { + "epoch": 1.27, + "learning_rate": 8.766960247560105e-08, + "logits/chosen": 2.107110023498535, + "logits/rejected": 2.8882648944854736, + "logps/chosen": -467.67999267578125, + "logps/rejected": -445.00054931640625, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5498336553573608, + "rewards/margins": 11.310626983642578, + "rewards/rejected": -9.760791778564453, + "step": 1970 + }, + { + "epoch": 1.27, + "learning_rate": 8.75505831944775e-08, + "logits/chosen": 2.1148109436035156, + "logits/rejected": 2.921837568283081, + "logps/chosen": -419.919677734375, + "logps/rejected": -438.27392578125, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5840842723846436, + "rewards/margins": 10.156820297241211, + "rewards/rejected": -9.572736740112305, + "step": 1980 + }, + { + "epoch": 1.28, + "learning_rate": 8.743156391335396e-08, + "logits/chosen": 2.2187628746032715, + "logits/rejected": 3.3866772651672363, + "logps/chosen": -381.0819396972656, + "logps/rejected": -417.1609802246094, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2810596525669098, + "rewards/margins": 9.59456729888916, + "rewards/rejected": -9.313508033752441, + "step": 1990 + }, + { + "epoch": 1.29, + "learning_rate": 8.731254463223042e-08, + "logits/chosen": 2.337141513824463, + "logits/rejected": 2.984570026397705, + "logps/chosen": -428.96307373046875, + "logps/rejected": -433.2093811035156, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.118440866470337, + "rewards/margins": 10.297574043273926, + "rewards/rejected": -9.179132461547852, + "step": 2000 + }, + { + "epoch": 1.29, + "eval_logits/chosen": 1.633168339729309, + "eval_logits/rejected": 2.174915313720703, + "eval_logps/chosen": -397.74981689453125, + "eval_logps/rejected": -392.74847412109375, + "eval_loss": 0.0574236661195755, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": 0.15916621685028076, + "eval_rewards/margins": 8.872934341430664, + "eval_rewards/rejected": -8.713767051696777, + "eval_runtime": 77.9326, + "eval_samples_per_second": 12.832, + "eval_steps_per_second": 0.411, + "step": 2000 + }, + { + "epoch": 1.29, + "learning_rate": 8.719352535110687e-08, + "logits/chosen": 2.470761775970459, + "logits/rejected": 3.466107130050659, + "logps/chosen": -330.568115234375, + "logps/rejected": -396.0406799316406, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5657675862312317, + "rewards/margins": 9.64977741241455, + "rewards/rejected": -9.084009170532227, + "step": 2010 + }, + { + "epoch": 1.3, + "learning_rate": 8.707450606998333e-08, + "logits/chosen": 2.0616042613983154, + "logits/rejected": 3.1946635246276855, + "logps/chosen": -374.26641845703125, + "logps/rejected": -438.85565185546875, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9465324282646179, + "rewards/margins": 10.824884414672852, + "rewards/rejected": -9.8783540725708, + "step": 2020 + }, + { + "epoch": 1.3, + "learning_rate": 8.695548678885979e-08, + "logits/chosen": 2.382939100265503, + "logits/rejected": 2.7525863647460938, + "logps/chosen": -404.4330139160156, + "logps/rejected": -403.4361572265625, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8075125813484192, + "rewards/margins": 9.784753799438477, + "rewards/rejected": -8.977242469787598, + "step": 2030 + }, + { + "epoch": 1.31, + "learning_rate": 8.683646750773624e-08, + "logits/chosen": 2.528764247894287, + "logits/rejected": 3.0091147422790527, + "logps/chosen": -448.67852783203125, + "logps/rejected": -431.8081970214844, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1726857423782349, + "rewards/margins": 11.344512939453125, + "rewards/rejected": -10.17182731628418, + "step": 2040 + }, + { + "epoch": 1.32, + "learning_rate": 8.671744822661271e-08, + "logits/chosen": 2.7697949409484863, + "logits/rejected": 3.7019259929656982, + "logps/chosen": -388.6200256347656, + "logps/rejected": -417.25152587890625, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27654901146888733, + "rewards/margins": 9.896730422973633, + "rewards/rejected": -9.620182037353516, + "step": 2050 + }, + { + "epoch": 1.32, + "learning_rate": 8.659842894548917e-08, + "logits/chosen": 2.3201236724853516, + "logits/rejected": 3.4561610221862793, + "logps/chosen": -425.83770751953125, + "logps/rejected": -425.7290954589844, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1046048402786255, + "rewards/margins": 10.805675506591797, + "rewards/rejected": -9.701070785522461, + "step": 2060 + }, + { + "epoch": 1.33, + "learning_rate": 8.647940966436562e-08, + "logits/chosen": 2.3291468620300293, + "logits/rejected": 3.3310623168945312, + "logps/chosen": -429.80987548828125, + "logps/rejected": -432.98992919921875, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8949478268623352, + "rewards/margins": 11.059711456298828, + "rewards/rejected": -10.164762496948242, + "step": 2070 + }, + { + "epoch": 1.34, + "learning_rate": 8.636039038324208e-08, + "logits/chosen": 2.225471019744873, + "logits/rejected": 3.3019492626190186, + "logps/chosen": -387.1333923339844, + "logps/rejected": -433.3799743652344, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5047739744186401, + "rewards/margins": 12.810396194458008, + "rewards/rejected": -12.305620193481445, + "step": 2080 + }, + { + "epoch": 1.34, + "learning_rate": 8.624137110211854e-08, + "logits/chosen": 2.510704517364502, + "logits/rejected": 3.3782081604003906, + "logps/chosen": -409.83270263671875, + "logps/rejected": -438.6222229003906, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7321760654449463, + "rewards/margins": 12.49064826965332, + "rewards/rejected": -10.75847339630127, + "step": 2090 + }, + { + "epoch": 1.35, + "learning_rate": 8.6122351820995e-08, + "logits/chosen": 1.8151687383651733, + "logits/rejected": 2.730388879776001, + "logps/chosen": -369.6325378417969, + "logps/rejected": -416.94476318359375, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7672537565231323, + "rewards/margins": 10.405468940734863, + "rewards/rejected": -9.638215065002441, + "step": 2100 + }, + { + "epoch": 1.35, + "eval_logits/chosen": 1.7073872089385986, + "eval_logits/rejected": 2.287001371383667, + "eval_logps/chosen": -401.92828369140625, + "eval_logps/rejected": -399.6173095703125, + "eval_loss": 0.0646829605102539, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": -0.2586807608604431, + "eval_rewards/margins": 9.141975402832031, + "eval_rewards/rejected": -9.400656700134277, + "eval_runtime": 77.8864, + "eval_samples_per_second": 12.839, + "eval_steps_per_second": 0.411, + "step": 2100 + }, + { + "epoch": 1.36, + "learning_rate": 8.600333253987145e-08, + "logits/chosen": 2.291734218597412, + "logits/rejected": 3.170888662338257, + "logps/chosen": -423.4766540527344, + "logps/rejected": -427.091796875, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.495410680770874, + "rewards/margins": 11.364627838134766, + "rewards/rejected": -9.869218826293945, + "step": 2110 + }, + { + "epoch": 1.36, + "learning_rate": 8.58843132587479e-08, + "logits/chosen": 2.2076480388641357, + "logits/rejected": 2.9256691932678223, + "logps/chosen": -432.29248046875, + "logps/rejected": -417.88055419921875, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4364680051803589, + "rewards/margins": 11.401215553283691, + "rewards/rejected": -9.964746475219727, + "step": 2120 + }, + { + "epoch": 1.37, + "learning_rate": 8.576529397762438e-08, + "logits/chosen": 2.705327272415161, + "logits/rejected": 3.241112232208252, + "logps/chosen": -381.64923095703125, + "logps/rejected": -452.84588623046875, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8446656465530396, + "rewards/margins": 11.237610816955566, + "rewards/rejected": -10.392945289611816, + "step": 2130 + }, + { + "epoch": 1.38, + "learning_rate": 8.564627469650083e-08, + "logits/chosen": 2.1531457901000977, + "logits/rejected": 3.033215284347534, + "logps/chosen": -409.0959167480469, + "logps/rejected": -468.179443359375, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9750633239746094, + "rewards/margins": 11.535894393920898, + "rewards/rejected": -10.560831069946289, + "step": 2140 + }, + { + "epoch": 1.38, + "learning_rate": 8.552725541537729e-08, + "logits/chosen": 2.3740274906158447, + "logits/rejected": 3.0594983100891113, + "logps/chosen": -408.73284912109375, + "logps/rejected": -432.64154052734375, + "loss": 0.0111, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3420227766036987, + "rewards/margins": 11.06401252746582, + "rewards/rejected": -9.721988677978516, + "step": 2150 + }, + { + "epoch": 1.39, + "learning_rate": 8.540823613425374e-08, + "logits/chosen": 2.0250580310821533, + "logits/rejected": 3.1984283924102783, + "logps/chosen": -414.43865966796875, + "logps/rejected": -414.484375, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4251967668533325, + "rewards/margins": 11.274666786193848, + "rewards/rejected": -9.849469184875488, + "step": 2160 + }, + { + "epoch": 1.39, + "learning_rate": 8.528921685313021e-08, + "logits/chosen": 2.57132625579834, + "logits/rejected": 3.3775405883789062, + "logps/chosen": -492.7275390625, + "logps/rejected": -453.9833984375, + "loss": 0.0129, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.017164707183838, + "rewards/margins": 12.133844375610352, + "rewards/rejected": -10.116679191589355, + "step": 2170 + }, + { + "epoch": 1.4, + "learning_rate": 8.517019757200666e-08, + "logits/chosen": 2.3586134910583496, + "logits/rejected": 3.1494510173797607, + "logps/chosen": -419.758056640625, + "logps/rejected": -436.7438049316406, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5768086910247803, + "rewards/margins": 11.310070037841797, + "rewards/rejected": -9.733260154724121, + "step": 2180 + }, + { + "epoch": 1.41, + "learning_rate": 8.505117829088311e-08, + "logits/chosen": 2.436389684677124, + "logits/rejected": 3.2424912452697754, + "logps/chosen": -443.7950744628906, + "logps/rejected": -466.5213317871094, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40640783309936523, + "rewards/margins": 10.672332763671875, + "rewards/rejected": -10.265925407409668, + "step": 2190 + }, + { + "epoch": 1.41, + "learning_rate": 8.493215900975957e-08, + "logits/chosen": 2.098741054534912, + "logits/rejected": 2.9514975547790527, + "logps/chosen": -424.73052978515625, + "logps/rejected": -460.8973083496094, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6627575159072876, + "rewards/margins": 12.264575004577637, + "rewards/rejected": -11.60181713104248, + "step": 2200 + }, + { + "epoch": 1.41, + "eval_logits/chosen": 1.8804820775985718, + "eval_logits/rejected": 2.3825998306274414, + "eval_logps/chosen": -409.8846130371094, + "eval_logps/rejected": -406.9892578125, + "eval_loss": 0.06976839900016785, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -1.0543094873428345, + "eval_rewards/margins": 9.083538055419922, + "eval_rewards/rejected": -10.137847900390625, + "eval_runtime": 78.0306, + "eval_samples_per_second": 12.815, + "eval_steps_per_second": 0.41, + "step": 2200 + }, + { + "epoch": 1.42, + "learning_rate": 8.481313972863604e-08, + "logits/chosen": 2.7101387977600098, + "logits/rejected": 3.5361340045928955, + "logps/chosen": -386.73162841796875, + "logps/rejected": -412.102783203125, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5167805552482605, + "rewards/margins": 11.045450210571289, + "rewards/rejected": -10.528669357299805, + "step": 2210 + }, + { + "epoch": 1.43, + "learning_rate": 8.46941204475125e-08, + "logits/chosen": 2.4018852710723877, + "logits/rejected": 2.6615495681762695, + "logps/chosen": -435.355712890625, + "logps/rejected": -460.0689392089844, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7065870761871338, + "rewards/margins": 11.625639915466309, + "rewards/rejected": -9.919052124023438, + "step": 2220 + }, + { + "epoch": 1.43, + "learning_rate": 8.457510116638895e-08, + "logits/chosen": 2.355992078781128, + "logits/rejected": 3.142991304397583, + "logps/chosen": -436.5738830566406, + "logps/rejected": -442.658447265625, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.008822202682495, + "rewards/margins": 11.25963020324707, + "rewards/rejected": -9.250809669494629, + "step": 2230 + }, + { + "epoch": 1.44, + "learning_rate": 8.445608188526541e-08, + "logits/chosen": 2.113661527633667, + "logits/rejected": 3.376765727996826, + "logps/chosen": -363.7850036621094, + "logps/rejected": -417.59130859375, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0049993991851807, + "rewards/margins": 10.82304573059082, + "rewards/rejected": -9.818044662475586, + "step": 2240 + }, + { + "epoch": 1.45, + "learning_rate": 8.433706260414186e-08, + "logits/chosen": 2.2611918449401855, + "logits/rejected": 2.977774143218994, + "logps/chosen": -412.16259765625, + "logps/rejected": -402.7505798339844, + "loss": 0.0124, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9179121255874634, + "rewards/margins": 10.67682933807373, + "rewards/rejected": -8.758916854858398, + "step": 2250 + }, + { + "epoch": 1.45, + "learning_rate": 8.421804332301833e-08, + "logits/chosen": 2.2166171073913574, + "logits/rejected": 3.1910576820373535, + "logps/chosen": -415.0889587402344, + "logps/rejected": -410.3279724121094, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.147801399230957, + "rewards/margins": 11.321202278137207, + "rewards/rejected": -9.17340087890625, + "step": 2260 + }, + { + "epoch": 1.46, + "learning_rate": 8.409902404189478e-08, + "logits/chosen": 2.3937222957611084, + "logits/rejected": 3.467958450317383, + "logps/chosen": -410.2650451660156, + "logps/rejected": -424.83074951171875, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3514193296432495, + "rewards/margins": 11.13292121887207, + "rewards/rejected": -9.781502723693848, + "step": 2270 + }, + { + "epoch": 1.47, + "learning_rate": 8.398000476077123e-08, + "logits/chosen": 2.627056360244751, + "logits/rejected": 2.9553661346435547, + "logps/chosen": -422.7137145996094, + "logps/rejected": -420.8788146972656, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3633685111999512, + "rewards/margins": 10.64666748046875, + "rewards/rejected": -9.283299446105957, + "step": 2280 + }, + { + "epoch": 1.47, + "learning_rate": 8.386098547964769e-08, + "logits/chosen": 2.7881388664245605, + "logits/rejected": 2.7855353355407715, + "logps/chosen": -463.8809509277344, + "logps/rejected": -449.48004150390625, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.032374382019043, + "rewards/margins": 12.262084007263184, + "rewards/rejected": -10.22970962524414, + "step": 2290 + }, + { + "epoch": 1.48, + "learning_rate": 8.374196619852416e-08, + "logits/chosen": 1.9589307308197021, + "logits/rejected": 2.762117862701416, + "logps/chosen": -450.49609375, + "logps/rejected": -444.59149169921875, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6292638778686523, + "rewards/margins": 11.14265251159668, + "rewards/rejected": -9.513387680053711, + "step": 2300 + }, + { + "epoch": 1.48, + "eval_logits/chosen": 1.7084078788757324, + "eval_logits/rejected": 2.194572687149048, + "eval_logps/chosen": -397.28363037109375, + "eval_logps/rejected": -393.7378845214844, + "eval_loss": 0.06439676135778427, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": 0.20578746497631073, + "eval_rewards/margins": 9.018497467041016, + "eval_rewards/rejected": -8.812708854675293, + "eval_runtime": 78.0234, + "eval_samples_per_second": 12.817, + "eval_steps_per_second": 0.41, + "step": 2300 + }, + { + "epoch": 1.48, + "learning_rate": 8.362294691740062e-08, + "logits/chosen": 2.6022965908050537, + "logits/rejected": 3.01656436920166, + "logps/chosen": -399.0681457519531, + "logps/rejected": -423.8030700683594, + "loss": 0.0097, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5128016471862793, + "rewards/margins": 10.698533058166504, + "rewards/rejected": -9.18572998046875, + "step": 2310 + }, + { + "epoch": 1.49, + "learning_rate": 8.350392763627707e-08, + "logits/chosen": 2.14322566986084, + "logits/rejected": 3.216325044631958, + "logps/chosen": -432.8180236816406, + "logps/rejected": -448.30645751953125, + "loss": 0.0107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6714401245117188, + "rewards/margins": 11.98411750793457, + "rewards/rejected": -10.312677383422852, + "step": 2320 + }, + { + "epoch": 1.5, + "learning_rate": 8.338490835515353e-08, + "logits/chosen": 2.0054235458374023, + "logits/rejected": 3.5831961631774902, + "logps/chosen": -402.33892822265625, + "logps/rejected": -419.5140686035156, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3268228769302368, + "rewards/margins": 11.421789169311523, + "rewards/rejected": -10.094966888427734, + "step": 2330 + }, + { + "epoch": 1.5, + "learning_rate": 8.326588907403e-08, + "logits/chosen": 2.380002498626709, + "logits/rejected": 3.5054984092712402, + "logps/chosen": -452.48748779296875, + "logps/rejected": -460.12786865234375, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.852749228477478, + "rewards/margins": 12.324037551879883, + "rewards/rejected": -10.471287727355957, + "step": 2340 + }, + { + "epoch": 1.51, + "learning_rate": 8.314686979290645e-08, + "logits/chosen": 2.2875988483428955, + "logits/rejected": 3.5421195030212402, + "logps/chosen": -428.4679260253906, + "logps/rejected": -430.433349609375, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6448265314102173, + "rewards/margins": 11.238059997558594, + "rewards/rejected": -9.593233108520508, + "step": 2350 + }, + { + "epoch": 1.52, + "learning_rate": 8.30278505117829e-08, + "logits/chosen": 2.450359582901001, + "logits/rejected": 3.1287174224853516, + "logps/chosen": -432.19329833984375, + "logps/rejected": -433.89111328125, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.381823182106018, + "rewards/margins": 12.04507064819336, + "rewards/rejected": -10.663248062133789, + "step": 2360 + }, + { + "epoch": 1.52, + "learning_rate": 8.290883123065935e-08, + "logits/chosen": 2.806814670562744, + "logits/rejected": 3.616931200027466, + "logps/chosen": -421.779052734375, + "logps/rejected": -449.85260009765625, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1789385080337524, + "rewards/margins": 11.785478591918945, + "rewards/rejected": -10.606538772583008, + "step": 2370 + }, + { + "epoch": 1.53, + "learning_rate": 8.278981194953582e-08, + "logits/chosen": 2.440140724182129, + "logits/rejected": 3.0209367275238037, + "logps/chosen": -454.8857421875, + "logps/rejected": -450.9249572753906, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.156050443649292, + "rewards/margins": 11.445282936096191, + "rewards/rejected": -10.28923225402832, + "step": 2380 + }, + { + "epoch": 1.54, + "learning_rate": 8.267079266841228e-08, + "logits/chosen": 2.5684947967529297, + "logits/rejected": 3.107997417449951, + "logps/chosen": -408.620849609375, + "logps/rejected": -425.724365234375, + "loss": 0.008, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.03949282318353653, + "rewards/margins": 10.728104591369629, + "rewards/rejected": -10.767596244812012, + "step": 2390 + }, + { + "epoch": 1.54, + "learning_rate": 8.255177338728874e-08, + "logits/chosen": 2.5037713050842285, + "logits/rejected": 3.1614301204681396, + "logps/chosen": -447.5420837402344, + "logps/rejected": -419.99981689453125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0575414896011353, + "rewards/margins": 11.080565452575684, + "rewards/rejected": -10.02302360534668, + "step": 2400 + }, + { + "epoch": 1.54, + "eval_logits/chosen": 1.8239837884902954, + "eval_logits/rejected": 2.3104217052459717, + "eval_logps/chosen": -404.14984130859375, + "eval_logps/rejected": -404.53826904296875, + "eval_loss": 0.06748179346323013, + "eval_rewards/accuracies": 0.96875, + "eval_rewards/chosen": -0.4808317720890045, + "eval_rewards/margins": 9.411918640136719, + "eval_rewards/rejected": -9.892749786376953, + "eval_runtime": 78.0282, + "eval_samples_per_second": 12.816, + "eval_steps_per_second": 0.41, + "step": 2400 + }, + { + "epoch": 1.55, + "learning_rate": 8.243275410616519e-08, + "logits/chosen": 2.3226962089538574, + "logits/rejected": 3.3303439617156982, + "logps/chosen": -418.3297424316406, + "logps/rejected": -439.53143310546875, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6141597032546997, + "rewards/margins": 10.954813003540039, + "rewards/rejected": -10.340652465820312, + "step": 2410 + }, + { + "epoch": 1.56, + "learning_rate": 8.231373482504166e-08, + "logits/chosen": 2.913877487182617, + "logits/rejected": 3.016862392425537, + "logps/chosen": -340.18499755859375, + "logps/rejected": -396.21722412109375, + "loss": 0.0071, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8227875232696533, + "rewards/margins": 11.620365142822266, + "rewards/rejected": -9.797577857971191, + "step": 2420 + }, + { + "epoch": 1.56, + "learning_rate": 8.219471554391812e-08, + "logits/chosen": 2.501906633377075, + "logits/rejected": 2.8194351196289062, + "logps/chosen": -432.50787353515625, + "logps/rejected": -426.0884704589844, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8617050647735596, + "rewards/margins": 11.113122940063477, + "rewards/rejected": -9.251418113708496, + "step": 2430 + }, + { + "epoch": 1.57, + "learning_rate": 8.207569626279457e-08, + "logits/chosen": 2.374406337738037, + "logits/rejected": 3.2433886528015137, + "logps/chosen": -430.30352783203125, + "logps/rejected": -446.37286376953125, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8586057424545288, + "rewards/margins": 11.883955001831055, + "rewards/rejected": -10.025348663330078, + "step": 2440 + }, + { + "epoch": 1.57, + "learning_rate": 8.195667698167103e-08, + "logits/chosen": 2.630288600921631, + "logits/rejected": 3.096122980117798, + "logps/chosen": -458.97149658203125, + "logps/rejected": -464.8585510253906, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6575380563735962, + "rewards/margins": 13.09735107421875, + "rewards/rejected": -11.439813613891602, + "step": 2450 + }, + { + "epoch": 1.58, + "learning_rate": 8.183765770054747e-08, + "logits/chosen": 2.9291977882385254, + "logits/rejected": 3.383643388748169, + "logps/chosen": -426.6884765625, + "logps/rejected": -418.20172119140625, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3758227825164795, + "rewards/margins": 11.36131477355957, + "rewards/rejected": -9.985492706298828, + "step": 2460 + }, + { + "epoch": 1.59, + "learning_rate": 8.171863841942394e-08, + "logits/chosen": 2.36037278175354, + "logits/rejected": 3.379338026046753, + "logps/chosen": -409.152587890625, + "logps/rejected": -405.7203063964844, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7479127049446106, + "rewards/margins": 10.87302303314209, + "rewards/rejected": -10.12511157989502, + "step": 2470 + }, + { + "epoch": 1.59, + "learning_rate": 8.15996191383004e-08, + "logits/chosen": 2.3077635765075684, + "logits/rejected": 2.982712745666504, + "logps/chosen": -440.3170471191406, + "logps/rejected": -441.4231872558594, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2002421617507935, + "rewards/margins": 11.192548751831055, + "rewards/rejected": -9.99230670928955, + "step": 2480 + }, + { + "epoch": 1.6, + "learning_rate": 8.148059985717686e-08, + "logits/chosen": 2.3768467903137207, + "logits/rejected": 3.2358105182647705, + "logps/chosen": -450.10040283203125, + "logps/rejected": -459.655517578125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5886727571487427, + "rewards/margins": 11.561334609985352, + "rewards/rejected": -10.972661018371582, + "step": 2490 + }, + { + "epoch": 1.61, + "learning_rate": 8.136158057605331e-08, + "logits/chosen": 1.9187663793563843, + "logits/rejected": 3.099363088607788, + "logps/chosen": -489.19549560546875, + "logps/rejected": -454.474609375, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.78690505027771, + "rewards/margins": 11.946157455444336, + "rewards/rejected": -10.159250259399414, + "step": 2500 + }, + { + "epoch": 1.61, + "eval_logits/chosen": 1.7307677268981934, + "eval_logits/rejected": 2.2528133392333984, + "eval_logps/chosen": -395.5159606933594, + "eval_logps/rejected": -398.804443359375, + "eval_loss": 0.06332825124263763, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": 0.38255468010902405, + "eval_rewards/margins": 9.701919555664062, + "eval_rewards/rejected": -9.319364547729492, + "eval_runtime": 77.9753, + "eval_samples_per_second": 12.825, + "eval_steps_per_second": 0.41, + "step": 2500 + }, + { + "epoch": 1.61, + "learning_rate": 8.124256129492978e-08, + "logits/chosen": 1.937443494796753, + "logits/rejected": 2.9953453540802, + "logps/chosen": -452.47900390625, + "logps/rejected": -460.998046875, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4172687530517578, + "rewards/margins": 12.328100204467773, + "rewards/rejected": -10.910831451416016, + "step": 2510 + }, + { + "epoch": 1.62, + "learning_rate": 8.112354201380624e-08, + "logits/chosen": 2.033159017562866, + "logits/rejected": 2.923642158508301, + "logps/chosen": -385.84124755859375, + "logps/rejected": -427.04095458984375, + "loss": 0.0114, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.8549903631210327, + "rewards/margins": 11.425082206726074, + "rewards/rejected": -9.57009220123291, + "step": 2520 + }, + { + "epoch": 1.63, + "learning_rate": 8.10045227326827e-08, + "logits/chosen": 2.1730008125305176, + "logits/rejected": 2.7437562942504883, + "logps/chosen": -401.44744873046875, + "logps/rejected": -461.93597412109375, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0297939777374268, + "rewards/margins": 13.02344799041748, + "rewards/rejected": -10.993656158447266, + "step": 2530 + }, + { + "epoch": 1.63, + "learning_rate": 8.088550345155915e-08, + "logits/chosen": 2.1667115688323975, + "logits/rejected": 2.8589937686920166, + "logps/chosen": -411.73284912109375, + "logps/rejected": -428.7911682128906, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2299811840057373, + "rewards/margins": 11.306024551391602, + "rewards/rejected": -10.076042175292969, + "step": 2540 + }, + { + "epoch": 1.64, + "learning_rate": 8.076648417043561e-08, + "logits/chosen": 2.363246202468872, + "logits/rejected": 2.9583797454833984, + "logps/chosen": -473.9364318847656, + "logps/rejected": -445.84625244140625, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.21724534034729, + "rewards/margins": 11.700216293334961, + "rewards/rejected": -9.482972145080566, + "step": 2550 + }, + { + "epoch": 1.65, + "learning_rate": 8.064746488931206e-08, + "logits/chosen": 2.5506205558776855, + "logits/rejected": 3.462920665740967, + "logps/chosen": -411.14678955078125, + "logps/rejected": -444.2726135253906, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6240037083625793, + "rewards/margins": 11.176986694335938, + "rewards/rejected": -10.552982330322266, + "step": 2560 + }, + { + "epoch": 1.65, + "learning_rate": 8.052844560818852e-08, + "logits/chosen": 2.1937804222106934, + "logits/rejected": 3.3300259113311768, + "logps/chosen": -457.53814697265625, + "logps/rejected": -449.9571838378906, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.528435230255127, + "rewards/margins": 11.644414901733398, + "rewards/rejected": -10.115981101989746, + "step": 2570 + }, + { + "epoch": 1.66, + "learning_rate": 8.040942632706498e-08, + "logits/chosen": 2.55604887008667, + "logits/rejected": 3.0880343914031982, + "logps/chosen": -420.58782958984375, + "logps/rejected": -388.7262268066406, + "loss": 0.0075, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4543565511703491, + "rewards/margins": 10.79311752319336, + "rewards/rejected": -9.338762283325195, + "step": 2580 + }, + { + "epoch": 1.66, + "learning_rate": 8.029040704594145e-08, + "logits/chosen": 2.547492504119873, + "logits/rejected": 3.6884007453918457, + "logps/chosen": -380.635009765625, + "logps/rejected": -423.63018798828125, + "loss": 0.0098, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.180540680885315, + "rewards/margins": 11.04011344909668, + "rewards/rejected": -9.859573364257812, + "step": 2590 + }, + { + "epoch": 1.67, + "learning_rate": 8.01713877648179e-08, + "logits/chosen": 2.499735116958618, + "logits/rejected": 3.318908214569092, + "logps/chosen": -406.8338623046875, + "logps/rejected": -433.6927795410156, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4649948179721832, + "rewards/margins": 11.194982528686523, + "rewards/rejected": -10.729987144470215, + "step": 2600 + }, + { + "epoch": 1.67, + "eval_logits/chosen": 1.8153432607650757, + "eval_logits/rejected": 2.3447887897491455, + "eval_logps/chosen": -408.9284973144531, + "eval_logps/rejected": -409.33148193359375, + "eval_loss": 0.07508327066898346, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -0.9586971402168274, + "eval_rewards/margins": 9.413373947143555, + "eval_rewards/rejected": -10.3720703125, + "eval_runtime": 78.0392, + "eval_samples_per_second": 12.814, + "eval_steps_per_second": 0.41, + "step": 2600 + }, + { + "epoch": 1.68, + "learning_rate": 8.005236848369436e-08, + "logits/chosen": 2.228560447692871, + "logits/rejected": 3.2750918865203857, + "logps/chosen": -443.6728515625, + "logps/rejected": -464.92108154296875, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.657561182975769, + "rewards/margins": 11.451318740844727, + "rewards/rejected": -10.793758392333984, + "step": 2610 + }, + { + "epoch": 1.68, + "learning_rate": 7.993334920257082e-08, + "logits/chosen": 2.6553711891174316, + "logits/rejected": 3.3680367469787598, + "logps/chosen": -449.4634704589844, + "logps/rejected": -419.002197265625, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8493421673774719, + "rewards/margins": 11.447193145751953, + "rewards/rejected": -10.597851753234863, + "step": 2620 + }, + { + "epoch": 1.69, + "learning_rate": 7.981432992144727e-08, + "logits/chosen": 2.665163278579712, + "logits/rejected": 3.3812012672424316, + "logps/chosen": -468.38714599609375, + "logps/rejected": -457.9358825683594, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7476972341537476, + "rewards/margins": 12.187990188598633, + "rewards/rejected": -11.440293312072754, + "step": 2630 + }, + { + "epoch": 1.7, + "learning_rate": 7.969531064032373e-08, + "logits/chosen": 2.5799193382263184, + "logits/rejected": 3.3646559715270996, + "logps/chosen": -375.97174072265625, + "logps/rejected": -422.7632751464844, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5605910420417786, + "rewards/margins": 11.750158309936523, + "rewards/rejected": -11.189568519592285, + "step": 2640 + }, + { + "epoch": 1.7, + "learning_rate": 7.957629135920018e-08, + "logits/chosen": 2.4693617820739746, + "logits/rejected": 3.5580387115478516, + "logps/chosen": -430.0306091308594, + "logps/rejected": -447.8194274902344, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3709467649459839, + "rewards/margins": 11.733332633972168, + "rewards/rejected": -11.362385749816895, + "step": 2650 + }, + { + "epoch": 1.71, + "learning_rate": 7.945727207807664e-08, + "logits/chosen": 2.5020830631256104, + "logits/rejected": 3.252861738204956, + "logps/chosen": -435.47222900390625, + "logps/rejected": -447.82373046875, + "loss": 0.0088, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.05917937681078911, + "rewards/margins": 11.433965682983398, + "rewards/rejected": -11.493144035339355, + "step": 2660 + }, + { + "epoch": 1.72, + "learning_rate": 7.93382527969531e-08, + "logits/chosen": 2.116004705429077, + "logits/rejected": 3.1918604373931885, + "logps/chosen": -463.59344482421875, + "logps/rejected": -442.08203125, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2367243766784668, + "rewards/margins": 12.351614952087402, + "rewards/rejected": -11.11489200592041, + "step": 2670 + }, + { + "epoch": 1.72, + "learning_rate": 7.921923351582957e-08, + "logits/chosen": 2.5514450073242188, + "logits/rejected": 2.77209734916687, + "logps/chosen": -384.43865966796875, + "logps/rejected": -423.64892578125, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14157016575336456, + "rewards/margins": 11.351391792297363, + "rewards/rejected": -11.209821701049805, + "step": 2680 + }, + { + "epoch": 1.73, + "learning_rate": 7.910021423470602e-08, + "logits/chosen": 2.9975619316101074, + "logits/rejected": 2.824094295501709, + "logps/chosen": -428.303466796875, + "logps/rejected": -438.8485412597656, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5388720631599426, + "rewards/margins": 12.036158561706543, + "rewards/rejected": -11.497285842895508, + "step": 2690 + }, + { + "epoch": 1.74, + "learning_rate": 7.898119495358248e-08, + "logits/chosen": 2.3330235481262207, + "logits/rejected": 3.021794080734253, + "logps/chosen": -417.4193420410156, + "logps/rejected": -474.170166015625, + "loss": 0.0054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.0910119041800499, + "rewards/margins": 12.0770902633667, + "rewards/rejected": -12.168102264404297, + "step": 2700 + }, + { + "epoch": 1.74, + "eval_logits/chosen": 1.7441036701202393, + "eval_logits/rejected": 2.28686785697937, + "eval_logps/chosen": -404.1935119628906, + "eval_logps/rejected": -408.94500732421875, + "eval_loss": 0.0632321760058403, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -0.48519980907440186, + "eval_rewards/margins": 9.848224639892578, + "eval_rewards/rejected": -10.333423614501953, + "eval_runtime": 77.9801, + "eval_samples_per_second": 12.824, + "eval_steps_per_second": 0.41, + "step": 2700 + }, + { + "epoch": 1.74, + "learning_rate": 7.886217567245894e-08, + "logits/chosen": 2.2133631706237793, + "logits/rejected": 3.3354735374450684, + "logps/chosen": -428.5398864746094, + "logps/rejected": -432.5113220214844, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9777836799621582, + "rewards/margins": 12.670408248901367, + "rewards/rejected": -11.692625045776367, + "step": 2710 + }, + { + "epoch": 1.75, + "learning_rate": 7.874315639133539e-08, + "logits/chosen": 2.4246087074279785, + "logits/rejected": 3.052839756011963, + "logps/chosen": -421.6343688964844, + "logps/rejected": -421.95697021484375, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3704569339752197, + "rewards/margins": 12.342972755432129, + "rewards/rejected": -10.972516059875488, + "step": 2720 + }, + { + "epoch": 1.75, + "learning_rate": 7.862413711021185e-08, + "logits/chosen": 3.059401273727417, + "logits/rejected": 3.639910936355591, + "logps/chosen": -407.8876037597656, + "logps/rejected": -447.3395080566406, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.906929612159729, + "rewards/margins": 12.296318054199219, + "rewards/rejected": -11.389389991760254, + "step": 2730 + }, + { + "epoch": 1.76, + "learning_rate": 7.85051178290883e-08, + "logits/chosen": 2.043600082397461, + "logits/rejected": 2.944366931915283, + "logps/chosen": -399.25140380859375, + "logps/rejected": -418.093994140625, + "loss": 0.0117, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.2023608684539795, + "rewards/margins": 10.754437446594238, + "rewards/rejected": -10.55207633972168, + "step": 2740 + }, + { + "epoch": 1.77, + "learning_rate": 7.838609854796476e-08, + "logits/chosen": 2.511198043823242, + "logits/rejected": 3.7167916297912598, + "logps/chosen": -428.31396484375, + "logps/rejected": -459.7771911621094, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.24688346683979034, + "rewards/margins": 11.848150253295898, + "rewards/rejected": -11.601266860961914, + "step": 2750 + }, + { + "epoch": 1.77, + "learning_rate": 7.826707926684123e-08, + "logits/chosen": 2.4253451824188232, + "logits/rejected": 3.1690335273742676, + "logps/chosen": -426.9501953125, + "logps/rejected": -453.51531982421875, + "loss": 0.0053, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7976835370063782, + "rewards/margins": 12.751691818237305, + "rewards/rejected": -11.954008102416992, + "step": 2760 + }, + { + "epoch": 1.78, + "learning_rate": 7.814805998571769e-08, + "logits/chosen": 2.323408365249634, + "logits/rejected": 3.649256467819214, + "logps/chosen": -410.73699951171875, + "logps/rejected": -436.67669677734375, + "loss": 0.0125, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.16351564228534698, + "rewards/margins": 10.847258567810059, + "rewards/rejected": -10.683743476867676, + "step": 2770 + }, + { + "epoch": 1.79, + "learning_rate": 7.802904070459414e-08, + "logits/chosen": 2.627110004425049, + "logits/rejected": 3.4923622608184814, + "logps/chosen": -434.0037536621094, + "logps/rejected": -436.61669921875, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.193922996520996, + "rewards/margins": 11.6904296875, + "rewards/rejected": -10.496505737304688, + "step": 2780 + }, + { + "epoch": 1.79, + "learning_rate": 7.79100214234706e-08, + "logits/chosen": 2.3079676628112793, + "logits/rejected": 3.4073116779327393, + "logps/chosen": -419.6051330566406, + "logps/rejected": -428.28326416015625, + "loss": 0.008, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7547308802604675, + "rewards/margins": 11.233491897583008, + "rewards/rejected": -10.478760719299316, + "step": 2790 + }, + { + "epoch": 1.8, + "learning_rate": 7.779100214234706e-08, + "logits/chosen": 2.747231960296631, + "logits/rejected": 3.3032188415527344, + "logps/chosen": -440.6018981933594, + "logps/rejected": -423.44952392578125, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2705620527267456, + "rewards/margins": 11.606225967407227, + "rewards/rejected": -10.335662841796875, + "step": 2800 + }, + { + "epoch": 1.8, + "eval_logits/chosen": 1.7818334102630615, + "eval_logits/rejected": 2.3191139698028564, + "eval_logps/chosen": -398.98333740234375, + "eval_logps/rejected": -405.5218811035156, + "eval_loss": 0.05957724153995514, + "eval_rewards/accuracies": 0.953125, + "eval_rewards/chosen": 0.03581659495830536, + "eval_rewards/margins": 10.026926040649414, + "eval_rewards/rejected": -9.991110801696777, + "eval_runtime": 77.8303, + "eval_samples_per_second": 12.848, + "eval_steps_per_second": 0.411, + "step": 2800 + }, + { + "epoch": 1.81, + "learning_rate": 7.767198286122351e-08, + "logits/chosen": 2.7316629886627197, + "logits/rejected": 3.3055825233459473, + "logps/chosen": -377.58599853515625, + "logps/rejected": -423.8103942871094, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7823238968849182, + "rewards/margins": 11.870222091674805, + "rewards/rejected": -11.087898254394531, + "step": 2810 + }, + { + "epoch": 1.81, + "learning_rate": 7.755296358009997e-08, + "logits/chosen": 2.6006321907043457, + "logits/rejected": 3.5436511039733887, + "logps/chosen": -359.8816833496094, + "logps/rejected": -414.5054626464844, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05306140333414078, + "rewards/margins": 11.554253578186035, + "rewards/rejected": -11.501191139221191, + "step": 2820 + }, + { + "epoch": 1.82, + "learning_rate": 7.743394429897642e-08, + "logits/chosen": 2.4547390937805176, + "logits/rejected": 3.4132580757141113, + "logps/chosen": -448.56866455078125, + "logps/rejected": -452.8617248535156, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.098752737045288, + "rewards/margins": 12.525891304016113, + "rewards/rejected": -11.427138328552246, + "step": 2830 + }, + { + "epoch": 1.83, + "learning_rate": 7.731492501785288e-08, + "logits/chosen": 2.644763469696045, + "logits/rejected": 3.5032525062561035, + "logps/chosen": -416.0933532714844, + "logps/rejected": -436.23486328125, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0677202939987183, + "rewards/margins": 11.958089828491211, + "rewards/rejected": -10.890369415283203, + "step": 2840 + }, + { + "epoch": 1.83, + "learning_rate": 7.719590573672935e-08, + "logits/chosen": 2.381120204925537, + "logits/rejected": 3.223132371902466, + "logps/chosen": -429.01861572265625, + "logps/rejected": -421.361328125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6816972494125366, + "rewards/margins": 11.070103645324707, + "rewards/rejected": -10.388406753540039, + "step": 2850 + }, + { + "epoch": 1.84, + "learning_rate": 7.70768864556058e-08, + "logits/chosen": 3.1485886573791504, + "logits/rejected": 3.6244475841522217, + "logps/chosen": -411.9215393066406, + "logps/rejected": -425.89837646484375, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4478631615638733, + "rewards/margins": 11.844661712646484, + "rewards/rejected": -11.39680004119873, + "step": 2860 + }, + { + "epoch": 1.84, + "learning_rate": 7.695786717448226e-08, + "logits/chosen": 2.1998846530914307, + "logits/rejected": 3.2429378032684326, + "logps/chosen": -422.6412658691406, + "logps/rejected": -422.876953125, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1410973072052002, + "rewards/margins": 11.749425888061523, + "rewards/rejected": -10.608327865600586, + "step": 2870 + }, + { + "epoch": 1.85, + "learning_rate": 7.683884789335872e-08, + "logits/chosen": 2.5578694343566895, + "logits/rejected": 3.4331531524658203, + "logps/chosen": -455.40301513671875, + "logps/rejected": -470.500732421875, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.343795657157898, + "rewards/margins": 12.484710693359375, + "rewards/rejected": -11.140914916992188, + "step": 2880 + }, + { + "epoch": 1.86, + "learning_rate": 7.671982861223519e-08, + "logits/chosen": 2.706578254699707, + "logits/rejected": 3.628066301345825, + "logps/chosen": -389.6455383300781, + "logps/rejected": -412.09686279296875, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3088476657867432, + "rewards/margins": 12.140149116516113, + "rewards/rejected": -10.831302642822266, + "step": 2890 + }, + { + "epoch": 1.86, + "learning_rate": 7.660080933111163e-08, + "logits/chosen": 2.5364797115325928, + "logits/rejected": 3.7345130443573, + "logps/chosen": -440.4591369628906, + "logps/rejected": -508.75164794921875, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2763676643371582, + "rewards/margins": 16.67254638671875, + "rewards/rejected": -15.3961763381958, + "step": 2900 + }, + { + "epoch": 1.86, + "eval_logits/chosen": 1.9009253978729248, + "eval_logits/rejected": 2.4084951877593994, + "eval_logps/chosen": -403.50592041015625, + "eval_logps/rejected": -410.25238037109375, + "eval_loss": 0.06930559128522873, + "eval_rewards/accuracies": 0.9453125, + "eval_rewards/chosen": -0.4164417088031769, + "eval_rewards/margins": 10.047719955444336, + "eval_rewards/rejected": -10.464160919189453, + "eval_runtime": 78.1903, + "eval_samples_per_second": 12.789, + "eval_steps_per_second": 0.409, + "step": 2900 + }, + { + "epoch": 1.87, + "learning_rate": 7.648179004998809e-08, + "logits/chosen": 2.4650321006774902, + "logits/rejected": 2.9449849128723145, + "logps/chosen": -446.25445556640625, + "logps/rejected": -468.34954833984375, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0595293045043945, + "rewards/margins": 12.140164375305176, + "rewards/rejected": -11.080634117126465, + "step": 2910 + }, + { + "epoch": 1.88, + "learning_rate": 7.636277076886454e-08, + "logits/chosen": 2.7862162590026855, + "logits/rejected": 3.3674285411834717, + "logps/chosen": -447.1483459472656, + "logps/rejected": -483.9112854003906, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6543643474578857, + "rewards/margins": 13.146313667297363, + "rewards/rejected": -11.491949081420898, + "step": 2920 + }, + { + "epoch": 1.88, + "learning_rate": 7.624375148774101e-08, + "logits/chosen": 2.4357800483703613, + "logits/rejected": 3.8338236808776855, + "logps/chosen": -415.13055419921875, + "logps/rejected": -419.103515625, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2529149055480957, + "rewards/margins": 11.368078231811523, + "rewards/rejected": -10.11516284942627, + "step": 2930 + }, + { + "epoch": 1.89, + "learning_rate": 7.612473220661747e-08, + "logits/chosen": 3.0562214851379395, + "logits/rejected": 2.8682100772857666, + "logps/chosen": -413.2584533691406, + "logps/rejected": -434.45428466796875, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3094024956226349, + "rewards/margins": 11.557034492492676, + "rewards/rejected": -11.24763298034668, + "step": 2940 + }, + { + "epoch": 1.9, + "learning_rate": 7.600571292549393e-08, + "logits/chosen": 2.3887839317321777, + "logits/rejected": 3.3047709465026855, + "logps/chosen": -442.767333984375, + "logps/rejected": -466.8439025878906, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5330932140350342, + "rewards/margins": 12.568510055541992, + "rewards/rejected": -11.035417556762695, + "step": 2950 + }, + { + "epoch": 1.9, + "learning_rate": 7.588669364437038e-08, + "logits/chosen": 2.524888515472412, + "logits/rejected": 3.4284675121307373, + "logps/chosen": -402.7549743652344, + "logps/rejected": -445.7957458496094, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39540719985961914, + "rewards/margins": 11.571582794189453, + "rewards/rejected": -11.176176071166992, + "step": 2960 + }, + { + "epoch": 1.91, + "learning_rate": 7.576767436324685e-08, + "logits/chosen": 2.311568260192871, + "logits/rejected": 3.2848758697509766, + "logps/chosen": -423.08416748046875, + "logps/rejected": -438.42718505859375, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0096681118011475, + "rewards/margins": 11.884648323059082, + "rewards/rejected": -10.874979972839355, + "step": 2970 + }, + { + "epoch": 1.92, + "learning_rate": 7.564865508212331e-08, + "logits/chosen": 2.790484666824341, + "logits/rejected": 3.298766613006592, + "logps/chosen": -360.9853515625, + "logps/rejected": -404.8863830566406, + "loss": 0.0089, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7994908094406128, + "rewards/margins": 10.370939254760742, + "rewards/rejected": -9.571449279785156, + "step": 2980 + }, + { + "epoch": 1.92, + "learning_rate": 7.552963580099975e-08, + "logits/chosen": 2.6981308460235596, + "logits/rejected": 3.2417426109313965, + "logps/chosen": -399.3218994140625, + "logps/rejected": -431.1190490722656, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.13810133934021, + "rewards/margins": 12.226466178894043, + "rewards/rejected": -11.08836555480957, + "step": 2990 + }, + { + "epoch": 1.93, + "learning_rate": 7.541061651987621e-08, + "logits/chosen": 2.300840377807617, + "logits/rejected": 3.730071544647217, + "logps/chosen": -458.31536865234375, + "logps/rejected": -440.55523681640625, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.555953860282898, + "rewards/margins": 11.838624954223633, + "rewards/rejected": -10.282670974731445, + "step": 3000 + }, + { + "epoch": 1.93, + "eval_logits/chosen": 1.8183174133300781, + "eval_logits/rejected": 2.3427019119262695, + "eval_logps/chosen": -399.7039794921875, + "eval_logps/rejected": -408.1551513671875, + "eval_loss": 0.059715636074543, + "eval_rewards/accuracies": 0.953125, + "eval_rewards/chosen": -0.03624638170003891, + "eval_rewards/margins": 10.218188285827637, + "eval_rewards/rejected": -10.254435539245605, + "eval_runtime": 78.1202, + "eval_samples_per_second": 12.801, + "eval_steps_per_second": 0.41, + "step": 3000 + }, + { + "epoch": 1.93, + "learning_rate": 7.529159723875268e-08, + "logits/chosen": 2.5125820636749268, + "logits/rejected": 2.929962635040283, + "logps/chosen": -435.52734375, + "logps/rejected": -480.02215576171875, + "loss": 0.0104, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0602343082427979, + "rewards/margins": 12.658498764038086, + "rewards/rejected": -11.59826374053955, + "step": 3010 + }, + { + "epoch": 1.94, + "learning_rate": 7.517257795762913e-08, + "logits/chosen": 2.5797853469848633, + "logits/rejected": 3.4592716693878174, + "logps/chosen": -427.9090881347656, + "logps/rejected": -460.9290466308594, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9765796661376953, + "rewards/margins": 12.438034057617188, + "rewards/rejected": -11.461454391479492, + "step": 3020 + }, + { + "epoch": 1.95, + "learning_rate": 7.505355867650559e-08, + "logits/chosen": 2.317451000213623, + "logits/rejected": 3.6136322021484375, + "logps/chosen": -426.43841552734375, + "logps/rejected": -431.55279541015625, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6562097072601318, + "rewards/margins": 11.710824966430664, + "rewards/rejected": -10.05461597442627, + "step": 3030 + }, + { + "epoch": 1.95, + "learning_rate": 7.493453939538205e-08, + "logits/chosen": 2.4118270874023438, + "logits/rejected": 3.3411223888397217, + "logps/chosen": -433.496337890625, + "logps/rejected": -413.53826904296875, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49976015090942383, + "rewards/margins": 10.6139554977417, + "rewards/rejected": -10.114194869995117, + "step": 3040 + }, + { + "epoch": 1.96, + "learning_rate": 7.48155201142585e-08, + "logits/chosen": 2.3743271827697754, + "logits/rejected": 3.5587539672851562, + "logps/chosen": -422.71014404296875, + "logps/rejected": -442.51556396484375, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9052066802978516, + "rewards/margins": 11.696832656860352, + "rewards/rejected": -10.791627883911133, + "step": 3050 + }, + { + "epoch": 1.97, + "learning_rate": 7.469650083313497e-08, + "logits/chosen": 2.606020450592041, + "logits/rejected": 3.443312168121338, + "logps/chosen": -397.82208251953125, + "logps/rejected": -423.3816833496094, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.306690514087677, + "rewards/margins": 11.203126907348633, + "rewards/rejected": -10.896435737609863, + "step": 3060 + }, + { + "epoch": 1.97, + "learning_rate": 7.457748155201143e-08, + "logits/chosen": 2.916489839553833, + "logits/rejected": 3.762838840484619, + "logps/chosen": -384.3777770996094, + "logps/rejected": -457.49041748046875, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04699459671974182, + "rewards/margins": 11.909761428833008, + "rewards/rejected": -11.86276626586914, + "step": 3070 + }, + { + "epoch": 1.98, + "learning_rate": 7.445846227088787e-08, + "logits/chosen": 2.0478157997131348, + "logits/rejected": 3.478813886642456, + "logps/chosen": -432.97686767578125, + "logps/rejected": -439.83221435546875, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5262401103973389, + "rewards/margins": 12.083934783935547, + "rewards/rejected": -10.557694435119629, + "step": 3080 + }, + { + "epoch": 1.99, + "learning_rate": 7.433944298976433e-08, + "logits/chosen": 2.548459053039551, + "logits/rejected": 3.1747069358825684, + "logps/chosen": -429.4046936035156, + "logps/rejected": -495.275146484375, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3433806300163269, + "rewards/margins": 12.296731948852539, + "rewards/rejected": -11.953351974487305, + "step": 3090 + }, + { + "epoch": 1.99, + "learning_rate": 7.42204237086408e-08, + "logits/chosen": 2.3155176639556885, + "logits/rejected": 3.0107157230377197, + "logps/chosen": -458.78216552734375, + "logps/rejected": -431.2632751464844, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5119603872299194, + "rewards/margins": 12.521995544433594, + "rewards/rejected": -11.010034561157227, + "step": 3100 + }, + { + "epoch": 1.99, + "eval_logits/chosen": 1.7790201902389526, + "eval_logits/rejected": 2.328681468963623, + "eval_logps/chosen": -406.3954772949219, + "eval_logps/rejected": -412.999755859375, + "eval_loss": 0.0663955956697464, + "eval_rewards/accuracies": 0.9609375, + "eval_rewards/chosen": -0.70539790391922, + "eval_rewards/margins": 10.033500671386719, + "eval_rewards/rejected": -10.738900184631348, + "eval_runtime": 78.1747, + "eval_samples_per_second": 12.792, + "eval_steps_per_second": 0.409, + "step": 3100 + }, + { + "epoch": 2.0, + "learning_rate": 7.410140442751725e-08, + "logits/chosen": 2.371422290802002, + "logits/rejected": 3.271979808807373, + "logps/chosen": -402.83258056640625, + "logps/rejected": -439.00079345703125, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6034157872200012, + "rewards/margins": 12.108831405639648, + "rewards/rejected": -11.505415916442871, + "step": 3110 + } + ], + "logging_steps": 10, + "max_steps": 9336, + "num_train_epochs": 6, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}